{ "best_metric": 0.056979671120643616, "best_model_checkpoint": "autotrain-st-pair-class/checkpoint-339505", "epoch": 5.0, "eval_steps": 500, "global_step": 339505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003681830900870385, "grad_norm": 0.8678207397460938, "learning_rate": 1.472710671261524e-08, "loss": 0.7265, "step": 25 }, { "epoch": 0.000736366180174077, "grad_norm": 1.7817554473876953, "learning_rate": 2.945421342523048e-08, "loss": 0.7232, "step": 50 }, { "epoch": 0.0011045492702611154, "grad_norm": 1.143256425857544, "learning_rate": 4.418132013784572e-08, "loss": 0.7232, "step": 75 }, { "epoch": 0.001472732360348154, "grad_norm": 1.4593238830566406, "learning_rate": 5.890842685046096e-08, "loss": 0.7206, "step": 100 }, { "epoch": 0.0018409154504351924, "grad_norm": 1.088563084602356, "learning_rate": 7.36355335630762e-08, "loss": 0.7212, "step": 125 }, { "epoch": 0.0022090985405222308, "grad_norm": 1.2012892961502075, "learning_rate": 8.836264027569144e-08, "loss": 0.7245, "step": 150 }, { "epoch": 0.0025772816306092695, "grad_norm": 0.7117802500724792, "learning_rate": 1.0308974698830668e-07, "loss": 0.7222, "step": 175 }, { "epoch": 0.002945464720696308, "grad_norm": 0.8568280935287476, "learning_rate": 1.1781685370092192e-07, "loss": 0.7207, "step": 200 }, { "epoch": 0.003313647810783346, "grad_norm": 0.6288392543792725, "learning_rate": 1.3254396041353717e-07, "loss": 0.7234, "step": 225 }, { "epoch": 0.003681830900870385, "grad_norm": 1.2614713907241821, "learning_rate": 1.472710671261524e-07, "loss": 0.717, "step": 250 }, { "epoch": 0.004050013990957424, "grad_norm": 0.634227454662323, "learning_rate": 1.6199817383876764e-07, "loss": 0.7217, "step": 275 }, { "epoch": 0.0044181970810444615, "grad_norm": 0.5872971415519714, "learning_rate": 1.7672528055138287e-07, "loss": 0.7196, "step": 300 }, { "epoch": 0.0047863801711315, "grad_norm": 0.9929088354110718, "learning_rate": 1.914523872639981e-07, "loss": 0.72, "step": 325 }, { "epoch": 0.005154563261218539, "grad_norm": 1.1321295499801636, "learning_rate": 2.0617949397661337e-07, "loss": 0.7167, "step": 350 }, { "epoch": 0.005522746351305577, "grad_norm": 0.7045577764511108, "learning_rate": 2.209066006892286e-07, "loss": 0.7169, "step": 375 }, { "epoch": 0.005890929441392616, "grad_norm": 0.9594325423240662, "learning_rate": 2.3563370740184384e-07, "loss": 0.7167, "step": 400 }, { "epoch": 0.006259112531479654, "grad_norm": 0.6934572458267212, "learning_rate": 2.503608141144591e-07, "loss": 0.7171, "step": 425 }, { "epoch": 0.006627295621566692, "grad_norm": 0.6293821334838867, "learning_rate": 2.6508792082707434e-07, "loss": 0.7148, "step": 450 }, { "epoch": 0.006995478711653731, "grad_norm": 0.725359320640564, "learning_rate": 2.7981502753968957e-07, "loss": 0.7142, "step": 475 }, { "epoch": 0.00736366180174077, "grad_norm": 0.6588715314865112, "learning_rate": 2.945421342523048e-07, "loss": 0.7168, "step": 500 }, { "epoch": 0.0077318448918278085, "grad_norm": 0.7691717743873596, "learning_rate": 3.0926924096492004e-07, "loss": 0.7133, "step": 525 }, { "epoch": 0.008100027981914847, "grad_norm": 0.7171897292137146, "learning_rate": 3.239963476775353e-07, "loss": 0.7119, "step": 550 }, { "epoch": 0.008468211072001885, "grad_norm": 0.6589468121528625, "learning_rate": 3.3872345439015056e-07, "loss": 0.7134, "step": 575 }, { "epoch": 0.008836394162088923, "grad_norm": 0.5842499136924744, "learning_rate": 3.5345056110276574e-07, "loss": 0.711, "step": 600 }, { "epoch": 0.009204577252175963, "grad_norm": 0.732860267162323, "learning_rate": 3.6817766781538103e-07, "loss": 0.7122, "step": 625 }, { "epoch": 0.009572760342263, "grad_norm": 0.9822880625724792, "learning_rate": 3.829047745279962e-07, "loss": 0.7116, "step": 650 }, { "epoch": 0.009940943432350038, "grad_norm": 0.7484005093574524, "learning_rate": 3.976318812406115e-07, "loss": 0.7116, "step": 675 }, { "epoch": 0.010309126522437078, "grad_norm": 1.1163777112960815, "learning_rate": 4.1235898795322674e-07, "loss": 0.7066, "step": 700 }, { "epoch": 0.010677309612524116, "grad_norm": 1.132763385772705, "learning_rate": 4.27086094665842e-07, "loss": 0.7095, "step": 725 }, { "epoch": 0.011045492702611154, "grad_norm": 1.3809318542480469, "learning_rate": 4.418132013784572e-07, "loss": 0.7047, "step": 750 }, { "epoch": 0.011413675792698193, "grad_norm": 0.5831483602523804, "learning_rate": 4.565403080910725e-07, "loss": 0.7103, "step": 775 }, { "epoch": 0.011781858882785231, "grad_norm": 0.5990871787071228, "learning_rate": 4.712674148036877e-07, "loss": 0.7076, "step": 800 }, { "epoch": 0.01215004197287227, "grad_norm": 0.9539962410926819, "learning_rate": 4.85994521516303e-07, "loss": 0.7065, "step": 825 }, { "epoch": 0.012518225062959309, "grad_norm": 1.093916893005371, "learning_rate": 5.007216282289182e-07, "loss": 0.7073, "step": 850 }, { "epoch": 0.012886408153046347, "grad_norm": 0.7179368734359741, "learning_rate": 5.154487349415335e-07, "loss": 0.7045, "step": 875 }, { "epoch": 0.013254591243133385, "grad_norm": 0.6126222610473633, "learning_rate": 5.301758416541487e-07, "loss": 0.7022, "step": 900 }, { "epoch": 0.013622774333220424, "grad_norm": 0.7177501916885376, "learning_rate": 5.44902948366764e-07, "loss": 0.7036, "step": 925 }, { "epoch": 0.013990957423307462, "grad_norm": 0.8863348960876465, "learning_rate": 5.596300550793791e-07, "loss": 0.6987, "step": 950 }, { "epoch": 0.014359140513394502, "grad_norm": 0.8881144523620605, "learning_rate": 5.743571617919944e-07, "loss": 0.701, "step": 975 }, { "epoch": 0.01472732360348154, "grad_norm": 1.149389386177063, "learning_rate": 5.890842685046096e-07, "loss": 0.7, "step": 1000 }, { "epoch": 0.015095506693568577, "grad_norm": 0.6700385212898254, "learning_rate": 6.038113752172248e-07, "loss": 0.6989, "step": 1025 }, { "epoch": 0.015463689783655617, "grad_norm": 0.7707194685935974, "learning_rate": 6.185384819298401e-07, "loss": 0.6975, "step": 1050 }, { "epoch": 0.015831872873742655, "grad_norm": 0.6434314846992493, "learning_rate": 6.332655886424554e-07, "loss": 0.6999, "step": 1075 }, { "epoch": 0.016200055963829695, "grad_norm": 0.575720489025116, "learning_rate": 6.479926953550705e-07, "loss": 0.6978, "step": 1100 }, { "epoch": 0.01656823905391673, "grad_norm": 1.1561897993087769, "learning_rate": 6.627198020676858e-07, "loss": 0.696, "step": 1125 }, { "epoch": 0.01693642214400377, "grad_norm": 1.0465784072875977, "learning_rate": 6.774469087803011e-07, "loss": 0.6939, "step": 1150 }, { "epoch": 0.01730460523409081, "grad_norm": 0.6008294224739075, "learning_rate": 6.921740154929163e-07, "loss": 0.6914, "step": 1175 }, { "epoch": 0.017672788324177846, "grad_norm": 1.0956335067749023, "learning_rate": 7.069011222055315e-07, "loss": 0.688, "step": 1200 }, { "epoch": 0.018040971414264886, "grad_norm": 0.6495553255081177, "learning_rate": 7.216282289181469e-07, "loss": 0.6899, "step": 1225 }, { "epoch": 0.018409154504351925, "grad_norm": 0.5202560424804688, "learning_rate": 7.363553356307621e-07, "loss": 0.6894, "step": 1250 }, { "epoch": 0.01877733759443896, "grad_norm": 0.7886903882026672, "learning_rate": 7.510824423433772e-07, "loss": 0.6863, "step": 1275 }, { "epoch": 0.019145520684526, "grad_norm": 0.6157426834106445, "learning_rate": 7.658095490559924e-07, "loss": 0.6868, "step": 1300 }, { "epoch": 0.01951370377461304, "grad_norm": 0.8816072344779968, "learning_rate": 7.805366557686078e-07, "loss": 0.6886, "step": 1325 }, { "epoch": 0.019881886864700077, "grad_norm": 0.7643646597862244, "learning_rate": 7.95263762481223e-07, "loss": 0.684, "step": 1350 }, { "epoch": 0.020250069954787116, "grad_norm": 0.7116807103157043, "learning_rate": 8.099908691938382e-07, "loss": 0.6852, "step": 1375 }, { "epoch": 0.020618253044874156, "grad_norm": 0.8170478343963623, "learning_rate": 8.247179759064535e-07, "loss": 0.6824, "step": 1400 }, { "epoch": 0.020986436134961192, "grad_norm": 0.7388837933540344, "learning_rate": 8.394450826190688e-07, "loss": 0.6823, "step": 1425 }, { "epoch": 0.021354619225048232, "grad_norm": 0.5734406113624573, "learning_rate": 8.54172189331684e-07, "loss": 0.6806, "step": 1450 }, { "epoch": 0.02172280231513527, "grad_norm": 0.7802960276603699, "learning_rate": 8.688992960442992e-07, "loss": 0.6791, "step": 1475 }, { "epoch": 0.022090985405222308, "grad_norm": 0.6324337720870972, "learning_rate": 8.836264027569144e-07, "loss": 0.675, "step": 1500 }, { "epoch": 0.022459168495309347, "grad_norm": 0.8283900618553162, "learning_rate": 8.983535094695296e-07, "loss": 0.6787, "step": 1525 }, { "epoch": 0.022827351585396387, "grad_norm": 1.1485595703125, "learning_rate": 9.13080616182145e-07, "loss": 0.677, "step": 1550 }, { "epoch": 0.023195534675483423, "grad_norm": 0.918366551399231, "learning_rate": 9.278077228947602e-07, "loss": 0.6717, "step": 1575 }, { "epoch": 0.023563717765570463, "grad_norm": 0.6480444073677063, "learning_rate": 9.425348296073754e-07, "loss": 0.6697, "step": 1600 }, { "epoch": 0.023931900855657502, "grad_norm": 0.6028603315353394, "learning_rate": 9.572619363199905e-07, "loss": 0.6708, "step": 1625 }, { "epoch": 0.02430008394574454, "grad_norm": 1.0693798065185547, "learning_rate": 9.71989043032606e-07, "loss": 0.6684, "step": 1650 }, { "epoch": 0.024668267035831578, "grad_norm": 0.8398106694221497, "learning_rate": 9.867161497452211e-07, "loss": 0.6721, "step": 1675 }, { "epoch": 0.025036450125918618, "grad_norm": 0.6863375902175903, "learning_rate": 1.0014432564578364e-06, "loss": 0.6653, "step": 1700 }, { "epoch": 0.025404633216005654, "grad_norm": 0.9351726174354553, "learning_rate": 1.0161703631704515e-06, "loss": 0.6677, "step": 1725 }, { "epoch": 0.025772816306092693, "grad_norm": 0.6590425372123718, "learning_rate": 1.030897469883067e-06, "loss": 0.6611, "step": 1750 }, { "epoch": 0.026140999396179733, "grad_norm": 0.5982798933982849, "learning_rate": 1.045624576595682e-06, "loss": 0.6593, "step": 1775 }, { "epoch": 0.02650918248626677, "grad_norm": 1.231074333190918, "learning_rate": 1.0603516833082973e-06, "loss": 0.6609, "step": 1800 }, { "epoch": 0.02687736557635381, "grad_norm": 0.562767744064331, "learning_rate": 1.0750787900209126e-06, "loss": 0.6549, "step": 1825 }, { "epoch": 0.02724554866644085, "grad_norm": 0.5797823667526245, "learning_rate": 1.089805896733528e-06, "loss": 0.6584, "step": 1850 }, { "epoch": 0.027613731756527884, "grad_norm": 0.6395525932312012, "learning_rate": 1.104533003446143e-06, "loss": 0.6496, "step": 1875 }, { "epoch": 0.027981914846614924, "grad_norm": 0.7141817808151245, "learning_rate": 1.1192601101587583e-06, "loss": 0.654, "step": 1900 }, { "epoch": 0.028350097936701964, "grad_norm": 0.5764775276184082, "learning_rate": 1.1339872168713736e-06, "loss": 0.6527, "step": 1925 }, { "epoch": 0.028718281026789003, "grad_norm": 0.5608025193214417, "learning_rate": 1.1487143235839889e-06, "loss": 0.6462, "step": 1950 }, { "epoch": 0.02908646411687604, "grad_norm": 1.2220577001571655, "learning_rate": 1.1634414302966041e-06, "loss": 0.6511, "step": 1975 }, { "epoch": 0.02945464720696308, "grad_norm": 0.5681129097938538, "learning_rate": 1.1781685370092192e-06, "loss": 0.6443, "step": 2000 }, { "epoch": 0.02982283029705012, "grad_norm": 1.0518550872802734, "learning_rate": 1.1928956437218345e-06, "loss": 0.644, "step": 2025 }, { "epoch": 0.030191013387137155, "grad_norm": 0.5548666715621948, "learning_rate": 1.2076227504344496e-06, "loss": 0.6385, "step": 2050 }, { "epoch": 0.030559196477224194, "grad_norm": 0.7215397953987122, "learning_rate": 1.222349857147065e-06, "loss": 0.6379, "step": 2075 }, { "epoch": 0.030927379567311234, "grad_norm": 0.5453612208366394, "learning_rate": 1.2370769638596802e-06, "loss": 0.6319, "step": 2100 }, { "epoch": 0.031295562657398274, "grad_norm": 0.8071221113204956, "learning_rate": 1.2518040705722957e-06, "loss": 0.631, "step": 2125 }, { "epoch": 0.03166374574748531, "grad_norm": 0.5668207406997681, "learning_rate": 1.2665311772849107e-06, "loss": 0.6291, "step": 2150 }, { "epoch": 0.032031928837572346, "grad_norm": 1.0014539957046509, "learning_rate": 1.281258283997526e-06, "loss": 0.6269, "step": 2175 }, { "epoch": 0.03240011192765939, "grad_norm": 0.7369861006736755, "learning_rate": 1.295985390710141e-06, "loss": 0.6242, "step": 2200 }, { "epoch": 0.032768295017746425, "grad_norm": 0.6795928478240967, "learning_rate": 1.3107124974227564e-06, "loss": 0.6245, "step": 2225 }, { "epoch": 0.03313647810783346, "grad_norm": 0.648115873336792, "learning_rate": 1.3254396041353717e-06, "loss": 0.6223, "step": 2250 }, { "epoch": 0.033504661197920504, "grad_norm": 0.5944338440895081, "learning_rate": 1.3401667108479868e-06, "loss": 0.6179, "step": 2275 }, { "epoch": 0.03387284428800754, "grad_norm": 0.6033785343170166, "learning_rate": 1.3548938175606023e-06, "loss": 0.6145, "step": 2300 }, { "epoch": 0.03424102737809458, "grad_norm": 0.8906118273735046, "learning_rate": 1.3696209242732175e-06, "loss": 0.6069, "step": 2325 }, { "epoch": 0.03460921046818162, "grad_norm": 0.6462284326553345, "learning_rate": 1.3843480309858326e-06, "loss": 0.6062, "step": 2350 }, { "epoch": 0.034977393558268656, "grad_norm": 0.5796801447868347, "learning_rate": 1.399075137698448e-06, "loss": 0.6062, "step": 2375 }, { "epoch": 0.03534557664835569, "grad_norm": 1.3210780620574951, "learning_rate": 1.413802244411063e-06, "loss": 0.6066, "step": 2400 }, { "epoch": 0.035713759738442735, "grad_norm": 0.9987689256668091, "learning_rate": 1.4285293511236783e-06, "loss": 0.6045, "step": 2425 }, { "epoch": 0.03608194282852977, "grad_norm": 0.5557406544685364, "learning_rate": 1.4432564578362938e-06, "loss": 0.6007, "step": 2450 }, { "epoch": 0.03645012591861681, "grad_norm": 0.7247636914253235, "learning_rate": 1.4579835645489088e-06, "loss": 0.5916, "step": 2475 }, { "epoch": 0.03681830900870385, "grad_norm": 0.8754281997680664, "learning_rate": 1.4727106712615241e-06, "loss": 0.594, "step": 2500 }, { "epoch": 0.03718649209879089, "grad_norm": 0.6612271070480347, "learning_rate": 1.4874377779741392e-06, "loss": 0.5903, "step": 2525 }, { "epoch": 0.03755467518887792, "grad_norm": 0.6568219661712646, "learning_rate": 1.5021648846867545e-06, "loss": 0.584, "step": 2550 }, { "epoch": 0.037922858278964966, "grad_norm": 0.7208539843559265, "learning_rate": 1.5168919913993698e-06, "loss": 0.5881, "step": 2575 }, { "epoch": 0.038291041369052, "grad_norm": 0.6969273090362549, "learning_rate": 1.5316190981119849e-06, "loss": 0.5826, "step": 2600 }, { "epoch": 0.03865922445913904, "grad_norm": 0.7232186198234558, "learning_rate": 1.5463462048246004e-06, "loss": 0.5789, "step": 2625 }, { "epoch": 0.03902740754922608, "grad_norm": 0.5670535564422607, "learning_rate": 1.5610733115372156e-06, "loss": 0.5786, "step": 2650 }, { "epoch": 0.03939559063931312, "grad_norm": 0.5489992499351501, "learning_rate": 1.5758004182498307e-06, "loss": 0.5755, "step": 2675 }, { "epoch": 0.039763773729400154, "grad_norm": 0.6829290390014648, "learning_rate": 1.590527524962446e-06, "loss": 0.5749, "step": 2700 }, { "epoch": 0.0401319568194872, "grad_norm": 0.5697892308235168, "learning_rate": 1.605254631675061e-06, "loss": 0.5666, "step": 2725 }, { "epoch": 0.04050013990957423, "grad_norm": 0.5284579992294312, "learning_rate": 1.6199817383876764e-06, "loss": 0.558, "step": 2750 }, { "epoch": 0.04086832299966127, "grad_norm": 0.6674126386642456, "learning_rate": 1.6347088451002919e-06, "loss": 0.5617, "step": 2775 }, { "epoch": 0.04123650608974831, "grad_norm": 0.6509599089622498, "learning_rate": 1.649435951812907e-06, "loss": 0.5595, "step": 2800 }, { "epoch": 0.04160468917983535, "grad_norm": 0.5450132489204407, "learning_rate": 1.6641630585255222e-06, "loss": 0.5586, "step": 2825 }, { "epoch": 0.041972872269922384, "grad_norm": 0.5366148352622986, "learning_rate": 1.6788901652381375e-06, "loss": 0.5531, "step": 2850 }, { "epoch": 0.04234105536000943, "grad_norm": 0.5460155010223389, "learning_rate": 1.6936172719507526e-06, "loss": 0.5549, "step": 2875 }, { "epoch": 0.042709238450096464, "grad_norm": 0.5339788794517517, "learning_rate": 1.708344378663368e-06, "loss": 0.5486, "step": 2900 }, { "epoch": 0.0430774215401835, "grad_norm": 0.5116156339645386, "learning_rate": 1.723071485375983e-06, "loss": 0.5528, "step": 2925 }, { "epoch": 0.04344560463027054, "grad_norm": 1.195949673652649, "learning_rate": 1.7377985920885985e-06, "loss": 0.5507, "step": 2950 }, { "epoch": 0.04381378772035758, "grad_norm": 0.5309670567512512, "learning_rate": 1.7525256988012138e-06, "loss": 0.5407, "step": 2975 }, { "epoch": 0.044181970810444615, "grad_norm": 0.7596344351768494, "learning_rate": 1.7672528055138288e-06, "loss": 0.5418, "step": 3000 }, { "epoch": 0.04455015390053166, "grad_norm": 0.47329920530319214, "learning_rate": 1.7819799122264441e-06, "loss": 0.5357, "step": 3025 }, { "epoch": 0.044918336990618694, "grad_norm": 0.5916969776153564, "learning_rate": 1.7967070189390592e-06, "loss": 0.5359, "step": 3050 }, { "epoch": 0.04528652008070573, "grad_norm": 0.6137320399284363, "learning_rate": 1.8114341256516747e-06, "loss": 0.529, "step": 3075 }, { "epoch": 0.045654703170792774, "grad_norm": 0.6124922633171082, "learning_rate": 1.82616123236429e-06, "loss": 0.5292, "step": 3100 }, { "epoch": 0.04602288626087981, "grad_norm": 0.4892748296260834, "learning_rate": 1.840888339076905e-06, "loss": 0.5356, "step": 3125 }, { "epoch": 0.046391069350966846, "grad_norm": 0.6712266206741333, "learning_rate": 1.8556154457895203e-06, "loss": 0.531, "step": 3150 }, { "epoch": 0.04675925244105389, "grad_norm": 0.5134905576705933, "learning_rate": 1.8703425525021356e-06, "loss": 0.533, "step": 3175 }, { "epoch": 0.047127435531140925, "grad_norm": 0.5121038556098938, "learning_rate": 1.8850696592147507e-06, "loss": 0.5311, "step": 3200 }, { "epoch": 0.04749561862122796, "grad_norm": 1.0264848470687866, "learning_rate": 1.8997967659273662e-06, "loss": 0.5273, "step": 3225 }, { "epoch": 0.047863801711315004, "grad_norm": 0.8714808225631714, "learning_rate": 1.914523872639981e-06, "loss": 0.5226, "step": 3250 }, { "epoch": 0.04823198480140204, "grad_norm": 0.7779112458229065, "learning_rate": 1.9292509793525966e-06, "loss": 0.5259, "step": 3275 }, { "epoch": 0.04860016789148908, "grad_norm": 0.5716907978057861, "learning_rate": 1.943978086065212e-06, "loss": 0.5243, "step": 3300 }, { "epoch": 0.04896835098157612, "grad_norm": 0.478712797164917, "learning_rate": 1.958705192777827e-06, "loss": 0.5235, "step": 3325 }, { "epoch": 0.049336534071663156, "grad_norm": 0.43825459480285645, "learning_rate": 1.9734322994904422e-06, "loss": 0.5224, "step": 3350 }, { "epoch": 0.04970471716175019, "grad_norm": 0.5753782987594604, "learning_rate": 1.9881594062030577e-06, "loss": 0.5248, "step": 3375 }, { "epoch": 0.050072900251837235, "grad_norm": 0.6924416422843933, "learning_rate": 2.002886512915673e-06, "loss": 0.5166, "step": 3400 }, { "epoch": 0.05044108334192427, "grad_norm": 0.48913905024528503, "learning_rate": 2.017613619628288e-06, "loss": 0.515, "step": 3425 }, { "epoch": 0.05080926643201131, "grad_norm": 0.8264048099517822, "learning_rate": 2.032340726340903e-06, "loss": 0.5145, "step": 3450 }, { "epoch": 0.05117744952209835, "grad_norm": 0.6421396732330322, "learning_rate": 2.0470678330535185e-06, "loss": 0.5093, "step": 3475 }, { "epoch": 0.05154563261218539, "grad_norm": 0.5583409667015076, "learning_rate": 2.061794939766134e-06, "loss": 0.5123, "step": 3500 }, { "epoch": 0.05191381570227242, "grad_norm": 0.5377383232116699, "learning_rate": 2.076522046478749e-06, "loss": 0.5169, "step": 3525 }, { "epoch": 0.052281998792359466, "grad_norm": 0.5501702427864075, "learning_rate": 2.091249153191364e-06, "loss": 0.509, "step": 3550 }, { "epoch": 0.0526501818824465, "grad_norm": 0.46207118034362793, "learning_rate": 2.105976259903979e-06, "loss": 0.5133, "step": 3575 }, { "epoch": 0.05301836497253354, "grad_norm": 0.5295949578285217, "learning_rate": 2.1207033666165947e-06, "loss": 0.5038, "step": 3600 }, { "epoch": 0.05338654806262058, "grad_norm": 0.526362419128418, "learning_rate": 2.13543047332921e-06, "loss": 0.5199, "step": 3625 }, { "epoch": 0.05375473115270762, "grad_norm": 0.5102420449256897, "learning_rate": 2.1501575800418253e-06, "loss": 0.5078, "step": 3650 }, { "epoch": 0.054122914242794654, "grad_norm": 0.5729849934577942, "learning_rate": 2.1648846867544403e-06, "loss": 0.5052, "step": 3675 }, { "epoch": 0.0544910973328817, "grad_norm": 0.6406521797180176, "learning_rate": 2.179611793467056e-06, "loss": 0.5071, "step": 3700 }, { "epoch": 0.05485928042296873, "grad_norm": 0.6915680766105652, "learning_rate": 2.194338900179671e-06, "loss": 0.5058, "step": 3725 }, { "epoch": 0.05522746351305577, "grad_norm": 0.8066368103027344, "learning_rate": 2.209066006892286e-06, "loss": 0.5045, "step": 3750 }, { "epoch": 0.05559564660314281, "grad_norm": 0.739680290222168, "learning_rate": 2.223793113604901e-06, "loss": 0.4987, "step": 3775 }, { "epoch": 0.05596382969322985, "grad_norm": 0.6398206949234009, "learning_rate": 2.2385202203175166e-06, "loss": 0.504, "step": 3800 }, { "epoch": 0.05633201278331689, "grad_norm": 0.640600860118866, "learning_rate": 2.253247327030132e-06, "loss": 0.505, "step": 3825 }, { "epoch": 0.05670019587340393, "grad_norm": 0.49086707830429077, "learning_rate": 2.267974433742747e-06, "loss": 0.5073, "step": 3850 }, { "epoch": 0.057068378963490964, "grad_norm": 0.6551923155784607, "learning_rate": 2.2827015404553622e-06, "loss": 0.5027, "step": 3875 }, { "epoch": 0.05743656205357801, "grad_norm": 0.5222965478897095, "learning_rate": 2.2974286471679777e-06, "loss": 0.4987, "step": 3900 }, { "epoch": 0.05780474514366504, "grad_norm": 0.4598608911037445, "learning_rate": 2.312155753880593e-06, "loss": 0.5047, "step": 3925 }, { "epoch": 0.05817292823375208, "grad_norm": 0.46309804916381836, "learning_rate": 2.3268828605932083e-06, "loss": 0.5031, "step": 3950 }, { "epoch": 0.05854111132383912, "grad_norm": 0.49093905091285706, "learning_rate": 2.3416099673058234e-06, "loss": 0.496, "step": 3975 }, { "epoch": 0.05890929441392616, "grad_norm": 0.49821043014526367, "learning_rate": 2.3563370740184384e-06, "loss": 0.5071, "step": 4000 }, { "epoch": 0.059277477504013194, "grad_norm": 0.5469359755516052, "learning_rate": 2.371064180731054e-06, "loss": 0.502, "step": 4025 }, { "epoch": 0.05964566059410024, "grad_norm": 0.493945449590683, "learning_rate": 2.385791287443669e-06, "loss": 0.4993, "step": 4050 }, { "epoch": 0.060013843684187274, "grad_norm": 0.6475746631622314, "learning_rate": 2.400518394156284e-06, "loss": 0.4977, "step": 4075 }, { "epoch": 0.06038202677427431, "grad_norm": 0.9257048964500427, "learning_rate": 2.415245500868899e-06, "loss": 0.5011, "step": 4100 }, { "epoch": 0.06075020986436135, "grad_norm": 0.687039852142334, "learning_rate": 2.4299726075815147e-06, "loss": 0.5029, "step": 4125 }, { "epoch": 0.06111839295444839, "grad_norm": 0.48104432225227356, "learning_rate": 2.44469971429413e-06, "loss": 0.505, "step": 4150 }, { "epoch": 0.061486576044535425, "grad_norm": 0.5689713954925537, "learning_rate": 2.4594268210067452e-06, "loss": 0.498, "step": 4175 }, { "epoch": 0.06185475913462247, "grad_norm": 0.5599949955940247, "learning_rate": 2.4741539277193603e-06, "loss": 0.4928, "step": 4200 }, { "epoch": 0.062222942224709504, "grad_norm": 0.5400413870811462, "learning_rate": 2.488881034431976e-06, "loss": 0.4963, "step": 4225 }, { "epoch": 0.06259112531479655, "grad_norm": 0.5699523687362671, "learning_rate": 2.5036081411445913e-06, "loss": 0.4955, "step": 4250 }, { "epoch": 0.06295930840488358, "grad_norm": 0.7645552158355713, "learning_rate": 2.518335247857206e-06, "loss": 0.5051, "step": 4275 }, { "epoch": 0.06332749149497062, "grad_norm": 1.205398440361023, "learning_rate": 2.5330623545698215e-06, "loss": 0.4913, "step": 4300 }, { "epoch": 0.06369567458505766, "grad_norm": 0.5377015471458435, "learning_rate": 2.5477894612824365e-06, "loss": 0.5012, "step": 4325 }, { "epoch": 0.06406385767514469, "grad_norm": 0.541007399559021, "learning_rate": 2.562516567995052e-06, "loss": 0.4927, "step": 4350 }, { "epoch": 0.06443204076523174, "grad_norm": 0.7172959446907043, "learning_rate": 2.5772436747076675e-06, "loss": 0.4879, "step": 4375 }, { "epoch": 0.06480022385531878, "grad_norm": 0.5034739971160889, "learning_rate": 2.591970781420282e-06, "loss": 0.4913, "step": 4400 }, { "epoch": 0.06516840694540581, "grad_norm": 0.4795535206794739, "learning_rate": 2.6066978881328973e-06, "loss": 0.4928, "step": 4425 }, { "epoch": 0.06553659003549285, "grad_norm": 0.4326831102371216, "learning_rate": 2.6214249948455128e-06, "loss": 0.484, "step": 4450 }, { "epoch": 0.0659047731255799, "grad_norm": 0.472896546125412, "learning_rate": 2.6361521015581283e-06, "loss": 0.4953, "step": 4475 }, { "epoch": 0.06627295621566692, "grad_norm": 0.5761368870735168, "learning_rate": 2.6508792082707434e-06, "loss": 0.4945, "step": 4500 }, { "epoch": 0.06664113930575397, "grad_norm": 0.6462481021881104, "learning_rate": 2.6656063149833584e-06, "loss": 0.4884, "step": 4525 }, { "epoch": 0.06700932239584101, "grad_norm": 0.6638084650039673, "learning_rate": 2.6803334216959735e-06, "loss": 0.4921, "step": 4550 }, { "epoch": 0.06737750548592804, "grad_norm": 0.49843668937683105, "learning_rate": 2.695060528408589e-06, "loss": 0.4914, "step": 4575 }, { "epoch": 0.06774568857601508, "grad_norm": 0.5445410013198853, "learning_rate": 2.7097876351212045e-06, "loss": 0.485, "step": 4600 }, { "epoch": 0.06811387166610212, "grad_norm": 0.8247371912002563, "learning_rate": 2.7245147418338196e-06, "loss": 0.4984, "step": 4625 }, { "epoch": 0.06848205475618915, "grad_norm": 0.4787178039550781, "learning_rate": 2.739241848546435e-06, "loss": 0.4832, "step": 4650 }, { "epoch": 0.0688502378462762, "grad_norm": 0.6253044605255127, "learning_rate": 2.7539689552590497e-06, "loss": 0.4852, "step": 4675 }, { "epoch": 0.06921842093636324, "grad_norm": 0.7872785329818726, "learning_rate": 2.7686960619716652e-06, "loss": 0.5008, "step": 4700 }, { "epoch": 0.06958660402645027, "grad_norm": 0.5098380446434021, "learning_rate": 2.7834231686842807e-06, "loss": 0.4815, "step": 4725 }, { "epoch": 0.06995478711653731, "grad_norm": 0.5085175633430481, "learning_rate": 2.798150275396896e-06, "loss": 0.4871, "step": 4750 }, { "epoch": 0.07032297020662436, "grad_norm": 0.48235610127449036, "learning_rate": 2.8128773821095113e-06, "loss": 0.4981, "step": 4775 }, { "epoch": 0.07069115329671138, "grad_norm": 0.5846980214118958, "learning_rate": 2.827604488822126e-06, "loss": 0.4837, "step": 4800 }, { "epoch": 0.07105933638679843, "grad_norm": 0.6444407105445862, "learning_rate": 2.8423315955347415e-06, "loss": 0.4879, "step": 4825 }, { "epoch": 0.07142751947688547, "grad_norm": 0.47602659463882446, "learning_rate": 2.8570587022473565e-06, "loss": 0.4861, "step": 4850 }, { "epoch": 0.0717957025669725, "grad_norm": 0.5946886539459229, "learning_rate": 2.871785808959972e-06, "loss": 0.4788, "step": 4875 }, { "epoch": 0.07216388565705954, "grad_norm": 0.9483232498168945, "learning_rate": 2.8865129156725875e-06, "loss": 0.4787, "step": 4900 }, { "epoch": 0.07253206874714659, "grad_norm": 0.7961587309837341, "learning_rate": 2.901240022385202e-06, "loss": 0.4865, "step": 4925 }, { "epoch": 0.07290025183723362, "grad_norm": 0.7431612610816956, "learning_rate": 2.9159671290978177e-06, "loss": 0.488, "step": 4950 }, { "epoch": 0.07326843492732066, "grad_norm": 0.6214255094528198, "learning_rate": 2.9306942358104328e-06, "loss": 0.4821, "step": 4975 }, { "epoch": 0.0736366180174077, "grad_norm": 0.6615573763847351, "learning_rate": 2.9454213425230483e-06, "loss": 0.4805, "step": 5000 }, { "epoch": 0.07400480110749473, "grad_norm": 0.6024646759033203, "learning_rate": 2.9601484492356638e-06, "loss": 0.4809, "step": 5025 }, { "epoch": 0.07437298419758177, "grad_norm": 0.5407711267471313, "learning_rate": 2.9748755559482784e-06, "loss": 0.4758, "step": 5050 }, { "epoch": 0.07474116728766882, "grad_norm": 0.6843119263648987, "learning_rate": 2.989602662660894e-06, "loss": 0.4798, "step": 5075 }, { "epoch": 0.07510935037775585, "grad_norm": 0.5549395084381104, "learning_rate": 3.004329769373509e-06, "loss": 0.4789, "step": 5100 }, { "epoch": 0.07547753346784289, "grad_norm": 0.818297803401947, "learning_rate": 3.0190568760861245e-06, "loss": 0.4801, "step": 5125 }, { "epoch": 0.07584571655792993, "grad_norm": 0.8120235800743103, "learning_rate": 3.0337839827987396e-06, "loss": 0.4761, "step": 5150 }, { "epoch": 0.07621389964801696, "grad_norm": 0.7820422053337097, "learning_rate": 3.048511089511355e-06, "loss": 0.4821, "step": 5175 }, { "epoch": 0.076582082738104, "grad_norm": 0.7186234593391418, "learning_rate": 3.0632381962239697e-06, "loss": 0.4833, "step": 5200 }, { "epoch": 0.07695026582819105, "grad_norm": 0.6120290160179138, "learning_rate": 3.0779653029365852e-06, "loss": 0.481, "step": 5225 }, { "epoch": 0.07731844891827808, "grad_norm": 0.6122280955314636, "learning_rate": 3.0926924096492007e-06, "loss": 0.4862, "step": 5250 }, { "epoch": 0.07768663200836512, "grad_norm": 0.5714207887649536, "learning_rate": 3.107419516361816e-06, "loss": 0.481, "step": 5275 }, { "epoch": 0.07805481509845216, "grad_norm": 0.609611451625824, "learning_rate": 3.1221466230744313e-06, "loss": 0.4891, "step": 5300 }, { "epoch": 0.07842299818853919, "grad_norm": 0.5937544107437134, "learning_rate": 3.136873729787046e-06, "loss": 0.4778, "step": 5325 }, { "epoch": 0.07879118127862623, "grad_norm": 0.698113739490509, "learning_rate": 3.1516008364996614e-06, "loss": 0.4771, "step": 5350 }, { "epoch": 0.07915936436871328, "grad_norm": 0.6822773814201355, "learning_rate": 3.166327943212277e-06, "loss": 0.4756, "step": 5375 }, { "epoch": 0.07952754745880031, "grad_norm": 0.610352635383606, "learning_rate": 3.181055049924892e-06, "loss": 0.4759, "step": 5400 }, { "epoch": 0.07989573054888735, "grad_norm": 0.6700140237808228, "learning_rate": 3.1957821566375075e-06, "loss": 0.4813, "step": 5425 }, { "epoch": 0.0802639136389744, "grad_norm": 0.909814178943634, "learning_rate": 3.210509263350122e-06, "loss": 0.4826, "step": 5450 }, { "epoch": 0.08063209672906142, "grad_norm": 0.4589587450027466, "learning_rate": 3.2252363700627377e-06, "loss": 0.4706, "step": 5475 }, { "epoch": 0.08100027981914847, "grad_norm": 0.5017442107200623, "learning_rate": 3.2399634767753527e-06, "loss": 0.4756, "step": 5500 }, { "epoch": 0.08136846290923551, "grad_norm": 0.5886111259460449, "learning_rate": 3.2546905834879682e-06, "loss": 0.4689, "step": 5525 }, { "epoch": 0.08173664599932254, "grad_norm": 0.5248635411262512, "learning_rate": 3.2694176902005837e-06, "loss": 0.4736, "step": 5550 }, { "epoch": 0.08210482908940958, "grad_norm": 0.5123960375785828, "learning_rate": 3.2841447969131984e-06, "loss": 0.4703, "step": 5575 }, { "epoch": 0.08247301217949662, "grad_norm": 0.5010682344436646, "learning_rate": 3.298871903625814e-06, "loss": 0.4782, "step": 5600 }, { "epoch": 0.08284119526958365, "grad_norm": 0.6653745770454407, "learning_rate": 3.313599010338429e-06, "loss": 0.4701, "step": 5625 }, { "epoch": 0.0832093783596707, "grad_norm": 0.5438437461853027, "learning_rate": 3.3283261170510445e-06, "loss": 0.4714, "step": 5650 }, { "epoch": 0.08357756144975774, "grad_norm": 0.5668756365776062, "learning_rate": 3.34305322376366e-06, "loss": 0.4681, "step": 5675 }, { "epoch": 0.08394574453984477, "grad_norm": 0.7274172902107239, "learning_rate": 3.357780330476275e-06, "loss": 0.4638, "step": 5700 }, { "epoch": 0.08431392762993181, "grad_norm": 0.6077244877815247, "learning_rate": 3.37250743718889e-06, "loss": 0.4781, "step": 5725 }, { "epoch": 0.08468211072001885, "grad_norm": 0.4999208152294159, "learning_rate": 3.387234543901505e-06, "loss": 0.4618, "step": 5750 }, { "epoch": 0.08505029381010588, "grad_norm": 0.5622535347938538, "learning_rate": 3.4019616506141207e-06, "loss": 0.4706, "step": 5775 }, { "epoch": 0.08541847690019293, "grad_norm": 0.6131230592727661, "learning_rate": 3.416688757326736e-06, "loss": 0.4711, "step": 5800 }, { "epoch": 0.08578665999027997, "grad_norm": 0.811798095703125, "learning_rate": 3.4314158640393513e-06, "loss": 0.4626, "step": 5825 }, { "epoch": 0.086154843080367, "grad_norm": 0.5276443362236023, "learning_rate": 3.446142970751966e-06, "loss": 0.4589, "step": 5850 }, { "epoch": 0.08652302617045404, "grad_norm": 0.590082049369812, "learning_rate": 3.4608700774645814e-06, "loss": 0.4653, "step": 5875 }, { "epoch": 0.08689120926054109, "grad_norm": 0.5512232780456543, "learning_rate": 3.475597184177197e-06, "loss": 0.462, "step": 5900 }, { "epoch": 0.08725939235062811, "grad_norm": 0.4878486096858978, "learning_rate": 3.490324290889812e-06, "loss": 0.4628, "step": 5925 }, { "epoch": 0.08762757544071516, "grad_norm": 0.7966400384902954, "learning_rate": 3.5050513976024275e-06, "loss": 0.4622, "step": 5950 }, { "epoch": 0.0879957585308022, "grad_norm": 0.5125811696052551, "learning_rate": 3.519778504315042e-06, "loss": 0.4636, "step": 5975 }, { "epoch": 0.08836394162088923, "grad_norm": 0.883798360824585, "learning_rate": 3.5345056110276577e-06, "loss": 0.4696, "step": 6000 }, { "epoch": 0.08873212471097627, "grad_norm": 0.5936166048049927, "learning_rate": 3.549232717740273e-06, "loss": 0.4726, "step": 6025 }, { "epoch": 0.08910030780106332, "grad_norm": 0.9113532900810242, "learning_rate": 3.5639598244528882e-06, "loss": 0.4624, "step": 6050 }, { "epoch": 0.08946849089115035, "grad_norm": 0.8539900779724121, "learning_rate": 3.5786869311655037e-06, "loss": 0.4648, "step": 6075 }, { "epoch": 0.08983667398123739, "grad_norm": 0.7228135466575623, "learning_rate": 3.5934140378781184e-06, "loss": 0.472, "step": 6100 }, { "epoch": 0.09020485707132443, "grad_norm": 0.6169723868370056, "learning_rate": 3.608141144590734e-06, "loss": 0.4591, "step": 6125 }, { "epoch": 0.09057304016141146, "grad_norm": 0.7366740703582764, "learning_rate": 3.6228682513033494e-06, "loss": 0.4636, "step": 6150 }, { "epoch": 0.0909412232514985, "grad_norm": 1.0246922969818115, "learning_rate": 3.6375953580159645e-06, "loss": 0.4601, "step": 6175 }, { "epoch": 0.09130940634158555, "grad_norm": 0.603578507900238, "learning_rate": 3.65232246472858e-06, "loss": 0.4681, "step": 6200 }, { "epoch": 0.09167758943167258, "grad_norm": 0.6169431805610657, "learning_rate": 3.667049571441195e-06, "loss": 0.4709, "step": 6225 }, { "epoch": 0.09204577252175962, "grad_norm": 0.6031911969184875, "learning_rate": 3.68177667815381e-06, "loss": 0.4548, "step": 6250 }, { "epoch": 0.09241395561184666, "grad_norm": 0.7893526554107666, "learning_rate": 3.696503784866425e-06, "loss": 0.4542, "step": 6275 }, { "epoch": 0.09278213870193369, "grad_norm": 0.738535463809967, "learning_rate": 3.7112308915790407e-06, "loss": 0.4577, "step": 6300 }, { "epoch": 0.09315032179202073, "grad_norm": 0.8681890964508057, "learning_rate": 3.725957998291656e-06, "loss": 0.4612, "step": 6325 }, { "epoch": 0.09351850488210778, "grad_norm": 0.6814743876457214, "learning_rate": 3.7406851050042713e-06, "loss": 0.4621, "step": 6350 }, { "epoch": 0.09388668797219481, "grad_norm": 0.5325171947479248, "learning_rate": 3.7554122117168863e-06, "loss": 0.4646, "step": 6375 }, { "epoch": 0.09425487106228185, "grad_norm": 0.6647558808326721, "learning_rate": 3.7701393184295014e-06, "loss": 0.4549, "step": 6400 }, { "epoch": 0.0946230541523689, "grad_norm": 0.5672147870063782, "learning_rate": 3.784866425142117e-06, "loss": 0.4528, "step": 6425 }, { "epoch": 0.09499123724245592, "grad_norm": 0.7314438223838806, "learning_rate": 3.7995935318547324e-06, "loss": 0.46, "step": 6450 }, { "epoch": 0.09535942033254297, "grad_norm": 1.0649642944335938, "learning_rate": 3.8143206385673475e-06, "loss": 0.447, "step": 6475 }, { "epoch": 0.09572760342263001, "grad_norm": 0.47005271911621094, "learning_rate": 3.829047745279962e-06, "loss": 0.4602, "step": 6500 }, { "epoch": 0.09609578651271704, "grad_norm": 0.678763210773468, "learning_rate": 3.843774851992578e-06, "loss": 0.4521, "step": 6525 }, { "epoch": 0.09646396960280408, "grad_norm": 0.4841667413711548, "learning_rate": 3.858501958705193e-06, "loss": 0.4439, "step": 6550 }, { "epoch": 0.09683215269289112, "grad_norm": 0.5574337244033813, "learning_rate": 3.873229065417809e-06, "loss": 0.4469, "step": 6575 }, { "epoch": 0.09720033578297815, "grad_norm": 0.5939049124717712, "learning_rate": 3.887956172130424e-06, "loss": 0.4527, "step": 6600 }, { "epoch": 0.0975685188730652, "grad_norm": 0.4860667288303375, "learning_rate": 3.902683278843039e-06, "loss": 0.4379, "step": 6625 }, { "epoch": 0.09793670196315224, "grad_norm": 0.6220310926437378, "learning_rate": 3.917410385555654e-06, "loss": 0.4429, "step": 6650 }, { "epoch": 0.09830488505323927, "grad_norm": 0.6435657143592834, "learning_rate": 3.932137492268269e-06, "loss": 0.4479, "step": 6675 }, { "epoch": 0.09867306814332631, "grad_norm": 0.5766171813011169, "learning_rate": 3.9468645989808845e-06, "loss": 0.4528, "step": 6700 }, { "epoch": 0.09904125123341335, "grad_norm": 0.7544565796852112, "learning_rate": 3.9615917056935e-06, "loss": 0.4544, "step": 6725 }, { "epoch": 0.09940943432350038, "grad_norm": 0.514340877532959, "learning_rate": 3.9763188124061155e-06, "loss": 0.4394, "step": 6750 }, { "epoch": 0.09977761741358743, "grad_norm": 0.5275327563285828, "learning_rate": 3.99104591911873e-06, "loss": 0.45, "step": 6775 }, { "epoch": 0.10014580050367447, "grad_norm": 0.5144765377044678, "learning_rate": 4.005773025831346e-06, "loss": 0.4442, "step": 6800 }, { "epoch": 0.1005139835937615, "grad_norm": 0.6812426447868347, "learning_rate": 4.020500132543961e-06, "loss": 0.4471, "step": 6825 }, { "epoch": 0.10088216668384854, "grad_norm": 0.5578859448432922, "learning_rate": 4.035227239256576e-06, "loss": 0.4438, "step": 6850 }, { "epoch": 0.10125034977393559, "grad_norm": 0.5542449951171875, "learning_rate": 4.049954345969191e-06, "loss": 0.4504, "step": 6875 }, { "epoch": 0.10161853286402261, "grad_norm": 0.47460320591926575, "learning_rate": 4.064681452681806e-06, "loss": 0.449, "step": 6900 }, { "epoch": 0.10198671595410966, "grad_norm": 0.5776277184486389, "learning_rate": 4.079408559394421e-06, "loss": 0.4435, "step": 6925 }, { "epoch": 0.1023548990441967, "grad_norm": 0.6199057102203369, "learning_rate": 4.094135666107037e-06, "loss": 0.4394, "step": 6950 }, { "epoch": 0.10272308213428373, "grad_norm": 0.5915837287902832, "learning_rate": 4.108862772819652e-06, "loss": 0.4424, "step": 6975 }, { "epoch": 0.10309126522437077, "grad_norm": 0.6803795695304871, "learning_rate": 4.123589879532268e-06, "loss": 0.4354, "step": 7000 }, { "epoch": 0.10345944831445782, "grad_norm": 0.6849611401557922, "learning_rate": 4.1383169862448826e-06, "loss": 0.4321, "step": 7025 }, { "epoch": 0.10382763140454485, "grad_norm": 0.7074750661849976, "learning_rate": 4.153044092957498e-06, "loss": 0.441, "step": 7050 }, { "epoch": 0.10419581449463189, "grad_norm": 0.5996087789535522, "learning_rate": 4.167771199670113e-06, "loss": 0.4315, "step": 7075 }, { "epoch": 0.10456399758471893, "grad_norm": 0.7005184888839722, "learning_rate": 4.182498306382728e-06, "loss": 0.437, "step": 7100 }, { "epoch": 0.10493218067480596, "grad_norm": 0.6087741255760193, "learning_rate": 4.197225413095344e-06, "loss": 0.4332, "step": 7125 }, { "epoch": 0.105300363764893, "grad_norm": 0.6142122149467468, "learning_rate": 4.211952519807958e-06, "loss": 0.4306, "step": 7150 }, { "epoch": 0.10566854685498005, "grad_norm": 0.5826869010925293, "learning_rate": 4.226679626520574e-06, "loss": 0.437, "step": 7175 }, { "epoch": 0.10603672994506708, "grad_norm": 0.6333126425743103, "learning_rate": 4.241406733233189e-06, "loss": 0.4268, "step": 7200 }, { "epoch": 0.10640491303515412, "grad_norm": 0.6748629808425903, "learning_rate": 4.256133839945805e-06, "loss": 0.4382, "step": 7225 }, { "epoch": 0.10677309612524116, "grad_norm": 0.5124508738517761, "learning_rate": 4.27086094665842e-06, "loss": 0.4228, "step": 7250 }, { "epoch": 0.10714127921532819, "grad_norm": 0.5307368636131287, "learning_rate": 4.285588053371035e-06, "loss": 0.4263, "step": 7275 }, { "epoch": 0.10750946230541523, "grad_norm": 0.572047770023346, "learning_rate": 4.3003151600836505e-06, "loss": 0.4298, "step": 7300 }, { "epoch": 0.10787764539550228, "grad_norm": 0.7120387554168701, "learning_rate": 4.315042266796265e-06, "loss": 0.4186, "step": 7325 }, { "epoch": 0.10824582848558931, "grad_norm": 0.5541498064994812, "learning_rate": 4.329769373508881e-06, "loss": 0.427, "step": 7350 }, { "epoch": 0.10861401157567635, "grad_norm": 0.6520849466323853, "learning_rate": 4.344496480221496e-06, "loss": 0.432, "step": 7375 }, { "epoch": 0.1089821946657634, "grad_norm": 0.5903097987174988, "learning_rate": 4.359223586934112e-06, "loss": 0.425, "step": 7400 }, { "epoch": 0.10935037775585042, "grad_norm": 0.7111144661903381, "learning_rate": 4.373950693646726e-06, "loss": 0.4266, "step": 7425 }, { "epoch": 0.10971856084593747, "grad_norm": 0.7609113454818726, "learning_rate": 4.388677800359342e-06, "loss": 0.4227, "step": 7450 }, { "epoch": 0.11008674393602451, "grad_norm": 0.6072840690612793, "learning_rate": 4.403404907071957e-06, "loss": 0.4217, "step": 7475 }, { "epoch": 0.11045492702611154, "grad_norm": 0.5471121668815613, "learning_rate": 4.418132013784572e-06, "loss": 0.4202, "step": 7500 }, { "epoch": 0.11082311011619858, "grad_norm": 0.7271543741226196, "learning_rate": 4.4328591204971875e-06, "loss": 0.422, "step": 7525 }, { "epoch": 0.11119129320628562, "grad_norm": 0.6403235793113708, "learning_rate": 4.447586227209802e-06, "loss": 0.421, "step": 7550 }, { "epoch": 0.11155947629637265, "grad_norm": 0.6067976355552673, "learning_rate": 4.462313333922418e-06, "loss": 0.42, "step": 7575 }, { "epoch": 0.1119276593864597, "grad_norm": 0.5522935390472412, "learning_rate": 4.477040440635033e-06, "loss": 0.4218, "step": 7600 }, { "epoch": 0.11229584247654674, "grad_norm": 0.5163924694061279, "learning_rate": 4.491767547347649e-06, "loss": 0.4217, "step": 7625 }, { "epoch": 0.11266402556663378, "grad_norm": 0.5037286877632141, "learning_rate": 4.506494654060264e-06, "loss": 0.4151, "step": 7650 }, { "epoch": 0.11303220865672081, "grad_norm": 0.7009169459342957, "learning_rate": 4.521221760772879e-06, "loss": 0.4181, "step": 7675 }, { "epoch": 0.11340039174680785, "grad_norm": 0.6608708500862122, "learning_rate": 4.535948867485494e-06, "loss": 0.4206, "step": 7700 }, { "epoch": 0.1137685748368949, "grad_norm": 0.5974350571632385, "learning_rate": 4.55067597419811e-06, "loss": 0.4079, "step": 7725 }, { "epoch": 0.11413675792698193, "grad_norm": 0.5736297369003296, "learning_rate": 4.5654030809107244e-06, "loss": 0.4073, "step": 7750 }, { "epoch": 0.11450494101706897, "grad_norm": 0.6583185791969299, "learning_rate": 4.58013018762334e-06, "loss": 0.4115, "step": 7775 }, { "epoch": 0.11487312410715601, "grad_norm": 0.5225375294685364, "learning_rate": 4.5948572943359554e-06, "loss": 0.4137, "step": 7800 }, { "epoch": 0.11524130719724304, "grad_norm": 0.6312311291694641, "learning_rate": 4.60958440104857e-06, "loss": 0.4096, "step": 7825 }, { "epoch": 0.11560949028733009, "grad_norm": 0.5658524036407471, "learning_rate": 4.624311507761186e-06, "loss": 0.4067, "step": 7850 }, { "epoch": 0.11597767337741713, "grad_norm": 0.5291056036949158, "learning_rate": 4.639038614473801e-06, "loss": 0.4019, "step": 7875 }, { "epoch": 0.11634585646750416, "grad_norm": 0.638701856136322, "learning_rate": 4.653765721186417e-06, "loss": 0.3978, "step": 7900 }, { "epoch": 0.1167140395575912, "grad_norm": 0.5484588146209717, "learning_rate": 4.668492827899031e-06, "loss": 0.4032, "step": 7925 }, { "epoch": 0.11708222264767824, "grad_norm": 0.6266508102416992, "learning_rate": 4.683219934611647e-06, "loss": 0.4034, "step": 7950 }, { "epoch": 0.11745040573776527, "grad_norm": 0.7253883481025696, "learning_rate": 4.697947041324261e-06, "loss": 0.3979, "step": 7975 }, { "epoch": 0.11781858882785232, "grad_norm": 0.6128943562507629, "learning_rate": 4.712674148036877e-06, "loss": 0.4051, "step": 8000 }, { "epoch": 0.11818677191793936, "grad_norm": 0.8992940187454224, "learning_rate": 4.727401254749492e-06, "loss": 0.4065, "step": 8025 }, { "epoch": 0.11855495500802639, "grad_norm": 0.6067450046539307, "learning_rate": 4.742128361462108e-06, "loss": 0.3956, "step": 8050 }, { "epoch": 0.11892313809811343, "grad_norm": 0.5823917984962463, "learning_rate": 4.7568554681747225e-06, "loss": 0.3936, "step": 8075 }, { "epoch": 0.11929132118820047, "grad_norm": 0.7057197093963623, "learning_rate": 4.771582574887338e-06, "loss": 0.4008, "step": 8100 }, { "epoch": 0.1196595042782875, "grad_norm": 0.780258297920227, "learning_rate": 4.7863096815999535e-06, "loss": 0.3969, "step": 8125 }, { "epoch": 0.12002768736837455, "grad_norm": 0.8426298499107361, "learning_rate": 4.801036788312568e-06, "loss": 0.4089, "step": 8150 }, { "epoch": 0.12039587045846159, "grad_norm": 0.679779589176178, "learning_rate": 4.815763895025184e-06, "loss": 0.4062, "step": 8175 }, { "epoch": 0.12076405354854862, "grad_norm": 0.5497132539749146, "learning_rate": 4.830491001737798e-06, "loss": 0.3992, "step": 8200 }, { "epoch": 0.12113223663863566, "grad_norm": 0.7404468059539795, "learning_rate": 4.845218108450414e-06, "loss": 0.3931, "step": 8225 }, { "epoch": 0.1215004197287227, "grad_norm": 0.647087574005127, "learning_rate": 4.859945215163029e-06, "loss": 0.3994, "step": 8250 }, { "epoch": 0.12186860281880973, "grad_norm": 0.7880210280418396, "learning_rate": 4.874672321875645e-06, "loss": 0.3974, "step": 8275 }, { "epoch": 0.12223678590889678, "grad_norm": 0.8280056715011597, "learning_rate": 4.88939942858826e-06, "loss": 0.3905, "step": 8300 }, { "epoch": 0.12260496899898382, "grad_norm": 0.6425456404685974, "learning_rate": 4.904126535300876e-06, "loss": 0.3957, "step": 8325 }, { "epoch": 0.12297315208907085, "grad_norm": 0.6932951211929321, "learning_rate": 4.9188536420134905e-06, "loss": 0.3937, "step": 8350 }, { "epoch": 0.1233413351791579, "grad_norm": 0.7406250238418579, "learning_rate": 4.933580748726106e-06, "loss": 0.3852, "step": 8375 }, { "epoch": 0.12370951826924494, "grad_norm": 0.6257508397102356, "learning_rate": 4.948307855438721e-06, "loss": 0.3925, "step": 8400 }, { "epoch": 0.12407770135933197, "grad_norm": 0.749832272529602, "learning_rate": 4.963034962151336e-06, "loss": 0.3916, "step": 8425 }, { "epoch": 0.12444588444941901, "grad_norm": 0.6868317723274231, "learning_rate": 4.977762068863952e-06, "loss": 0.392, "step": 8450 }, { "epoch": 0.12481406753950605, "grad_norm": 0.8427602648735046, "learning_rate": 4.992489175576566e-06, "loss": 0.4006, "step": 8475 }, { "epoch": 0.1251822506295931, "grad_norm": 0.6355247497558594, "learning_rate": 5.007216282289183e-06, "loss": 0.3928, "step": 8500 }, { "epoch": 0.1255504337196801, "grad_norm": 0.7556335926055908, "learning_rate": 5.0219433890017964e-06, "loss": 0.393, "step": 8525 }, { "epoch": 0.12591861680976715, "grad_norm": 0.6544154286384583, "learning_rate": 5.036670495714412e-06, "loss": 0.388, "step": 8550 }, { "epoch": 0.1262867998998542, "grad_norm": 0.7124810218811035, "learning_rate": 5.0513976024270274e-06, "loss": 0.3793, "step": 8575 }, { "epoch": 0.12665498298994124, "grad_norm": 1.151803731918335, "learning_rate": 5.066124709139643e-06, "loss": 0.3837, "step": 8600 }, { "epoch": 0.12702316608002828, "grad_norm": 0.8259851932525635, "learning_rate": 5.080851815852258e-06, "loss": 0.3858, "step": 8625 }, { "epoch": 0.12739134917011533, "grad_norm": 0.9365520477294922, "learning_rate": 5.095578922564873e-06, "loss": 0.3875, "step": 8650 }, { "epoch": 0.12775953226020234, "grad_norm": 0.6092934012413025, "learning_rate": 5.110306029277489e-06, "loss": 0.3819, "step": 8675 }, { "epoch": 0.12812771535028938, "grad_norm": 0.665541410446167, "learning_rate": 5.125033135990104e-06, "loss": 0.3822, "step": 8700 }, { "epoch": 0.12849589844037643, "grad_norm": 0.5750928521156311, "learning_rate": 5.13976024270272e-06, "loss": 0.3794, "step": 8725 }, { "epoch": 0.12886408153046347, "grad_norm": 0.8719878196716309, "learning_rate": 5.154487349415335e-06, "loss": 0.3839, "step": 8750 }, { "epoch": 0.1292322646205505, "grad_norm": 0.5725120902061462, "learning_rate": 5.16921445612795e-06, "loss": 0.3827, "step": 8775 }, { "epoch": 0.12960044771063756, "grad_norm": 0.6080131530761719, "learning_rate": 5.183941562840564e-06, "loss": 0.3784, "step": 8800 }, { "epoch": 0.12996863080072457, "grad_norm": 0.7401121854782104, "learning_rate": 5.19866866955318e-06, "loss": 0.3785, "step": 8825 }, { "epoch": 0.13033681389081161, "grad_norm": 0.637549877166748, "learning_rate": 5.2133957762657946e-06, "loss": 0.369, "step": 8850 }, { "epoch": 0.13070499698089866, "grad_norm": 0.6860336065292358, "learning_rate": 5.22812288297841e-06, "loss": 0.3767, "step": 8875 }, { "epoch": 0.1310731800709857, "grad_norm": 0.6861122846603394, "learning_rate": 5.2428499896910256e-06, "loss": 0.3764, "step": 8900 }, { "epoch": 0.13144136316107274, "grad_norm": 0.608595073223114, "learning_rate": 5.257577096403641e-06, "loss": 0.3771, "step": 8925 }, { "epoch": 0.1318095462511598, "grad_norm": 0.6960775852203369, "learning_rate": 5.2723042031162566e-06, "loss": 0.3757, "step": 8950 }, { "epoch": 0.1321777293412468, "grad_norm": 0.9528156518936157, "learning_rate": 5.287031309828872e-06, "loss": 0.3786, "step": 8975 }, { "epoch": 0.13254591243133385, "grad_norm": 1.1325275897979736, "learning_rate": 5.301758416541487e-06, "loss": 0.3728, "step": 9000 }, { "epoch": 0.1329140955214209, "grad_norm": 0.6594205498695374, "learning_rate": 5.316485523254102e-06, "loss": 0.3839, "step": 9025 }, { "epoch": 0.13328227861150793, "grad_norm": 0.6903712749481201, "learning_rate": 5.331212629966717e-06, "loss": 0.3671, "step": 9050 }, { "epoch": 0.13365046170159497, "grad_norm": 0.8763689398765564, "learning_rate": 5.345939736679332e-06, "loss": 0.3657, "step": 9075 }, { "epoch": 0.13401864479168202, "grad_norm": 0.7336785197257996, "learning_rate": 5.360666843391947e-06, "loss": 0.3725, "step": 9100 }, { "epoch": 0.13438682788176903, "grad_norm": 0.7568676471710205, "learning_rate": 5.3753939501045625e-06, "loss": 0.3696, "step": 9125 }, { "epoch": 0.13475501097185608, "grad_norm": 0.7238675951957703, "learning_rate": 5.390121056817178e-06, "loss": 0.3695, "step": 9150 }, { "epoch": 0.13512319406194312, "grad_norm": 0.6972078084945679, "learning_rate": 5.4048481635297935e-06, "loss": 0.3648, "step": 9175 }, { "epoch": 0.13549137715203016, "grad_norm": 0.6761888861656189, "learning_rate": 5.419575270242409e-06, "loss": 0.3586, "step": 9200 }, { "epoch": 0.1358595602421172, "grad_norm": 0.7237305045127869, "learning_rate": 5.434302376955024e-06, "loss": 0.373, "step": 9225 }, { "epoch": 0.13622774333220425, "grad_norm": 1.0163944959640503, "learning_rate": 5.449029483667639e-06, "loss": 0.3635, "step": 9250 }, { "epoch": 0.13659592642229126, "grad_norm": 0.7047263979911804, "learning_rate": 5.463756590380255e-06, "loss": 0.366, "step": 9275 }, { "epoch": 0.1369641095123783, "grad_norm": 0.8444983959197998, "learning_rate": 5.47848369709287e-06, "loss": 0.378, "step": 9300 }, { "epoch": 0.13733229260246535, "grad_norm": 0.7295616865158081, "learning_rate": 5.493210803805484e-06, "loss": 0.3649, "step": 9325 }, { "epoch": 0.1377004756925524, "grad_norm": 0.6368780136108398, "learning_rate": 5.5079379105180995e-06, "loss": 0.3678, "step": 9350 }, { "epoch": 0.13806865878263944, "grad_norm": 0.8212118148803711, "learning_rate": 5.522665017230715e-06, "loss": 0.365, "step": 9375 }, { "epoch": 0.13843684187272648, "grad_norm": 0.7287224531173706, "learning_rate": 5.5373921239433305e-06, "loss": 0.3691, "step": 9400 }, { "epoch": 0.1388050249628135, "grad_norm": 0.641190767288208, "learning_rate": 5.552119230655946e-06, "loss": 0.365, "step": 9425 }, { "epoch": 0.13917320805290054, "grad_norm": 0.6722202897071838, "learning_rate": 5.5668463373685615e-06, "loss": 0.3496, "step": 9450 }, { "epoch": 0.13954139114298758, "grad_norm": 0.5809231996536255, "learning_rate": 5.581573444081176e-06, "loss": 0.3659, "step": 9475 }, { "epoch": 0.13990957423307462, "grad_norm": 1.0955806970596313, "learning_rate": 5.596300550793792e-06, "loss": 0.3623, "step": 9500 }, { "epoch": 0.14027775732316167, "grad_norm": 0.7431578040122986, "learning_rate": 5.611027657506407e-06, "loss": 0.3474, "step": 9525 }, { "epoch": 0.1406459404132487, "grad_norm": 0.7483206987380981, "learning_rate": 5.625754764219023e-06, "loss": 0.3574, "step": 9550 }, { "epoch": 0.14101412350333573, "grad_norm": 0.8954830169677734, "learning_rate": 5.640481870931636e-06, "loss": 0.3637, "step": 9575 }, { "epoch": 0.14138230659342277, "grad_norm": 0.6910772323608398, "learning_rate": 5.655208977644252e-06, "loss": 0.3559, "step": 9600 }, { "epoch": 0.1417504896835098, "grad_norm": 0.6799713969230652, "learning_rate": 5.669936084356867e-06, "loss": 0.3558, "step": 9625 }, { "epoch": 0.14211867277359685, "grad_norm": 0.7354669570922852, "learning_rate": 5.684663191069483e-06, "loss": 0.3578, "step": 9650 }, { "epoch": 0.1424868558636839, "grad_norm": 0.792984664440155, "learning_rate": 5.699390297782098e-06, "loss": 0.3544, "step": 9675 }, { "epoch": 0.14285503895377094, "grad_norm": 0.6234838962554932, "learning_rate": 5.714117404494713e-06, "loss": 0.3529, "step": 9700 }, { "epoch": 0.14322322204385796, "grad_norm": 0.8698832392692566, "learning_rate": 5.7288445112073286e-06, "loss": 0.3512, "step": 9725 }, { "epoch": 0.143591405133945, "grad_norm": 0.685492753982544, "learning_rate": 5.743571617919944e-06, "loss": 0.3587, "step": 9750 }, { "epoch": 0.14395958822403204, "grad_norm": 0.8978196382522583, "learning_rate": 5.7582987246325596e-06, "loss": 0.3445, "step": 9775 }, { "epoch": 0.14432777131411909, "grad_norm": 0.6405984163284302, "learning_rate": 5.773025831345175e-06, "loss": 0.3541, "step": 9800 }, { "epoch": 0.14469595440420613, "grad_norm": 0.7188529968261719, "learning_rate": 5.7877529380577906e-06, "loss": 0.3471, "step": 9825 }, { "epoch": 0.14506413749429317, "grad_norm": 0.7249335050582886, "learning_rate": 5.802480044770404e-06, "loss": 0.3468, "step": 9850 }, { "epoch": 0.1454323205843802, "grad_norm": 0.7496221661567688, "learning_rate": 5.81720715148302e-06, "loss": 0.3546, "step": 9875 }, { "epoch": 0.14580050367446723, "grad_norm": 0.745561420917511, "learning_rate": 5.831934258195635e-06, "loss": 0.3511, "step": 9900 }, { "epoch": 0.14616868676455427, "grad_norm": 0.7024303078651428, "learning_rate": 5.84666136490825e-06, "loss": 0.3505, "step": 9925 }, { "epoch": 0.14653686985464132, "grad_norm": 0.7497286796569824, "learning_rate": 5.8613884716208655e-06, "loss": 0.3472, "step": 9950 }, { "epoch": 0.14690505294472836, "grad_norm": 0.6580822467803955, "learning_rate": 5.876115578333481e-06, "loss": 0.3498, "step": 9975 }, { "epoch": 0.1472732360348154, "grad_norm": 0.7973480224609375, "learning_rate": 5.8908426850460965e-06, "loss": 0.3454, "step": 10000 }, { "epoch": 0.14764141912490242, "grad_norm": 0.7136325836181641, "learning_rate": 5.9049807074902066e-06, "loss": 0.3511, "step": 10025 }, { "epoch": 0.14800960221498946, "grad_norm": 1.0403891801834106, "learning_rate": 5.919707814202822e-06, "loss": 0.3442, "step": 10050 }, { "epoch": 0.1483777853050765, "grad_norm": 0.9081152677536011, "learning_rate": 5.9344349209154376e-06, "loss": 0.3388, "step": 10075 }, { "epoch": 0.14874596839516355, "grad_norm": 0.7664965391159058, "learning_rate": 5.949162027628052e-06, "loss": 0.3445, "step": 10100 }, { "epoch": 0.1491141514852506, "grad_norm": 0.6614516973495483, "learning_rate": 5.963889134340668e-06, "loss": 0.3441, "step": 10125 }, { "epoch": 0.14948233457533763, "grad_norm": 0.7747263312339783, "learning_rate": 5.978616241053283e-06, "loss": 0.3414, "step": 10150 }, { "epoch": 0.14985051766542468, "grad_norm": 0.7055206894874573, "learning_rate": 5.993343347765899e-06, "loss": 0.3478, "step": 10175 }, { "epoch": 0.1502187007555117, "grad_norm": 1.1476428508758545, "learning_rate": 6.008070454478514e-06, "loss": 0.3476, "step": 10200 }, { "epoch": 0.15058688384559873, "grad_norm": 0.6901858448982239, "learning_rate": 6.02279756119113e-06, "loss": 0.3403, "step": 10225 }, { "epoch": 0.15095506693568578, "grad_norm": 0.8119340538978577, "learning_rate": 6.0375246679037435e-06, "loss": 0.3345, "step": 10250 }, { "epoch": 0.15132325002577282, "grad_norm": 0.7182418704032898, "learning_rate": 6.052251774616359e-06, "loss": 0.3444, "step": 10275 }, { "epoch": 0.15169143311585986, "grad_norm": 0.6877521872520447, "learning_rate": 6.0669788813289745e-06, "loss": 0.3368, "step": 10300 }, { "epoch": 0.1520596162059469, "grad_norm": 0.7715916633605957, "learning_rate": 6.08170598804159e-06, "loss": 0.3411, "step": 10325 }, { "epoch": 0.15242779929603392, "grad_norm": 0.866104006767273, "learning_rate": 6.096433094754205e-06, "loss": 0.3423, "step": 10350 }, { "epoch": 0.15279598238612097, "grad_norm": 1.0410808324813843, "learning_rate": 6.11116020146682e-06, "loss": 0.3347, "step": 10375 }, { "epoch": 0.153164165476208, "grad_norm": 0.8808249831199646, "learning_rate": 6.125887308179436e-06, "loss": 0.3439, "step": 10400 }, { "epoch": 0.15353234856629505, "grad_norm": 0.7519041895866394, "learning_rate": 6.140614414892051e-06, "loss": 0.3384, "step": 10425 }, { "epoch": 0.1539005316563821, "grad_norm": 0.7106051445007324, "learning_rate": 6.155341521604667e-06, "loss": 0.3342, "step": 10450 }, { "epoch": 0.15426871474646914, "grad_norm": 0.8196017742156982, "learning_rate": 6.170068628317281e-06, "loss": 0.3318, "step": 10475 }, { "epoch": 0.15463689783655615, "grad_norm": 0.8981484770774841, "learning_rate": 6.184795735029897e-06, "loss": 0.3303, "step": 10500 }, { "epoch": 0.1550050809266432, "grad_norm": 0.8908175826072693, "learning_rate": 6.1995228417425115e-06, "loss": 0.3337, "step": 10525 }, { "epoch": 0.15537326401673024, "grad_norm": 1.051892876625061, "learning_rate": 6.214249948455127e-06, "loss": 0.3453, "step": 10550 }, { "epoch": 0.15574144710681728, "grad_norm": 0.760026216506958, "learning_rate": 6.228977055167742e-06, "loss": 0.3345, "step": 10575 }, { "epoch": 0.15610963019690433, "grad_norm": 0.7725528478622437, "learning_rate": 6.243704161880357e-06, "loss": 0.3324, "step": 10600 }, { "epoch": 0.15647781328699137, "grad_norm": 0.7681843042373657, "learning_rate": 6.258431268592973e-06, "loss": 0.3337, "step": 10625 }, { "epoch": 0.15684599637707838, "grad_norm": 0.8469594717025757, "learning_rate": 6.273158375305588e-06, "loss": 0.3313, "step": 10650 }, { "epoch": 0.15721417946716543, "grad_norm": 0.9619236588478088, "learning_rate": 6.287885482018204e-06, "loss": 0.3312, "step": 10675 }, { "epoch": 0.15758236255725247, "grad_norm": 0.9958358407020569, "learning_rate": 6.302612588730819e-06, "loss": 0.3319, "step": 10700 }, { "epoch": 0.1579505456473395, "grad_norm": 0.8480785489082336, "learning_rate": 6.317339695443434e-06, "loss": 0.3169, "step": 10725 }, { "epoch": 0.15831872873742656, "grad_norm": 0.7890802025794983, "learning_rate": 6.332066802156049e-06, "loss": 0.3231, "step": 10750 }, { "epoch": 0.1586869118275136, "grad_norm": 0.735247015953064, "learning_rate": 6.346793908868664e-06, "loss": 0.3224, "step": 10775 }, { "epoch": 0.15905509491760061, "grad_norm": 0.7928955554962158, "learning_rate": 6.361521015581279e-06, "loss": 0.3273, "step": 10800 }, { "epoch": 0.15942327800768766, "grad_norm": 0.8884017467498779, "learning_rate": 6.376248122293894e-06, "loss": 0.3168, "step": 10825 }, { "epoch": 0.1597914610977747, "grad_norm": 0.8429638147354126, "learning_rate": 6.3909752290065096e-06, "loss": 0.3295, "step": 10850 }, { "epoch": 0.16015964418786174, "grad_norm": 0.8525298237800598, "learning_rate": 6.405702335719125e-06, "loss": 0.3272, "step": 10875 }, { "epoch": 0.1605278272779488, "grad_norm": 0.806425154209137, "learning_rate": 6.4204294424317406e-06, "loss": 0.3165, "step": 10900 }, { "epoch": 0.16089601036803583, "grad_norm": 0.7422736287117004, "learning_rate": 6.435156549144356e-06, "loss": 0.3215, "step": 10925 }, { "epoch": 0.16126419345812285, "grad_norm": 1.050544261932373, "learning_rate": 6.449883655856971e-06, "loss": 0.3145, "step": 10950 }, { "epoch": 0.1616323765482099, "grad_norm": 0.7095434069633484, "learning_rate": 6.464610762569586e-06, "loss": 0.3243, "step": 10975 }, { "epoch": 0.16200055963829693, "grad_norm": 0.8569414019584656, "learning_rate": 6.479337869282202e-06, "loss": 0.329, "step": 11000 }, { "epoch": 0.16236874272838397, "grad_norm": 0.704474687576294, "learning_rate": 6.494064975994817e-06, "loss": 0.3165, "step": 11025 }, { "epoch": 0.16273692581847102, "grad_norm": 1.3744553327560425, "learning_rate": 6.508792082707431e-06, "loss": 0.3207, "step": 11050 }, { "epoch": 0.16310510890855806, "grad_norm": 0.9816151261329651, "learning_rate": 6.5235191894200465e-06, "loss": 0.3131, "step": 11075 }, { "epoch": 0.16347329199864508, "grad_norm": 1.0555229187011719, "learning_rate": 6.538246296132662e-06, "loss": 0.3264, "step": 11100 }, { "epoch": 0.16384147508873212, "grad_norm": 0.816231906414032, "learning_rate": 6.5529734028452775e-06, "loss": 0.3153, "step": 11125 }, { "epoch": 0.16420965817881916, "grad_norm": 0.8610765933990479, "learning_rate": 6.567700509557893e-06, "loss": 0.3156, "step": 11150 }, { "epoch": 0.1645778412689062, "grad_norm": 0.8233363628387451, "learning_rate": 6.582427616270508e-06, "loss": 0.3165, "step": 11175 }, { "epoch": 0.16494602435899325, "grad_norm": 0.8406914472579956, "learning_rate": 6.597154722983123e-06, "loss": 0.3234, "step": 11200 }, { "epoch": 0.1653142074490803, "grad_norm": 0.7772729992866516, "learning_rate": 6.611881829695739e-06, "loss": 0.32, "step": 11225 }, { "epoch": 0.1656823905391673, "grad_norm": 0.9047859311103821, "learning_rate": 6.626608936408354e-06, "loss": 0.3127, "step": 11250 }, { "epoch": 0.16605057362925435, "grad_norm": 0.6934922933578491, "learning_rate": 6.64133604312097e-06, "loss": 0.3125, "step": 11275 }, { "epoch": 0.1664187567193414, "grad_norm": 0.8364558219909668, "learning_rate": 6.6560631498335835e-06, "loss": 0.319, "step": 11300 }, { "epoch": 0.16678693980942844, "grad_norm": 0.8287280797958374, "learning_rate": 6.670790256546199e-06, "loss": 0.3076, "step": 11325 }, { "epoch": 0.16715512289951548, "grad_norm": 0.8386410474777222, "learning_rate": 6.6855173632588145e-06, "loss": 0.3178, "step": 11350 }, { "epoch": 0.16752330598960252, "grad_norm": 1.0043636560440063, "learning_rate": 6.70024446997143e-06, "loss": 0.3279, "step": 11375 }, { "epoch": 0.16789148907968954, "grad_norm": 0.7347243428230286, "learning_rate": 6.7149715766840455e-06, "loss": 0.3041, "step": 11400 }, { "epoch": 0.16825967216977658, "grad_norm": 0.7245997786521912, "learning_rate": 6.72969868339666e-06, "loss": 0.3117, "step": 11425 }, { "epoch": 0.16862785525986362, "grad_norm": 1.2553895711898804, "learning_rate": 6.744425790109276e-06, "loss": 0.3149, "step": 11450 }, { "epoch": 0.16899603834995067, "grad_norm": 0.9861347079277039, "learning_rate": 6.759152896821891e-06, "loss": 0.3043, "step": 11475 }, { "epoch": 0.1693642214400377, "grad_norm": 0.8539161682128906, "learning_rate": 6.773880003534507e-06, "loss": 0.3157, "step": 11500 }, { "epoch": 0.16973240453012475, "grad_norm": 0.939351499080658, "learning_rate": 6.788607110247122e-06, "loss": 0.3093, "step": 11525 }, { "epoch": 0.17010058762021177, "grad_norm": 0.8767924904823303, "learning_rate": 6.803334216959737e-06, "loss": 0.3046, "step": 11550 }, { "epoch": 0.1704687707102988, "grad_norm": 0.8383442163467407, "learning_rate": 6.8180613236723514e-06, "loss": 0.3143, "step": 11575 }, { "epoch": 0.17083695380038585, "grad_norm": 0.8109354972839355, "learning_rate": 6.832788430384967e-06, "loss": 0.2992, "step": 11600 }, { "epoch": 0.1712051368904729, "grad_norm": 0.8896241188049316, "learning_rate": 6.8475155370975824e-06, "loss": 0.3104, "step": 11625 }, { "epoch": 0.17157331998055994, "grad_norm": 0.8169327974319458, "learning_rate": 6.862242643810197e-06, "loss": 0.308, "step": 11650 }, { "epoch": 0.17194150307064698, "grad_norm": 1.13442862033844, "learning_rate": 6.876969750522813e-06, "loss": 0.3081, "step": 11675 }, { "epoch": 0.172309686160734, "grad_norm": 0.8486407995223999, "learning_rate": 6.891696857235428e-06, "loss": 0.3046, "step": 11700 }, { "epoch": 0.17267786925082104, "grad_norm": 1.0129576921463013, "learning_rate": 6.906423963948044e-06, "loss": 0.3037, "step": 11725 }, { "epoch": 0.17304605234090809, "grad_norm": 0.9492967128753662, "learning_rate": 6.921151070660659e-06, "loss": 0.3046, "step": 11750 }, { "epoch": 0.17341423543099513, "grad_norm": 0.9998227953910828, "learning_rate": 6.935878177373275e-06, "loss": 0.294, "step": 11775 }, { "epoch": 0.17378241852108217, "grad_norm": 0.7796307802200317, "learning_rate": 6.950605284085889e-06, "loss": 0.2962, "step": 11800 }, { "epoch": 0.17415060161116921, "grad_norm": 0.8251902461051941, "learning_rate": 6.965332390798504e-06, "loss": 0.2982, "step": 11825 }, { "epoch": 0.17451878470125623, "grad_norm": 0.7870060801506042, "learning_rate": 6.980059497511119e-06, "loss": 0.2966, "step": 11850 }, { "epoch": 0.17488696779134327, "grad_norm": 0.9390984177589417, "learning_rate": 6.994786604223734e-06, "loss": 0.3103, "step": 11875 }, { "epoch": 0.17525515088143032, "grad_norm": 0.9660308957099915, "learning_rate": 7.0095137109363496e-06, "loss": 0.3009, "step": 11900 }, { "epoch": 0.17562333397151736, "grad_norm": 1.0213398933410645, "learning_rate": 7.024240817648965e-06, "loss": 0.306, "step": 11925 }, { "epoch": 0.1759915170616044, "grad_norm": 1.5115578174591064, "learning_rate": 7.0389679243615805e-06, "loss": 0.2988, "step": 11950 }, { "epoch": 0.17635970015169145, "grad_norm": 0.902195930480957, "learning_rate": 7.053695031074196e-06, "loss": 0.3002, "step": 11975 }, { "epoch": 0.17672788324177846, "grad_norm": 0.783263087272644, "learning_rate": 7.0684221377868115e-06, "loss": 0.2987, "step": 12000 }, { "epoch": 0.1770960663318655, "grad_norm": 0.7162420153617859, "learning_rate": 7.082560160230922e-06, "loss": 0.2949, "step": 12025 }, { "epoch": 0.17746424942195255, "grad_norm": 0.9869486093521118, "learning_rate": 7.097287266943536e-06, "loss": 0.2955, "step": 12050 }, { "epoch": 0.1778324325120396, "grad_norm": 0.9826450943946838, "learning_rate": 7.112014373656152e-06, "loss": 0.3006, "step": 12075 }, { "epoch": 0.17820061560212663, "grad_norm": 0.9040399789810181, "learning_rate": 7.126741480368767e-06, "loss": 0.2947, "step": 12100 }, { "epoch": 0.17856879869221368, "grad_norm": 1.1333189010620117, "learning_rate": 7.141468587081383e-06, "loss": 0.2933, "step": 12125 }, { "epoch": 0.1789369817823007, "grad_norm": 0.8149101138114929, "learning_rate": 7.156195693793998e-06, "loss": 0.298, "step": 12150 }, { "epoch": 0.17930516487238773, "grad_norm": 0.7296385765075684, "learning_rate": 7.170922800506614e-06, "loss": 0.2944, "step": 12175 }, { "epoch": 0.17967334796247478, "grad_norm": 0.8775405883789062, "learning_rate": 7.185649907219228e-06, "loss": 0.2978, "step": 12200 }, { "epoch": 0.18004153105256182, "grad_norm": 0.8729538321495056, "learning_rate": 7.200377013931844e-06, "loss": 0.2932, "step": 12225 }, { "epoch": 0.18040971414264886, "grad_norm": 0.8459117412567139, "learning_rate": 7.2151041206444585e-06, "loss": 0.2961, "step": 12250 }, { "epoch": 0.1807778972327359, "grad_norm": 0.9352242946624756, "learning_rate": 7.229831227357074e-06, "loss": 0.307, "step": 12275 }, { "epoch": 0.18114608032282292, "grad_norm": 1.170359492301941, "learning_rate": 7.244558334069689e-06, "loss": 0.2913, "step": 12300 }, { "epoch": 0.18151426341290997, "grad_norm": 0.7829805612564087, "learning_rate": 7.259285440782304e-06, "loss": 0.2972, "step": 12325 }, { "epoch": 0.181882446502997, "grad_norm": 0.9527215361595154, "learning_rate": 7.27401254749492e-06, "loss": 0.2876, "step": 12350 }, { "epoch": 0.18225062959308405, "grad_norm": 0.7842198014259338, "learning_rate": 7.288739654207535e-06, "loss": 0.2925, "step": 12375 }, { "epoch": 0.1826188126831711, "grad_norm": 0.9339554905891418, "learning_rate": 7.303466760920151e-06, "loss": 0.2872, "step": 12400 }, { "epoch": 0.18298699577325814, "grad_norm": 0.7690834403038025, "learning_rate": 7.318193867632765e-06, "loss": 0.2916, "step": 12425 }, { "epoch": 0.18335517886334515, "grad_norm": 0.8643265962600708, "learning_rate": 7.332920974345381e-06, "loss": 0.2932, "step": 12450 }, { "epoch": 0.1837233619534322, "grad_norm": 0.9393995404243469, "learning_rate": 7.347648081057996e-06, "loss": 0.2958, "step": 12475 }, { "epoch": 0.18409154504351924, "grad_norm": 0.9852946400642395, "learning_rate": 7.362375187770611e-06, "loss": 0.2893, "step": 12500 }, { "epoch": 0.18445972813360628, "grad_norm": 1.0065406560897827, "learning_rate": 7.377102294483226e-06, "loss": 0.2961, "step": 12525 }, { "epoch": 0.18482791122369333, "grad_norm": 0.8727032542228699, "learning_rate": 7.391829401195841e-06, "loss": 0.2928, "step": 12550 }, { "epoch": 0.18519609431378037, "grad_norm": 1.0887058973312378, "learning_rate": 7.406556507908457e-06, "loss": 0.2936, "step": 12575 }, { "epoch": 0.18556427740386738, "grad_norm": 0.8902204632759094, "learning_rate": 7.421283614621072e-06, "loss": 0.2928, "step": 12600 }, { "epoch": 0.18593246049395443, "grad_norm": 0.9425715208053589, "learning_rate": 7.436010721333688e-06, "loss": 0.2857, "step": 12625 }, { "epoch": 0.18630064358404147, "grad_norm": 1.170000433921814, "learning_rate": 7.450737828046303e-06, "loss": 0.2958, "step": 12650 }, { "epoch": 0.1866688266741285, "grad_norm": 1.0390965938568115, "learning_rate": 7.465464934758918e-06, "loss": 0.2812, "step": 12675 }, { "epoch": 0.18703700976421556, "grad_norm": 1.0281293392181396, "learning_rate": 7.480192041471533e-06, "loss": 0.2878, "step": 12700 }, { "epoch": 0.1874051928543026, "grad_norm": 1.0428563356399536, "learning_rate": 7.494919148184149e-06, "loss": 0.2854, "step": 12725 }, { "epoch": 0.18777337594438961, "grad_norm": 0.9071276187896729, "learning_rate": 7.509646254896764e-06, "loss": 0.2808, "step": 12750 }, { "epoch": 0.18814155903447666, "grad_norm": 0.9674225449562073, "learning_rate": 7.524373361609378e-06, "loss": 0.2914, "step": 12775 }, { "epoch": 0.1885097421245637, "grad_norm": 0.8867351412773132, "learning_rate": 7.539100468321994e-06, "loss": 0.2809, "step": 12800 }, { "epoch": 0.18887792521465074, "grad_norm": 1.048401951789856, "learning_rate": 7.553827575034609e-06, "loss": 0.2844, "step": 12825 }, { "epoch": 0.1892461083047378, "grad_norm": 1.1923837661743164, "learning_rate": 7.568554681747225e-06, "loss": 0.2846, "step": 12850 }, { "epoch": 0.18961429139482483, "grad_norm": 1.000769853591919, "learning_rate": 7.58328178845984e-06, "loss": 0.2825, "step": 12875 }, { "epoch": 0.18998247448491185, "grad_norm": 0.9111427664756775, "learning_rate": 7.598008895172455e-06, "loss": 0.2867, "step": 12900 }, { "epoch": 0.1903506575749989, "grad_norm": 0.8391136527061462, "learning_rate": 7.61273600188507e-06, "loss": 0.2773, "step": 12925 }, { "epoch": 0.19071884066508593, "grad_norm": 1.2547825574874878, "learning_rate": 7.627463108597686e-06, "loss": 0.2799, "step": 12950 }, { "epoch": 0.19108702375517297, "grad_norm": 0.8818742632865906, "learning_rate": 7.6421902153103e-06, "loss": 0.2744, "step": 12975 }, { "epoch": 0.19145520684526002, "grad_norm": 0.884929358959198, "learning_rate": 7.656917322022917e-06, "loss": 0.2858, "step": 13000 }, { "epoch": 0.19182338993534706, "grad_norm": 1.0774343013763428, "learning_rate": 7.671644428735531e-06, "loss": 0.2904, "step": 13025 }, { "epoch": 0.19219157302543408, "grad_norm": 1.2477922439575195, "learning_rate": 7.686371535448146e-06, "loss": 0.2886, "step": 13050 }, { "epoch": 0.19255975611552112, "grad_norm": 1.028517246246338, "learning_rate": 7.70109864216076e-06, "loss": 0.2866, "step": 13075 }, { "epoch": 0.19292793920560816, "grad_norm": 0.9460515975952148, "learning_rate": 7.715825748873377e-06, "loss": 0.2791, "step": 13100 }, { "epoch": 0.1932961222956952, "grad_norm": 0.9999874830245972, "learning_rate": 7.730552855585992e-06, "loss": 0.2752, "step": 13125 }, { "epoch": 0.19366430538578225, "grad_norm": 0.9967283606529236, "learning_rate": 7.745279962298608e-06, "loss": 0.2722, "step": 13150 }, { "epoch": 0.1940324884758693, "grad_norm": 0.9238536357879639, "learning_rate": 7.760007069011223e-06, "loss": 0.2767, "step": 13175 }, { "epoch": 0.1944006715659563, "grad_norm": 1.0233261585235596, "learning_rate": 7.774734175723837e-06, "loss": 0.2783, "step": 13200 }, { "epoch": 0.19476885465604335, "grad_norm": 0.9148591756820679, "learning_rate": 7.789461282436454e-06, "loss": 0.2791, "step": 13225 }, { "epoch": 0.1951370377461304, "grad_norm": 1.153035044670105, "learning_rate": 7.804188389149068e-06, "loss": 0.2749, "step": 13250 }, { "epoch": 0.19550522083621744, "grad_norm": 1.205528736114502, "learning_rate": 7.818915495861685e-06, "loss": 0.2766, "step": 13275 }, { "epoch": 0.19587340392630448, "grad_norm": 0.8279467821121216, "learning_rate": 7.833642602574298e-06, "loss": 0.2867, "step": 13300 }, { "epoch": 0.19624158701639152, "grad_norm": 1.032379150390625, "learning_rate": 7.848369709286914e-06, "loss": 0.275, "step": 13325 }, { "epoch": 0.19660977010647854, "grad_norm": 1.5636012554168701, "learning_rate": 7.863096815999529e-06, "loss": 0.2791, "step": 13350 }, { "epoch": 0.19697795319656558, "grad_norm": 0.8588216304779053, "learning_rate": 7.877823922712145e-06, "loss": 0.2852, "step": 13375 }, { "epoch": 0.19734613628665262, "grad_norm": 1.1563206911087036, "learning_rate": 7.89255102942476e-06, "loss": 0.2761, "step": 13400 }, { "epoch": 0.19771431937673967, "grad_norm": 0.9631919264793396, "learning_rate": 7.907278136137374e-06, "loss": 0.2813, "step": 13425 }, { "epoch": 0.1980825024668267, "grad_norm": 0.9245600700378418, "learning_rate": 7.92200524284999e-06, "loss": 0.2779, "step": 13450 }, { "epoch": 0.19845068555691375, "grad_norm": 1.0227192640304565, "learning_rate": 7.936732349562605e-06, "loss": 0.2778, "step": 13475 }, { "epoch": 0.19881886864700077, "grad_norm": 0.9280269145965576, "learning_rate": 7.951459456275222e-06, "loss": 0.2751, "step": 13500 }, { "epoch": 0.1991870517370878, "grad_norm": 0.945601761341095, "learning_rate": 7.966186562987836e-06, "loss": 0.2758, "step": 13525 }, { "epoch": 0.19955523482717485, "grad_norm": 0.8174274563789368, "learning_rate": 7.980913669700451e-06, "loss": 0.2754, "step": 13550 }, { "epoch": 0.1999234179172619, "grad_norm": 0.8976578712463379, "learning_rate": 7.995640776413066e-06, "loss": 0.2741, "step": 13575 }, { "epoch": 0.20029160100734894, "grad_norm": 1.0173979997634888, "learning_rate": 8.010367883125682e-06, "loss": 0.269, "step": 13600 }, { "epoch": 0.20065978409743598, "grad_norm": 0.9142144918441772, "learning_rate": 8.025094989838297e-06, "loss": 0.2748, "step": 13625 }, { "epoch": 0.201027967187523, "grad_norm": 1.0898303985595703, "learning_rate": 8.039822096550913e-06, "loss": 0.2695, "step": 13650 }, { "epoch": 0.20139615027761004, "grad_norm": 0.9205647706985474, "learning_rate": 8.054549203263528e-06, "loss": 0.2717, "step": 13675 }, { "epoch": 0.20176433336769709, "grad_norm": 0.9046348333358765, "learning_rate": 8.069276309976142e-06, "loss": 0.2689, "step": 13700 }, { "epoch": 0.20213251645778413, "grad_norm": 1.0964778661727905, "learning_rate": 8.084003416688759e-06, "loss": 0.2745, "step": 13725 }, { "epoch": 0.20250069954787117, "grad_norm": 0.9805202484130859, "learning_rate": 8.098730523401373e-06, "loss": 0.2739, "step": 13750 }, { "epoch": 0.20286888263795821, "grad_norm": 0.9941619038581848, "learning_rate": 8.11345763011399e-06, "loss": 0.2667, "step": 13775 }, { "epoch": 0.20323706572804523, "grad_norm": 1.1271330118179321, "learning_rate": 8.128184736826604e-06, "loss": 0.2647, "step": 13800 }, { "epoch": 0.20360524881813227, "grad_norm": 0.9569908380508423, "learning_rate": 8.142911843539219e-06, "loss": 0.2674, "step": 13825 }, { "epoch": 0.20397343190821932, "grad_norm": 0.942244827747345, "learning_rate": 8.157638950251834e-06, "loss": 0.268, "step": 13850 }, { "epoch": 0.20434161499830636, "grad_norm": 1.2185558080673218, "learning_rate": 8.17236605696445e-06, "loss": 0.2651, "step": 13875 }, { "epoch": 0.2047097980883934, "grad_norm": 0.8764684200286865, "learning_rate": 8.187093163677065e-06, "loss": 0.2614, "step": 13900 }, { "epoch": 0.20507798117848045, "grad_norm": 1.1464710235595703, "learning_rate": 8.20182027038968e-06, "loss": 0.2749, "step": 13925 }, { "epoch": 0.20544616426856746, "grad_norm": 0.9970076680183411, "learning_rate": 8.216547377102296e-06, "loss": 0.2661, "step": 13950 }, { "epoch": 0.2058143473586545, "grad_norm": 1.0001120567321777, "learning_rate": 8.23127448381491e-06, "loss": 0.262, "step": 13975 }, { "epoch": 0.20618253044874155, "grad_norm": 1.0340194702148438, "learning_rate": 8.246001590527527e-06, "loss": 0.2638, "step": 14000 }, { "epoch": 0.2065507135388286, "grad_norm": 0.8450446128845215, "learning_rate": 8.260139612971636e-06, "loss": 0.2626, "step": 14025 }, { "epoch": 0.20691889662891563, "grad_norm": 0.991098165512085, "learning_rate": 8.274866719684252e-06, "loss": 0.2613, "step": 14050 }, { "epoch": 0.20728707971900268, "grad_norm": 0.9824557900428772, "learning_rate": 8.289004742128363e-06, "loss": 0.2676, "step": 14075 }, { "epoch": 0.2076552628090897, "grad_norm": 0.9877932667732239, "learning_rate": 8.303731848840978e-06, "loss": 0.2686, "step": 14100 }, { "epoch": 0.20802344589917673, "grad_norm": 1.0560307502746582, "learning_rate": 8.318458955553592e-06, "loss": 0.255, "step": 14125 }, { "epoch": 0.20839162898926378, "grad_norm": 0.992840588092804, "learning_rate": 8.333186062266207e-06, "loss": 0.2613, "step": 14150 }, { "epoch": 0.20875981207935082, "grad_norm": 0.8604615926742554, "learning_rate": 8.347913168978823e-06, "loss": 0.2656, "step": 14175 }, { "epoch": 0.20912799516943786, "grad_norm": 1.2327070236206055, "learning_rate": 8.362640275691438e-06, "loss": 0.26, "step": 14200 }, { "epoch": 0.2094961782595249, "grad_norm": 0.932135820388794, "learning_rate": 8.377367382404054e-06, "loss": 0.2601, "step": 14225 }, { "epoch": 0.20986436134961192, "grad_norm": 0.9523219466209412, "learning_rate": 8.392094489116669e-06, "loss": 0.2667, "step": 14250 }, { "epoch": 0.21023254443969897, "grad_norm": 1.3120263814926147, "learning_rate": 8.406821595829284e-06, "loss": 0.26, "step": 14275 }, { "epoch": 0.210600727529786, "grad_norm": 1.3057265281677246, "learning_rate": 8.4215487025419e-06, "loss": 0.258, "step": 14300 }, { "epoch": 0.21096891061987305, "grad_norm": 0.9782459139823914, "learning_rate": 8.436275809254515e-06, "loss": 0.2697, "step": 14325 }, { "epoch": 0.2113370937099601, "grad_norm": 1.1482174396514893, "learning_rate": 8.451002915967131e-06, "loss": 0.2546, "step": 14350 }, { "epoch": 0.21170527680004714, "grad_norm": 1.288827657699585, "learning_rate": 8.465730022679744e-06, "loss": 0.2614, "step": 14375 }, { "epoch": 0.21207345989013415, "grad_norm": 0.9124887585639954, "learning_rate": 8.48045712939236e-06, "loss": 0.2565, "step": 14400 }, { "epoch": 0.2124416429802212, "grad_norm": 1.1914868354797363, "learning_rate": 8.495184236104975e-06, "loss": 0.2567, "step": 14425 }, { "epoch": 0.21280982607030824, "grad_norm": 1.1137144565582275, "learning_rate": 8.509911342817591e-06, "loss": 0.2591, "step": 14450 }, { "epoch": 0.21317800916039528, "grad_norm": 0.965062141418457, "learning_rate": 8.524638449530206e-06, "loss": 0.2517, "step": 14475 }, { "epoch": 0.21354619225048233, "grad_norm": 1.0583897829055786, "learning_rate": 8.53936555624282e-06, "loss": 0.2547, "step": 14500 }, { "epoch": 0.21391437534056937, "grad_norm": 1.2841545343399048, "learning_rate": 8.554092662955437e-06, "loss": 0.2568, "step": 14525 }, { "epoch": 0.21428255843065638, "grad_norm": 1.2085353136062622, "learning_rate": 8.568819769668052e-06, "loss": 0.2583, "step": 14550 }, { "epoch": 0.21465074152074343, "grad_norm": 0.9443012475967407, "learning_rate": 8.583546876380668e-06, "loss": 0.253, "step": 14575 }, { "epoch": 0.21501892461083047, "grad_norm": 1.041521668434143, "learning_rate": 8.598273983093283e-06, "loss": 0.2521, "step": 14600 }, { "epoch": 0.2153871077009175, "grad_norm": 0.9093597531318665, "learning_rate": 8.613001089805897e-06, "loss": 0.2551, "step": 14625 }, { "epoch": 0.21575529079100456, "grad_norm": 1.1667312383651733, "learning_rate": 8.627728196518512e-06, "loss": 0.2583, "step": 14650 }, { "epoch": 0.2161234738810916, "grad_norm": 0.9899322390556335, "learning_rate": 8.642455303231128e-06, "loss": 0.257, "step": 14675 }, { "epoch": 0.21649165697117861, "grad_norm": 0.9446772933006287, "learning_rate": 8.657182409943743e-06, "loss": 0.2501, "step": 14700 }, { "epoch": 0.21685984006126566, "grad_norm": 1.0645805597305298, "learning_rate": 8.671909516656358e-06, "loss": 0.2559, "step": 14725 }, { "epoch": 0.2172280231513527, "grad_norm": 0.973878026008606, "learning_rate": 8.686636623368974e-06, "loss": 0.2505, "step": 14750 }, { "epoch": 0.21759620624143974, "grad_norm": 1.1371912956237793, "learning_rate": 8.701363730081589e-06, "loss": 0.2491, "step": 14775 }, { "epoch": 0.2179643893315268, "grad_norm": 1.156105875968933, "learning_rate": 8.716090836794205e-06, "loss": 0.253, "step": 14800 }, { "epoch": 0.21833257242161383, "grad_norm": 1.585997223854065, "learning_rate": 8.73081794350682e-06, "loss": 0.2458, "step": 14825 }, { "epoch": 0.21870075551170085, "grad_norm": 1.114240288734436, "learning_rate": 8.745545050219436e-06, "loss": 0.2484, "step": 14850 }, { "epoch": 0.2190689386017879, "grad_norm": 0.9762231707572937, "learning_rate": 8.76027215693205e-06, "loss": 0.2544, "step": 14875 }, { "epoch": 0.21943712169187493, "grad_norm": 1.1107852458953857, "learning_rate": 8.774999263644665e-06, "loss": 0.2491, "step": 14900 }, { "epoch": 0.21980530478196197, "grad_norm": 0.7990845441818237, "learning_rate": 8.78972637035728e-06, "loss": 0.2509, "step": 14925 }, { "epoch": 0.22017348787204902, "grad_norm": 1.110044002532959, "learning_rate": 8.804453477069894e-06, "loss": 0.257, "step": 14950 }, { "epoch": 0.22054167096213606, "grad_norm": 1.1343072652816772, "learning_rate": 8.81918058378251e-06, "loss": 0.2537, "step": 14975 }, { "epoch": 0.22090985405222308, "grad_norm": 1.0467344522476196, "learning_rate": 8.833907690495125e-06, "loss": 0.2541, "step": 15000 }, { "epoch": 0.22127803714231012, "grad_norm": 0.992863655090332, "learning_rate": 8.848634797207742e-06, "loss": 0.2457, "step": 15025 }, { "epoch": 0.22164622023239716, "grad_norm": 0.9078419208526611, "learning_rate": 8.863361903920356e-06, "loss": 0.2611, "step": 15050 }, { "epoch": 0.2220144033224842, "grad_norm": 1.1889991760253906, "learning_rate": 8.878089010632973e-06, "loss": 0.2475, "step": 15075 }, { "epoch": 0.22238258641257125, "grad_norm": 1.1085773706436157, "learning_rate": 8.892816117345587e-06, "loss": 0.2517, "step": 15100 }, { "epoch": 0.2227507695026583, "grad_norm": 1.073651909828186, "learning_rate": 8.907543224058202e-06, "loss": 0.2575, "step": 15125 }, { "epoch": 0.2231189525927453, "grad_norm": 1.0204280614852905, "learning_rate": 8.922270330770817e-06, "loss": 0.2504, "step": 15150 }, { "epoch": 0.22348713568283235, "grad_norm": 1.0358093976974487, "learning_rate": 8.936997437483433e-06, "loss": 0.2429, "step": 15175 }, { "epoch": 0.2238553187729194, "grad_norm": 1.191390037536621, "learning_rate": 8.951724544196048e-06, "loss": 0.2471, "step": 15200 }, { "epoch": 0.22422350186300644, "grad_norm": 1.0816316604614258, "learning_rate": 8.966451650908662e-06, "loss": 0.249, "step": 15225 }, { "epoch": 0.22459168495309348, "grad_norm": 0.9376477599143982, "learning_rate": 8.981178757621279e-06, "loss": 0.2486, "step": 15250 }, { "epoch": 0.22495986804318052, "grad_norm": 1.1276350021362305, "learning_rate": 8.995905864333893e-06, "loss": 0.2455, "step": 15275 }, { "epoch": 0.22532805113326757, "grad_norm": 1.0941764116287231, "learning_rate": 9.01063297104651e-06, "loss": 0.2482, "step": 15300 }, { "epoch": 0.22569623422335458, "grad_norm": 1.093008279800415, "learning_rate": 9.025360077759124e-06, "loss": 0.2453, "step": 15325 }, { "epoch": 0.22606441731344162, "grad_norm": 0.9502381086349487, "learning_rate": 9.040087184471739e-06, "loss": 0.2458, "step": 15350 }, { "epoch": 0.22643260040352867, "grad_norm": 1.2488805055618286, "learning_rate": 9.054814291184355e-06, "loss": 0.2417, "step": 15375 }, { "epoch": 0.2268007834936157, "grad_norm": 1.0725972652435303, "learning_rate": 9.06954139789697e-06, "loss": 0.2453, "step": 15400 }, { "epoch": 0.22716896658370275, "grad_norm": 1.4769214391708374, "learning_rate": 9.084268504609585e-06, "loss": 0.2414, "step": 15425 }, { "epoch": 0.2275371496737898, "grad_norm": 1.148898959159851, "learning_rate": 9.0989956113222e-06, "loss": 0.2511, "step": 15450 }, { "epoch": 0.2279053327638768, "grad_norm": 1.0634573698043823, "learning_rate": 9.113722718034816e-06, "loss": 0.2396, "step": 15475 }, { "epoch": 0.22827351585396385, "grad_norm": 1.1080799102783203, "learning_rate": 9.12844982474743e-06, "loss": 0.2565, "step": 15500 }, { "epoch": 0.2286416989440509, "grad_norm": 1.3626021146774292, "learning_rate": 9.143176931460047e-06, "loss": 0.248, "step": 15525 }, { "epoch": 0.22900988203413794, "grad_norm": 1.0013554096221924, "learning_rate": 9.157904038172661e-06, "loss": 0.2393, "step": 15550 }, { "epoch": 0.22937806512422498, "grad_norm": 1.0677698850631714, "learning_rate": 9.172631144885276e-06, "loss": 0.2425, "step": 15575 }, { "epoch": 0.22974624821431203, "grad_norm": 0.9467232823371887, "learning_rate": 9.187358251597892e-06, "loss": 0.2328, "step": 15600 }, { "epoch": 0.23011443130439904, "grad_norm": 1.2118295431137085, "learning_rate": 9.202085358310507e-06, "loss": 0.2442, "step": 15625 }, { "epoch": 0.23048261439448609, "grad_norm": 1.2218703031539917, "learning_rate": 9.216812465023123e-06, "loss": 0.2312, "step": 15650 }, { "epoch": 0.23085079748457313, "grad_norm": 1.2991282939910889, "learning_rate": 9.231539571735736e-06, "loss": 0.2456, "step": 15675 }, { "epoch": 0.23121898057466017, "grad_norm": 1.34254789352417, "learning_rate": 9.246266678448353e-06, "loss": 0.2426, "step": 15700 }, { "epoch": 0.23158716366474721, "grad_norm": 1.237637996673584, "learning_rate": 9.260993785160967e-06, "loss": 0.2461, "step": 15725 }, { "epoch": 0.23195534675483426, "grad_norm": 1.3032089471817017, "learning_rate": 9.275720891873584e-06, "loss": 0.2367, "step": 15750 }, { "epoch": 0.23232352984492127, "grad_norm": 1.2366684675216675, "learning_rate": 9.290447998586198e-06, "loss": 0.2374, "step": 15775 }, { "epoch": 0.23269171293500832, "grad_norm": 1.2915616035461426, "learning_rate": 9.305175105298813e-06, "loss": 0.2401, "step": 15800 }, { "epoch": 0.23305989602509536, "grad_norm": 1.1223536729812622, "learning_rate": 9.31990221201143e-06, "loss": 0.2409, "step": 15825 }, { "epoch": 0.2334280791151824, "grad_norm": 1.1354379653930664, "learning_rate": 9.334629318724044e-06, "loss": 0.2441, "step": 15850 }, { "epoch": 0.23379626220526945, "grad_norm": 1.1635915040969849, "learning_rate": 9.34935642543666e-06, "loss": 0.2401, "step": 15875 }, { "epoch": 0.2341644452953565, "grad_norm": 1.0397142171859741, "learning_rate": 9.364083532149275e-06, "loss": 0.2406, "step": 15900 }, { "epoch": 0.2345326283854435, "grad_norm": 1.171473741531372, "learning_rate": 9.37881063886189e-06, "loss": 0.2399, "step": 15925 }, { "epoch": 0.23490081147553055, "grad_norm": 1.1377612352371216, "learning_rate": 9.393537745574504e-06, "loss": 0.24, "step": 15950 }, { "epoch": 0.2352689945656176, "grad_norm": 1.2155762910842896, "learning_rate": 9.40826485228712e-06, "loss": 0.2397, "step": 15975 }, { "epoch": 0.23563717765570463, "grad_norm": 1.1246799230575562, "learning_rate": 9.422991958999735e-06, "loss": 0.2472, "step": 16000 }, { "epoch": 0.23600536074579168, "grad_norm": 1.2643800973892212, "learning_rate": 9.43771906571235e-06, "loss": 0.2445, "step": 16025 }, { "epoch": 0.23637354383587872, "grad_norm": 0.971194863319397, "learning_rate": 9.452446172424966e-06, "loss": 0.2441, "step": 16050 }, { "epoch": 0.23674172692596573, "grad_norm": 1.0318214893341064, "learning_rate": 9.467173279137581e-06, "loss": 0.2379, "step": 16075 }, { "epoch": 0.23710991001605278, "grad_norm": 0.9700537919998169, "learning_rate": 9.481900385850197e-06, "loss": 0.2447, "step": 16100 }, { "epoch": 0.23747809310613982, "grad_norm": 1.1324304342269897, "learning_rate": 9.496627492562812e-06, "loss": 0.233, "step": 16125 }, { "epoch": 0.23784627619622686, "grad_norm": 1.808837652206421, "learning_rate": 9.511354599275428e-06, "loss": 0.2326, "step": 16150 }, { "epoch": 0.2382144592863139, "grad_norm": 1.1952983140945435, "learning_rate": 9.526081705988043e-06, "loss": 0.238, "step": 16175 }, { "epoch": 0.23858264237640095, "grad_norm": 1.0128469467163086, "learning_rate": 9.540808812700658e-06, "loss": 0.2368, "step": 16200 }, { "epoch": 0.23895082546648796, "grad_norm": 1.2425153255462646, "learning_rate": 9.555535919413272e-06, "loss": 0.2339, "step": 16225 }, { "epoch": 0.239319008556575, "grad_norm": 1.3166993856430054, "learning_rate": 9.570263026125889e-06, "loss": 0.2289, "step": 16250 }, { "epoch": 0.23968719164666205, "grad_norm": 1.0847030878067017, "learning_rate": 9.584990132838503e-06, "loss": 0.2375, "step": 16275 }, { "epoch": 0.2400553747367491, "grad_norm": 1.2066354751586914, "learning_rate": 9.599717239551118e-06, "loss": 0.2348, "step": 16300 }, { "epoch": 0.24042355782683614, "grad_norm": 1.2283575534820557, "learning_rate": 9.614444346263734e-06, "loss": 0.2279, "step": 16325 }, { "epoch": 0.24079174091692318, "grad_norm": 1.2124601602554321, "learning_rate": 9.629171452976349e-06, "loss": 0.2387, "step": 16350 }, { "epoch": 0.2411599240070102, "grad_norm": 1.1417065858840942, "learning_rate": 9.643898559688965e-06, "loss": 0.2366, "step": 16375 }, { "epoch": 0.24152810709709724, "grad_norm": 1.068881630897522, "learning_rate": 9.65862566640158e-06, "loss": 0.2297, "step": 16400 }, { "epoch": 0.24189629018718428, "grad_norm": 0.9088355302810669, "learning_rate": 9.673352773114195e-06, "loss": 0.2348, "step": 16425 }, { "epoch": 0.24226447327727132, "grad_norm": 1.6773308515548706, "learning_rate": 9.688079879826811e-06, "loss": 0.2396, "step": 16450 }, { "epoch": 0.24263265636735837, "grad_norm": 1.2640100717544556, "learning_rate": 9.702806986539426e-06, "loss": 0.2298, "step": 16475 }, { "epoch": 0.2430008394574454, "grad_norm": 1.1822272539138794, "learning_rate": 9.71753409325204e-06, "loss": 0.231, "step": 16500 }, { "epoch": 0.24336902254753243, "grad_norm": 1.2628257274627686, "learning_rate": 9.732261199964655e-06, "loss": 0.2366, "step": 16525 }, { "epoch": 0.24373720563761947, "grad_norm": 1.206954002380371, "learning_rate": 9.746988306677271e-06, "loss": 0.2287, "step": 16550 }, { "epoch": 0.2441053887277065, "grad_norm": 1.3411674499511719, "learning_rate": 9.761126329121382e-06, "loss": 0.2417, "step": 16575 }, { "epoch": 0.24447357181779356, "grad_norm": 1.1334586143493652, "learning_rate": 9.775853435833997e-06, "loss": 0.224, "step": 16600 }, { "epoch": 0.2448417549078806, "grad_norm": 1.0482016801834106, "learning_rate": 9.790580542546611e-06, "loss": 0.2342, "step": 16625 }, { "epoch": 0.24520993799796764, "grad_norm": 0.9900887608528137, "learning_rate": 9.805307649259228e-06, "loss": 0.2396, "step": 16650 }, { "epoch": 0.24557812108805466, "grad_norm": 1.1004738807678223, "learning_rate": 9.820034755971842e-06, "loss": 0.2219, "step": 16675 }, { "epoch": 0.2459463041781417, "grad_norm": 1.236008644104004, "learning_rate": 9.834761862684457e-06, "loss": 0.2248, "step": 16700 }, { "epoch": 0.24631448726822874, "grad_norm": 1.283540964126587, "learning_rate": 9.849488969397073e-06, "loss": 0.2269, "step": 16725 }, { "epoch": 0.2466826703583158, "grad_norm": 1.0043452978134155, "learning_rate": 9.864216076109688e-06, "loss": 0.232, "step": 16750 }, { "epoch": 0.24705085344840283, "grad_norm": 1.3650057315826416, "learning_rate": 9.878943182822304e-06, "loss": 0.2273, "step": 16775 }, { "epoch": 0.24741903653848987, "grad_norm": 0.9510201811790466, "learning_rate": 9.893670289534919e-06, "loss": 0.226, "step": 16800 }, { "epoch": 0.2477872196285769, "grad_norm": 1.5678138732910156, "learning_rate": 9.908397396247534e-06, "loss": 0.224, "step": 16825 }, { "epoch": 0.24815540271866393, "grad_norm": 1.2678954601287842, "learning_rate": 9.92312450296015e-06, "loss": 0.2114, "step": 16850 }, { "epoch": 0.24852358580875097, "grad_norm": 1.2294344902038574, "learning_rate": 9.937851609672765e-06, "loss": 0.2325, "step": 16875 }, { "epoch": 0.24889176889883802, "grad_norm": 1.084308385848999, "learning_rate": 9.95257871638538e-06, "loss": 0.2284, "step": 16900 }, { "epoch": 0.24925995198892506, "grad_norm": 1.3282686471939087, "learning_rate": 9.967305823097994e-06, "loss": 0.2193, "step": 16925 }, { "epoch": 0.2496281350790121, "grad_norm": 1.0947468280792236, "learning_rate": 9.98203292981061e-06, "loss": 0.2252, "step": 16950 }, { "epoch": 0.24999631816909912, "grad_norm": 1.2787188291549683, "learning_rate": 9.996760036523225e-06, "loss": 0.2271, "step": 16975 }, { "epoch": 0.2503645012591862, "grad_norm": 1.179134726524353, "learning_rate": 1.0011487143235841e-05, "loss": 0.2223, "step": 17000 }, { "epoch": 0.2507326843492732, "grad_norm": 1.1341197490692139, "learning_rate": 1.0026214249948456e-05, "loss": 0.22, "step": 17025 }, { "epoch": 0.2511008674393602, "grad_norm": 0.9111505150794983, "learning_rate": 1.004094135666107e-05, "loss": 0.2275, "step": 17050 }, { "epoch": 0.2514690505294473, "grad_norm": 1.2634352445602417, "learning_rate": 1.0055668463373687e-05, "loss": 0.2317, "step": 17075 }, { "epoch": 0.2518372336195343, "grad_norm": 1.4474118947982788, "learning_rate": 1.0070395570086302e-05, "loss": 0.2208, "step": 17100 }, { "epoch": 0.2522054167096214, "grad_norm": 1.125046968460083, "learning_rate": 1.0085122676798918e-05, "loss": 0.2181, "step": 17125 }, { "epoch": 0.2525735997997084, "grad_norm": 1.0893241167068481, "learning_rate": 1.0099849783511533e-05, "loss": 0.2251, "step": 17150 }, { "epoch": 0.25294178288979546, "grad_norm": 1.244471549987793, "learning_rate": 1.0114576890224147e-05, "loss": 0.2225, "step": 17175 }, { "epoch": 0.2533099659798825, "grad_norm": 1.1279207468032837, "learning_rate": 1.0129303996936764e-05, "loss": 0.2277, "step": 17200 }, { "epoch": 0.2536781490699695, "grad_norm": 1.1421986818313599, "learning_rate": 1.0144031103649378e-05, "loss": 0.2207, "step": 17225 }, { "epoch": 0.25404633216005656, "grad_norm": 1.43101966381073, "learning_rate": 1.0158758210361995e-05, "loss": 0.2279, "step": 17250 }, { "epoch": 0.2544145152501436, "grad_norm": 1.2407532930374146, "learning_rate": 1.0173485317074608e-05, "loss": 0.2271, "step": 17275 }, { "epoch": 0.25478269834023065, "grad_norm": 1.3361932039260864, "learning_rate": 1.0188212423787222e-05, "loss": 0.2239, "step": 17300 }, { "epoch": 0.25515088143031767, "grad_norm": 1.0362130403518677, "learning_rate": 1.0202939530499839e-05, "loss": 0.2252, "step": 17325 }, { "epoch": 0.2555190645204047, "grad_norm": 0.9963209629058838, "learning_rate": 1.0217666637212453e-05, "loss": 0.221, "step": 17350 }, { "epoch": 0.25588724761049175, "grad_norm": 0.906834065914154, "learning_rate": 1.0232393743925068e-05, "loss": 0.2217, "step": 17375 }, { "epoch": 0.25625543070057877, "grad_norm": 1.3475807905197144, "learning_rate": 1.0247120850637684e-05, "loss": 0.2226, "step": 17400 }, { "epoch": 0.25662361379066584, "grad_norm": 1.1310131549835205, "learning_rate": 1.0261847957350299e-05, "loss": 0.2259, "step": 17425 }, { "epoch": 0.25699179688075285, "grad_norm": 1.1497154235839844, "learning_rate": 1.0276575064062915e-05, "loss": 0.2163, "step": 17450 }, { "epoch": 0.2573599799708399, "grad_norm": 1.5212485790252686, "learning_rate": 1.029130217077553e-05, "loss": 0.2284, "step": 17475 }, { "epoch": 0.25772816306092694, "grad_norm": 1.4213180541992188, "learning_rate": 1.0306029277488146e-05, "loss": 0.2152, "step": 17500 }, { "epoch": 0.25809634615101396, "grad_norm": 1.1019536256790161, "learning_rate": 1.0320756384200761e-05, "loss": 0.2236, "step": 17525 }, { "epoch": 0.258464529241101, "grad_norm": 1.2373889684677124, "learning_rate": 1.0335483490913376e-05, "loss": 0.2205, "step": 17550 }, { "epoch": 0.25883271233118804, "grad_norm": 1.1303311586380005, "learning_rate": 1.0350210597625992e-05, "loss": 0.2163, "step": 17575 }, { "epoch": 0.2592008954212751, "grad_norm": 1.8989242315292358, "learning_rate": 1.0364937704338607e-05, "loss": 0.2155, "step": 17600 }, { "epoch": 0.25956907851136213, "grad_norm": 1.081231951713562, "learning_rate": 1.0379664811051223e-05, "loss": 0.2182, "step": 17625 }, { "epoch": 0.25993726160144914, "grad_norm": 1.2433748245239258, "learning_rate": 1.0394391917763838e-05, "loss": 0.2191, "step": 17650 }, { "epoch": 0.2603054446915362, "grad_norm": 1.1952991485595703, "learning_rate": 1.0409119024476452e-05, "loss": 0.2188, "step": 17675 }, { "epoch": 0.26067362778162323, "grad_norm": 1.2529122829437256, "learning_rate": 1.0423846131189069e-05, "loss": 0.2223, "step": 17700 }, { "epoch": 0.2610418108717103, "grad_norm": 1.153318166732788, "learning_rate": 1.0438573237901683e-05, "loss": 0.214, "step": 17725 }, { "epoch": 0.2614099939617973, "grad_norm": 1.440818190574646, "learning_rate": 1.04533003446143e-05, "loss": 0.2119, "step": 17750 }, { "epoch": 0.2617781770518844, "grad_norm": 1.3393558263778687, "learning_rate": 1.0468027451326914e-05, "loss": 0.2198, "step": 17775 }, { "epoch": 0.2621463601419714, "grad_norm": 1.0429470539093018, "learning_rate": 1.0482754558039527e-05, "loss": 0.2162, "step": 17800 }, { "epoch": 0.2625145432320584, "grad_norm": 1.112441062927246, "learning_rate": 1.0497481664752144e-05, "loss": 0.2195, "step": 17825 }, { "epoch": 0.2628827263221455, "grad_norm": 1.2542566061019897, "learning_rate": 1.0512208771464758e-05, "loss": 0.2164, "step": 17850 }, { "epoch": 0.2632509094122325, "grad_norm": 1.3128899335861206, "learning_rate": 1.0526935878177373e-05, "loss": 0.2223, "step": 17875 }, { "epoch": 0.2636190925023196, "grad_norm": 1.1940679550170898, "learning_rate": 1.054166298488999e-05, "loss": 0.2173, "step": 17900 }, { "epoch": 0.2639872755924066, "grad_norm": 1.258396029472351, "learning_rate": 1.0556390091602604e-05, "loss": 0.2074, "step": 17925 }, { "epoch": 0.2643554586824936, "grad_norm": 1.1842387914657593, "learning_rate": 1.057111719831522e-05, "loss": 0.2157, "step": 17950 }, { "epoch": 0.2647236417725807, "grad_norm": 1.0509520769119263, "learning_rate": 1.0585844305027835e-05, "loss": 0.2129, "step": 17975 }, { "epoch": 0.2650918248626677, "grad_norm": 1.2387256622314453, "learning_rate": 1.060057141174045e-05, "loss": 0.2239, "step": 18000 }, { "epoch": 0.26546000795275476, "grad_norm": 0.9733960628509521, "learning_rate": 1.0615298518453066e-05, "loss": 0.2179, "step": 18025 }, { "epoch": 0.2658281910428418, "grad_norm": 1.2037309408187866, "learning_rate": 1.063002562516568e-05, "loss": 0.215, "step": 18050 }, { "epoch": 0.26619637413292885, "grad_norm": 1.305167555809021, "learning_rate": 1.0644752731878297e-05, "loss": 0.2156, "step": 18075 }, { "epoch": 0.26656455722301586, "grad_norm": 1.1345758438110352, "learning_rate": 1.0659479838590911e-05, "loss": 0.2119, "step": 18100 }, { "epoch": 0.2669327403131029, "grad_norm": 1.0518168210983276, "learning_rate": 1.0674206945303526e-05, "loss": 0.2077, "step": 18125 }, { "epoch": 0.26730092340318995, "grad_norm": 1.3454447984695435, "learning_rate": 1.0688934052016142e-05, "loss": 0.2117, "step": 18150 }, { "epoch": 0.26766910649327696, "grad_norm": 1.540737509727478, "learning_rate": 1.0703661158728757e-05, "loss": 0.2098, "step": 18175 }, { "epoch": 0.26803728958336404, "grad_norm": 1.0186264514923096, "learning_rate": 1.0718388265441373e-05, "loss": 0.2106, "step": 18200 }, { "epoch": 0.26840547267345105, "grad_norm": 1.2026820182800293, "learning_rate": 1.0733115372153988e-05, "loss": 0.206, "step": 18225 }, { "epoch": 0.26877365576353807, "grad_norm": 0.9871214628219604, "learning_rate": 1.0747842478866603e-05, "loss": 0.2063, "step": 18250 }, { "epoch": 0.26914183885362514, "grad_norm": 1.1501942873001099, "learning_rate": 1.0762569585579219e-05, "loss": 0.2125, "step": 18275 }, { "epoch": 0.26951002194371215, "grad_norm": 1.422052025794983, "learning_rate": 1.0777296692291834e-05, "loss": 0.2144, "step": 18300 }, { "epoch": 0.2698782050337992, "grad_norm": 1.3499372005462646, "learning_rate": 1.0792023799004447e-05, "loss": 0.2091, "step": 18325 }, { "epoch": 0.27024638812388624, "grad_norm": 1.1054781675338745, "learning_rate": 1.0806750905717063e-05, "loss": 0.2116, "step": 18350 }, { "epoch": 0.2706145712139733, "grad_norm": 1.160321831703186, "learning_rate": 1.0821478012429678e-05, "loss": 0.2091, "step": 18375 }, { "epoch": 0.2709827543040603, "grad_norm": 0.9781479835510254, "learning_rate": 1.0836205119142294e-05, "loss": 0.2124, "step": 18400 }, { "epoch": 0.27135093739414734, "grad_norm": 1.2315099239349365, "learning_rate": 1.0850932225854909e-05, "loss": 0.212, "step": 18425 }, { "epoch": 0.2717191204842344, "grad_norm": 1.3107922077178955, "learning_rate": 1.0865659332567523e-05, "loss": 0.2042, "step": 18450 }, { "epoch": 0.2720873035743214, "grad_norm": 1.2262879610061646, "learning_rate": 1.088038643928014e-05, "loss": 0.2135, "step": 18475 }, { "epoch": 0.2724554866644085, "grad_norm": 1.409114956855774, "learning_rate": 1.0895113545992754e-05, "loss": 0.2138, "step": 18500 }, { "epoch": 0.2728236697544955, "grad_norm": 1.0485094785690308, "learning_rate": 1.090984065270537e-05, "loss": 0.2039, "step": 18525 }, { "epoch": 0.27319185284458253, "grad_norm": 1.374588131904602, "learning_rate": 1.0924567759417985e-05, "loss": 0.204, "step": 18550 }, { "epoch": 0.2735600359346696, "grad_norm": 0.9809952974319458, "learning_rate": 1.0939294866130602e-05, "loss": 0.2088, "step": 18575 }, { "epoch": 0.2739282190247566, "grad_norm": 1.3115181922912598, "learning_rate": 1.0954021972843216e-05, "loss": 0.1981, "step": 18600 }, { "epoch": 0.2742964021148437, "grad_norm": 1.44467031955719, "learning_rate": 1.0968749079555831e-05, "loss": 0.2063, "step": 18625 }, { "epoch": 0.2746645852049307, "grad_norm": 0.9407679438591003, "learning_rate": 1.0982887101999942e-05, "loss": 0.2032, "step": 18650 }, { "epoch": 0.27503276829501777, "grad_norm": 1.440572738647461, "learning_rate": 1.0997614208712557e-05, "loss": 0.2056, "step": 18675 }, { "epoch": 0.2754009513851048, "grad_norm": 1.6290857791900635, "learning_rate": 1.1012341315425173e-05, "loss": 0.2078, "step": 18700 }, { "epoch": 0.2757691344751918, "grad_norm": 1.3203442096710205, "learning_rate": 1.1027068422137788e-05, "loss": 0.2107, "step": 18725 }, { "epoch": 0.2761373175652789, "grad_norm": 1.2521476745605469, "learning_rate": 1.1041795528850404e-05, "loss": 0.207, "step": 18750 }, { "epoch": 0.2765055006553659, "grad_norm": 1.4565852880477905, "learning_rate": 1.1056522635563019e-05, "loss": 0.2045, "step": 18775 }, { "epoch": 0.27687368374545296, "grad_norm": 1.0191683769226074, "learning_rate": 1.1071249742275633e-05, "loss": 0.2008, "step": 18800 }, { "epoch": 0.27724186683554, "grad_norm": 1.4495227336883545, "learning_rate": 1.108597684898825e-05, "loss": 0.2013, "step": 18825 }, { "epoch": 0.277610049925627, "grad_norm": 1.3123514652252197, "learning_rate": 1.1100703955700864e-05, "loss": 0.2046, "step": 18850 }, { "epoch": 0.27797823301571406, "grad_norm": 1.053816318511963, "learning_rate": 1.111543106241348e-05, "loss": 0.2096, "step": 18875 }, { "epoch": 0.2783464161058011, "grad_norm": 0.9499918222427368, "learning_rate": 1.1130158169126095e-05, "loss": 0.1954, "step": 18900 }, { "epoch": 0.27871459919588815, "grad_norm": 1.004050850868225, "learning_rate": 1.114488527583871e-05, "loss": 0.2021, "step": 18925 }, { "epoch": 0.27908278228597516, "grad_norm": 1.1039396524429321, "learning_rate": 1.1159612382551326e-05, "loss": 0.2081, "step": 18950 }, { "epoch": 0.27945096537606223, "grad_norm": 1.31987726688385, "learning_rate": 1.1174339489263941e-05, "loss": 0.1875, "step": 18975 }, { "epoch": 0.27981914846614925, "grad_norm": 1.3621447086334229, "learning_rate": 1.1189066595976554e-05, "loss": 0.2082, "step": 19000 }, { "epoch": 0.28018733155623626, "grad_norm": 1.6277425289154053, "learning_rate": 1.120379370268917e-05, "loss": 0.207, "step": 19025 }, { "epoch": 0.28055551464632333, "grad_norm": 1.0901118516921997, "learning_rate": 1.1218520809401785e-05, "loss": 0.2028, "step": 19050 }, { "epoch": 0.28092369773641035, "grad_norm": 1.1058247089385986, "learning_rate": 1.1233247916114401e-05, "loss": 0.2122, "step": 19075 }, { "epoch": 0.2812918808264974, "grad_norm": 1.084421157836914, "learning_rate": 1.1247975022827016e-05, "loss": 0.2068, "step": 19100 }, { "epoch": 0.28166006391658444, "grad_norm": 1.2624268531799316, "learning_rate": 1.126270212953963e-05, "loss": 0.1984, "step": 19125 }, { "epoch": 0.28202824700667145, "grad_norm": 1.3556143045425415, "learning_rate": 1.1277429236252247e-05, "loss": 0.2064, "step": 19150 }, { "epoch": 0.2823964300967585, "grad_norm": 1.1427866220474243, "learning_rate": 1.1292156342964861e-05, "loss": 0.2007, "step": 19175 }, { "epoch": 0.28276461318684554, "grad_norm": 1.3310309648513794, "learning_rate": 1.1306883449677478e-05, "loss": 0.2041, "step": 19200 }, { "epoch": 0.2831327962769326, "grad_norm": 1.4609546661376953, "learning_rate": 1.1321610556390092e-05, "loss": 0.2063, "step": 19225 }, { "epoch": 0.2835009793670196, "grad_norm": 1.464328408241272, "learning_rate": 1.1336337663102707e-05, "loss": 0.2053, "step": 19250 }, { "epoch": 0.2838691624571067, "grad_norm": 1.2487759590148926, "learning_rate": 1.1351064769815323e-05, "loss": 0.2109, "step": 19275 }, { "epoch": 0.2842373455471937, "grad_norm": 1.0267369747161865, "learning_rate": 1.1365791876527938e-05, "loss": 0.1941, "step": 19300 }, { "epoch": 0.2846055286372807, "grad_norm": 1.5034706592559814, "learning_rate": 1.1380518983240554e-05, "loss": 0.2062, "step": 19325 }, { "epoch": 0.2849737117273678, "grad_norm": 1.3053853511810303, "learning_rate": 1.1395246089953169e-05, "loss": 0.2088, "step": 19350 }, { "epoch": 0.2853418948174548, "grad_norm": 1.1365464925765991, "learning_rate": 1.1409973196665784e-05, "loss": 0.2091, "step": 19375 }, { "epoch": 0.2857100779075419, "grad_norm": 1.1613692045211792, "learning_rate": 1.14247003033784e-05, "loss": 0.2008, "step": 19400 }, { "epoch": 0.2860782609976289, "grad_norm": 1.2385408878326416, "learning_rate": 1.1439427410091015e-05, "loss": 0.2009, "step": 19425 }, { "epoch": 0.2864464440877159, "grad_norm": 1.4164599180221558, "learning_rate": 1.1454154516803631e-05, "loss": 0.196, "step": 19450 }, { "epoch": 0.286814627177803, "grad_norm": 1.0749568939208984, "learning_rate": 1.1468881623516246e-05, "loss": 0.2048, "step": 19475 }, { "epoch": 0.28718281026789, "grad_norm": 1.2188838720321655, "learning_rate": 1.148360873022886e-05, "loss": 0.2047, "step": 19500 }, { "epoch": 0.28755099335797707, "grad_norm": 1.2900882959365845, "learning_rate": 1.1498335836941475e-05, "loss": 0.2046, "step": 19525 }, { "epoch": 0.2879191764480641, "grad_norm": 1.3515424728393555, "learning_rate": 1.151306294365409e-05, "loss": 0.2013, "step": 19550 }, { "epoch": 0.28828735953815116, "grad_norm": 1.1831262111663818, "learning_rate": 1.1527790050366704e-05, "loss": 0.196, "step": 19575 }, { "epoch": 0.28865554262823817, "grad_norm": 1.2154779434204102, "learning_rate": 1.154251715707932e-05, "loss": 0.2001, "step": 19600 }, { "epoch": 0.2890237257183252, "grad_norm": 1.0575876235961914, "learning_rate": 1.1557244263791935e-05, "loss": 0.198, "step": 19625 }, { "epoch": 0.28939190880841226, "grad_norm": 1.2004785537719727, "learning_rate": 1.1571971370504552e-05, "loss": 0.1928, "step": 19650 }, { "epoch": 0.2897600918984993, "grad_norm": 1.2369384765625, "learning_rate": 1.1586698477217166e-05, "loss": 0.1973, "step": 19675 }, { "epoch": 0.29012827498858634, "grad_norm": 1.194305419921875, "learning_rate": 1.1601425583929781e-05, "loss": 0.1994, "step": 19700 }, { "epoch": 0.29049645807867336, "grad_norm": 1.2108451128005981, "learning_rate": 1.1616152690642397e-05, "loss": 0.1964, "step": 19725 }, { "epoch": 0.2908646411687604, "grad_norm": 1.3389203548431396, "learning_rate": 1.1630879797355012e-05, "loss": 0.2059, "step": 19750 }, { "epoch": 0.29123282425884744, "grad_norm": 1.416451334953308, "learning_rate": 1.1645606904067628e-05, "loss": 0.2053, "step": 19775 }, { "epoch": 0.29160100734893446, "grad_norm": 1.76418137550354, "learning_rate": 1.1660334010780243e-05, "loss": 0.1958, "step": 19800 }, { "epoch": 0.29196919043902153, "grad_norm": 1.3602815866470337, "learning_rate": 1.167506111749286e-05, "loss": 0.2086, "step": 19825 }, { "epoch": 0.29233737352910855, "grad_norm": 1.4478594064712524, "learning_rate": 1.1689788224205474e-05, "loss": 0.1931, "step": 19850 }, { "epoch": 0.2927055566191956, "grad_norm": 1.269147276878357, "learning_rate": 1.1704515330918089e-05, "loss": 0.1954, "step": 19875 }, { "epoch": 0.29307373970928263, "grad_norm": 1.711710810661316, "learning_rate": 1.1719242437630705e-05, "loss": 0.1935, "step": 19900 }, { "epoch": 0.29344192279936965, "grad_norm": 1.4398845434188843, "learning_rate": 1.173396954434332e-05, "loss": 0.1955, "step": 19925 }, { "epoch": 0.2938101058894567, "grad_norm": 1.256732702255249, "learning_rate": 1.1748696651055936e-05, "loss": 0.1964, "step": 19950 }, { "epoch": 0.29417828897954373, "grad_norm": 1.09647798538208, "learning_rate": 1.176342375776855e-05, "loss": 0.1977, "step": 19975 }, { "epoch": 0.2945464720696308, "grad_norm": 1.3230574131011963, "learning_rate": 1.1778150864481165e-05, "loss": 0.1933, "step": 20000 }, { "epoch": 0.2949146551597178, "grad_norm": 1.3102632761001587, "learning_rate": 1.1792877971193782e-05, "loss": 0.1913, "step": 20025 }, { "epoch": 0.29528283824980484, "grad_norm": 1.1603052616119385, "learning_rate": 1.1807605077906395e-05, "loss": 0.1923, "step": 20050 }, { "epoch": 0.2956510213398919, "grad_norm": 1.3793220520019531, "learning_rate": 1.182233218461901e-05, "loss": 0.193, "step": 20075 }, { "epoch": 0.2960192044299789, "grad_norm": 1.0161463022232056, "learning_rate": 1.1837059291331626e-05, "loss": 0.2002, "step": 20100 }, { "epoch": 0.296387387520066, "grad_norm": 1.4899036884307861, "learning_rate": 1.185178639804424e-05, "loss": 0.2016, "step": 20125 }, { "epoch": 0.296755570610153, "grad_norm": 1.4574910402297974, "learning_rate": 1.1866513504756857e-05, "loss": 0.197, "step": 20150 }, { "epoch": 0.2971237537002401, "grad_norm": 1.5206778049468994, "learning_rate": 1.1881240611469471e-05, "loss": 0.1973, "step": 20175 }, { "epoch": 0.2974919367903271, "grad_norm": 1.0904589891433716, "learning_rate": 1.1895967718182086e-05, "loss": 0.1965, "step": 20200 }, { "epoch": 0.2978601198804141, "grad_norm": 1.367783546447754, "learning_rate": 1.1910694824894702e-05, "loss": 0.2014, "step": 20225 }, { "epoch": 0.2982283029705012, "grad_norm": 1.2343593835830688, "learning_rate": 1.1925421931607317e-05, "loss": 0.1982, "step": 20250 }, { "epoch": 0.2985964860605882, "grad_norm": 1.2906816005706787, "learning_rate": 1.1940149038319933e-05, "loss": 0.2008, "step": 20275 }, { "epoch": 0.29896466915067527, "grad_norm": 1.178275227546692, "learning_rate": 1.1954876145032548e-05, "loss": 0.1955, "step": 20300 }, { "epoch": 0.2993328522407623, "grad_norm": 1.4055285453796387, "learning_rate": 1.1969603251745163e-05, "loss": 0.2023, "step": 20325 }, { "epoch": 0.29970103533084935, "grad_norm": 1.0455254316329956, "learning_rate": 1.1984330358457779e-05, "loss": 0.1942, "step": 20350 }, { "epoch": 0.30006921842093637, "grad_norm": 1.0922812223434448, "learning_rate": 1.1999057465170394e-05, "loss": 0.1988, "step": 20375 }, { "epoch": 0.3004374015110234, "grad_norm": 0.9248048067092896, "learning_rate": 1.201378457188301e-05, "loss": 0.1909, "step": 20400 }, { "epoch": 0.30080558460111045, "grad_norm": 1.0108305215835571, "learning_rate": 1.2028511678595625e-05, "loss": 0.1946, "step": 20425 }, { "epoch": 0.30117376769119747, "grad_norm": 1.4391227960586548, "learning_rate": 1.204323878530824e-05, "loss": 0.1943, "step": 20450 }, { "epoch": 0.30154195078128454, "grad_norm": 1.42784583568573, "learning_rate": 1.2057965892020856e-05, "loss": 0.1824, "step": 20475 }, { "epoch": 0.30191013387137156, "grad_norm": 1.3134140968322754, "learning_rate": 1.207269299873347e-05, "loss": 0.197, "step": 20500 }, { "epoch": 0.30227831696145857, "grad_norm": 1.3688997030258179, "learning_rate": 1.2087420105446087e-05, "loss": 0.1912, "step": 20525 }, { "epoch": 0.30264650005154564, "grad_norm": 1.3083784580230713, "learning_rate": 1.2102147212158701e-05, "loss": 0.1928, "step": 20550 }, { "epoch": 0.30301468314163266, "grad_norm": 1.4008431434631348, "learning_rate": 1.2116874318871314e-05, "loss": 0.1973, "step": 20575 }, { "epoch": 0.30338286623171973, "grad_norm": 1.101962685585022, "learning_rate": 1.213160142558393e-05, "loss": 0.1925, "step": 20600 }, { "epoch": 0.30375104932180674, "grad_norm": 1.5726611614227295, "learning_rate": 1.2146328532296545e-05, "loss": 0.194, "step": 20625 }, { "epoch": 0.3041192324118938, "grad_norm": 1.2495619058609009, "learning_rate": 1.216105563900916e-05, "loss": 0.1933, "step": 20650 }, { "epoch": 0.30448741550198083, "grad_norm": 1.1473134756088257, "learning_rate": 1.2175782745721776e-05, "loss": 0.1854, "step": 20675 }, { "epoch": 0.30485559859206784, "grad_norm": 1.277658462524414, "learning_rate": 1.2190509852434391e-05, "loss": 0.188, "step": 20700 }, { "epoch": 0.3052237816821549, "grad_norm": 1.6464298963546753, "learning_rate": 1.2205236959147007e-05, "loss": 0.1902, "step": 20725 }, { "epoch": 0.30559196477224193, "grad_norm": 1.2113037109375, "learning_rate": 1.2219374981591116e-05, "loss": 0.1941, "step": 20750 }, { "epoch": 0.305960147862329, "grad_norm": 0.9931922554969788, "learning_rate": 1.2234102088303733e-05, "loss": 0.1877, "step": 20775 }, { "epoch": 0.306328330952416, "grad_norm": 1.262056827545166, "learning_rate": 1.2248829195016347e-05, "loss": 0.1939, "step": 20800 }, { "epoch": 0.30669651404250303, "grad_norm": 1.4179117679595947, "learning_rate": 1.2263556301728962e-05, "loss": 0.1945, "step": 20825 }, { "epoch": 0.3070646971325901, "grad_norm": 1.4320852756500244, "learning_rate": 1.2278283408441578e-05, "loss": 0.194, "step": 20850 }, { "epoch": 0.3074328802226771, "grad_norm": 1.748063325881958, "learning_rate": 1.2293010515154193e-05, "loss": 0.192, "step": 20875 }, { "epoch": 0.3078010633127642, "grad_norm": 1.6274995803833008, "learning_rate": 1.230773762186681e-05, "loss": 0.1889, "step": 20900 }, { "epoch": 0.3081692464028512, "grad_norm": 1.4107574224472046, "learning_rate": 1.2322464728579424e-05, "loss": 0.201, "step": 20925 }, { "epoch": 0.3085374294929383, "grad_norm": 1.573684811592102, "learning_rate": 1.2337191835292039e-05, "loss": 0.1819, "step": 20950 }, { "epoch": 0.3089056125830253, "grad_norm": 1.3869659900665283, "learning_rate": 1.2351918942004655e-05, "loss": 0.1945, "step": 20975 }, { "epoch": 0.3092737956731123, "grad_norm": 1.1741231679916382, "learning_rate": 1.236664604871727e-05, "loss": 0.1852, "step": 21000 }, { "epoch": 0.3096419787631994, "grad_norm": 1.3392157554626465, "learning_rate": 1.2381373155429886e-05, "loss": 0.1851, "step": 21025 }, { "epoch": 0.3100101618532864, "grad_norm": 1.2029756307601929, "learning_rate": 1.23961002621425e-05, "loss": 0.1836, "step": 21050 }, { "epoch": 0.31037834494337346, "grad_norm": 1.9749929904937744, "learning_rate": 1.2410827368855117e-05, "loss": 0.1973, "step": 21075 }, { "epoch": 0.3107465280334605, "grad_norm": 1.1220290660858154, "learning_rate": 1.2425554475567732e-05, "loss": 0.1964, "step": 21100 }, { "epoch": 0.3111147111235475, "grad_norm": 1.6141853332519531, "learning_rate": 1.2440281582280346e-05, "loss": 0.1899, "step": 21125 }, { "epoch": 0.31148289421363456, "grad_norm": 1.1342378854751587, "learning_rate": 1.2455008688992963e-05, "loss": 0.1882, "step": 21150 }, { "epoch": 0.3118510773037216, "grad_norm": 1.2282766103744507, "learning_rate": 1.2469735795705577e-05, "loss": 0.1916, "step": 21175 }, { "epoch": 0.31221926039380865, "grad_norm": 1.3788354396820068, "learning_rate": 1.2484462902418194e-05, "loss": 0.1826, "step": 21200 }, { "epoch": 0.31258744348389567, "grad_norm": 1.207901120185852, "learning_rate": 1.2499190009130808e-05, "loss": 0.1897, "step": 21225 }, { "epoch": 0.31295562657398274, "grad_norm": 1.5424796342849731, "learning_rate": 1.2513917115843421e-05, "loss": 0.1829, "step": 21250 }, { "epoch": 0.31332380966406975, "grad_norm": 1.495710015296936, "learning_rate": 1.2528644222556036e-05, "loss": 0.1794, "step": 21275 }, { "epoch": 0.31369199275415677, "grad_norm": 1.404462218284607, "learning_rate": 1.2543371329268652e-05, "loss": 0.1861, "step": 21300 }, { "epoch": 0.31406017584424384, "grad_norm": 1.2983025312423706, "learning_rate": 1.2558098435981267e-05, "loss": 0.1898, "step": 21325 }, { "epoch": 0.31442835893433085, "grad_norm": 1.0166761875152588, "learning_rate": 1.2572825542693883e-05, "loss": 0.191, "step": 21350 }, { "epoch": 0.3147965420244179, "grad_norm": 1.436242699623108, "learning_rate": 1.2587552649406498e-05, "loss": 0.1896, "step": 21375 }, { "epoch": 0.31516472511450494, "grad_norm": 1.688125729560852, "learning_rate": 1.2602279756119114e-05, "loss": 0.1924, "step": 21400 }, { "epoch": 0.31553290820459196, "grad_norm": 1.1041502952575684, "learning_rate": 1.2617006862831729e-05, "loss": 0.1804, "step": 21425 }, { "epoch": 0.315901091294679, "grad_norm": 1.085607647895813, "learning_rate": 1.2631733969544344e-05, "loss": 0.1848, "step": 21450 }, { "epoch": 0.31626927438476604, "grad_norm": 1.2319964170455933, "learning_rate": 1.264646107625696e-05, "loss": 0.1897, "step": 21475 }, { "epoch": 0.3166374574748531, "grad_norm": 1.2102179527282715, "learning_rate": 1.2661188182969575e-05, "loss": 0.1864, "step": 21500 }, { "epoch": 0.3170056405649401, "grad_norm": 1.4258583784103394, "learning_rate": 1.2675915289682191e-05, "loss": 0.1896, "step": 21525 }, { "epoch": 0.3173738236550272, "grad_norm": 1.3039108514785767, "learning_rate": 1.2690642396394806e-05, "loss": 0.1858, "step": 21550 }, { "epoch": 0.3177420067451142, "grad_norm": 1.050047516822815, "learning_rate": 1.270536950310742e-05, "loss": 0.1763, "step": 21575 }, { "epoch": 0.31811018983520123, "grad_norm": 1.3864538669586182, "learning_rate": 1.2720096609820037e-05, "loss": 0.1932, "step": 21600 }, { "epoch": 0.3184783729252883, "grad_norm": 1.6021428108215332, "learning_rate": 1.2734823716532651e-05, "loss": 0.1841, "step": 21625 }, { "epoch": 0.3188465560153753, "grad_norm": 1.1374555826187134, "learning_rate": 1.2749550823245268e-05, "loss": 0.1835, "step": 21650 }, { "epoch": 0.3192147391054624, "grad_norm": 1.4656171798706055, "learning_rate": 1.2764277929957882e-05, "loss": 0.1758, "step": 21675 }, { "epoch": 0.3195829221955494, "grad_norm": 1.4196912050247192, "learning_rate": 1.2779005036670497e-05, "loss": 0.1816, "step": 21700 }, { "epoch": 0.3199511052856364, "grad_norm": 1.0034286975860596, "learning_rate": 1.2793732143383113e-05, "loss": 0.1879, "step": 21725 }, { "epoch": 0.3203192883757235, "grad_norm": 1.1203784942626953, "learning_rate": 1.2808459250095728e-05, "loss": 0.178, "step": 21750 }, { "epoch": 0.3206874714658105, "grad_norm": 1.4380009174346924, "learning_rate": 1.2823186356808341e-05, "loss": 0.1786, "step": 21775 }, { "epoch": 0.3210556545558976, "grad_norm": 1.2680689096450806, "learning_rate": 1.2837913463520957e-05, "loss": 0.1851, "step": 21800 }, { "epoch": 0.3214238376459846, "grad_norm": 1.1762382984161377, "learning_rate": 1.2852640570233572e-05, "loss": 0.1825, "step": 21825 }, { "epoch": 0.32179202073607166, "grad_norm": 1.2245060205459595, "learning_rate": 1.2867367676946188e-05, "loss": 0.1885, "step": 21850 }, { "epoch": 0.3221602038261587, "grad_norm": 1.1323437690734863, "learning_rate": 1.2882094783658803e-05, "loss": 0.1836, "step": 21875 }, { "epoch": 0.3225283869162457, "grad_norm": 1.123909831047058, "learning_rate": 1.2896821890371418e-05, "loss": 0.1821, "step": 21900 }, { "epoch": 0.32289657000633276, "grad_norm": 1.4582033157348633, "learning_rate": 1.2911548997084034e-05, "loss": 0.1846, "step": 21925 }, { "epoch": 0.3232647530964198, "grad_norm": 1.174345850944519, "learning_rate": 1.2926276103796649e-05, "loss": 0.1808, "step": 21950 }, { "epoch": 0.32363293618650685, "grad_norm": 1.231270432472229, "learning_rate": 1.2941003210509265e-05, "loss": 0.1713, "step": 21975 }, { "epoch": 0.32400111927659386, "grad_norm": 1.2960635423660278, "learning_rate": 1.295573031722188e-05, "loss": 0.1857, "step": 22000 }, { "epoch": 0.3243693023666809, "grad_norm": 0.9982641935348511, "learning_rate": 1.2970457423934494e-05, "loss": 0.1825, "step": 22025 }, { "epoch": 0.32473748545676795, "grad_norm": 1.3632073402404785, "learning_rate": 1.298518453064711e-05, "loss": 0.1869, "step": 22050 }, { "epoch": 0.32510566854685496, "grad_norm": 1.3916586637496948, "learning_rate": 1.2999911637359725e-05, "loss": 0.1823, "step": 22075 }, { "epoch": 0.32547385163694204, "grad_norm": 1.2210173606872559, "learning_rate": 1.3014638744072342e-05, "loss": 0.1729, "step": 22100 }, { "epoch": 0.32584203472702905, "grad_norm": 1.3650914430618286, "learning_rate": 1.3029365850784956e-05, "loss": 0.1811, "step": 22125 }, { "epoch": 0.3262102178171161, "grad_norm": 1.2728363275527954, "learning_rate": 1.304409295749757e-05, "loss": 0.1845, "step": 22150 }, { "epoch": 0.32657840090720314, "grad_norm": 1.3349742889404297, "learning_rate": 1.3058820064210187e-05, "loss": 0.1858, "step": 22175 }, { "epoch": 0.32694658399729015, "grad_norm": 1.4142699241638184, "learning_rate": 1.3073547170922802e-05, "loss": 0.1769, "step": 22200 }, { "epoch": 0.3273147670873772, "grad_norm": 1.687619924545288, "learning_rate": 1.3088274277635418e-05, "loss": 0.1879, "step": 22225 }, { "epoch": 0.32768295017746424, "grad_norm": 1.2499809265136719, "learning_rate": 1.3103001384348033e-05, "loss": 0.1823, "step": 22250 }, { "epoch": 0.3280511332675513, "grad_norm": 1.4056971073150635, "learning_rate": 1.311772849106065e-05, "loss": 0.182, "step": 22275 }, { "epoch": 0.3284193163576383, "grad_norm": 1.1245547533035278, "learning_rate": 1.3132455597773262e-05, "loss": 0.1779, "step": 22300 }, { "epoch": 0.32878749944772534, "grad_norm": 1.2430375814437866, "learning_rate": 1.3147182704485877e-05, "loss": 0.1792, "step": 22325 }, { "epoch": 0.3291556825378124, "grad_norm": 1.1041802167892456, "learning_rate": 1.3161909811198491e-05, "loss": 0.1845, "step": 22350 }, { "epoch": 0.3295238656278994, "grad_norm": 1.1275758743286133, "learning_rate": 1.3176636917911108e-05, "loss": 0.1747, "step": 22375 }, { "epoch": 0.3298920487179865, "grad_norm": 1.3952943086624146, "learning_rate": 1.3191364024623722e-05, "loss": 0.1795, "step": 22400 }, { "epoch": 0.3302602318080735, "grad_norm": 1.4974526166915894, "learning_rate": 1.3206091131336339e-05, "loss": 0.1777, "step": 22425 }, { "epoch": 0.3306284148981606, "grad_norm": 1.3657947778701782, "learning_rate": 1.3220818238048953e-05, "loss": 0.1783, "step": 22450 }, { "epoch": 0.3309965979882476, "grad_norm": 1.5914496183395386, "learning_rate": 1.323554534476157e-05, "loss": 0.1726, "step": 22475 }, { "epoch": 0.3313647810783346, "grad_norm": 1.3611053228378296, "learning_rate": 1.3250272451474184e-05, "loss": 0.1827, "step": 22500 }, { "epoch": 0.3317329641684217, "grad_norm": 1.180898904800415, "learning_rate": 1.3264999558186799e-05, "loss": 0.1756, "step": 22525 }, { "epoch": 0.3321011472585087, "grad_norm": 1.2224446535110474, "learning_rate": 1.3279726664899415e-05, "loss": 0.1783, "step": 22550 }, { "epoch": 0.33246933034859577, "grad_norm": 1.2551331520080566, "learning_rate": 1.329445377161203e-05, "loss": 0.17, "step": 22575 }, { "epoch": 0.3328375134386828, "grad_norm": 1.4178850650787354, "learning_rate": 1.3309180878324646e-05, "loss": 0.1813, "step": 22600 }, { "epoch": 0.3332056965287698, "grad_norm": 1.0674101114273071, "learning_rate": 1.3323907985037261e-05, "loss": 0.1865, "step": 22625 }, { "epoch": 0.33357387961885687, "grad_norm": 1.501469373703003, "learning_rate": 1.3338635091749876e-05, "loss": 0.1925, "step": 22650 }, { "epoch": 0.3339420627089439, "grad_norm": 1.135437250137329, "learning_rate": 1.3353362198462492e-05, "loss": 0.167, "step": 22675 }, { "epoch": 0.33431024579903096, "grad_norm": 1.4651609659194946, "learning_rate": 1.3368089305175107e-05, "loss": 0.1777, "step": 22700 }, { "epoch": 0.334678428889118, "grad_norm": 1.3253328800201416, "learning_rate": 1.3382816411887723e-05, "loss": 0.1824, "step": 22725 }, { "epoch": 0.33504661197920504, "grad_norm": 1.2355338335037231, "learning_rate": 1.3397543518600338e-05, "loss": 0.1854, "step": 22750 }, { "epoch": 0.33541479506929206, "grad_norm": 1.2989897727966309, "learning_rate": 1.3412270625312952e-05, "loss": 0.1892, "step": 22775 }, { "epoch": 0.3357829781593791, "grad_norm": 1.2566183805465698, "learning_rate": 1.3426997732025569e-05, "loss": 0.1798, "step": 22800 }, { "epoch": 0.33615116124946615, "grad_norm": 1.4441183805465698, "learning_rate": 1.3441724838738182e-05, "loss": 0.178, "step": 22825 }, { "epoch": 0.33651934433955316, "grad_norm": 1.923704743385315, "learning_rate": 1.3455862861182294e-05, "loss": 0.1797, "step": 22850 }, { "epoch": 0.33688752742964023, "grad_norm": 1.0510995388031006, "learning_rate": 1.3470589967894909e-05, "loss": 0.1688, "step": 22875 }, { "epoch": 0.33725571051972725, "grad_norm": 1.6159011125564575, "learning_rate": 1.3485317074607525e-05, "loss": 0.1735, "step": 22900 }, { "epoch": 0.33762389360981426, "grad_norm": 1.45757257938385, "learning_rate": 1.350004418132014e-05, "loss": 0.1761, "step": 22925 }, { "epoch": 0.33799207669990133, "grad_norm": 1.2837272882461548, "learning_rate": 1.3514771288032755e-05, "loss": 0.1768, "step": 22950 }, { "epoch": 0.33836025978998835, "grad_norm": 1.4296029806137085, "learning_rate": 1.352949839474537e-05, "loss": 0.1791, "step": 22975 }, { "epoch": 0.3387284428800754, "grad_norm": 1.47047758102417, "learning_rate": 1.3544225501457984e-05, "loss": 0.1761, "step": 23000 }, { "epoch": 0.33909662597016244, "grad_norm": 1.6144695281982422, "learning_rate": 1.3558952608170599e-05, "loss": 0.1807, "step": 23025 }, { "epoch": 0.3394648090602495, "grad_norm": 1.231798768043518, "learning_rate": 1.3573679714883215e-05, "loss": 0.1762, "step": 23050 }, { "epoch": 0.3398329921503365, "grad_norm": 1.2265921831130981, "learning_rate": 1.358840682159583e-05, "loss": 0.168, "step": 23075 }, { "epoch": 0.34020117524042354, "grad_norm": 1.3389085531234741, "learning_rate": 1.3603133928308446e-05, "loss": 0.1675, "step": 23100 }, { "epoch": 0.3405693583305106, "grad_norm": 1.2000136375427246, "learning_rate": 1.361786103502106e-05, "loss": 0.1652, "step": 23125 }, { "epoch": 0.3409375414205976, "grad_norm": 1.2100751399993896, "learning_rate": 1.3632588141733675e-05, "loss": 0.1786, "step": 23150 }, { "epoch": 0.3413057245106847, "grad_norm": 1.1619250774383545, "learning_rate": 1.3647315248446292e-05, "loss": 0.1777, "step": 23175 }, { "epoch": 0.3416739076007717, "grad_norm": 1.4115016460418701, "learning_rate": 1.3662042355158906e-05, "loss": 0.1742, "step": 23200 }, { "epoch": 0.3420420906908587, "grad_norm": 1.1313114166259766, "learning_rate": 1.3676769461871523e-05, "loss": 0.1663, "step": 23225 }, { "epoch": 0.3424102737809458, "grad_norm": 1.1694964170455933, "learning_rate": 1.3691496568584137e-05, "loss": 0.1763, "step": 23250 }, { "epoch": 0.3427784568710328, "grad_norm": 1.2373450994491577, "learning_rate": 1.3706223675296752e-05, "loss": 0.1738, "step": 23275 }, { "epoch": 0.3431466399611199, "grad_norm": 1.5605484247207642, "learning_rate": 1.3720950782009368e-05, "loss": 0.1727, "step": 23300 }, { "epoch": 0.3435148230512069, "grad_norm": 1.084865927696228, "learning_rate": 1.3735677888721983e-05, "loss": 0.1763, "step": 23325 }, { "epoch": 0.34388300614129397, "grad_norm": 1.0823957920074463, "learning_rate": 1.37504049954346e-05, "loss": 0.1687, "step": 23350 }, { "epoch": 0.344251189231381, "grad_norm": 1.2936733961105347, "learning_rate": 1.3765132102147214e-05, "loss": 0.1698, "step": 23375 }, { "epoch": 0.344619372321468, "grad_norm": 1.125900387763977, "learning_rate": 1.3779859208859828e-05, "loss": 0.1714, "step": 23400 }, { "epoch": 0.34498755541155507, "grad_norm": 1.1072611808776855, "learning_rate": 1.3794586315572445e-05, "loss": 0.1757, "step": 23425 }, { "epoch": 0.3453557385016421, "grad_norm": 1.4236797094345093, "learning_rate": 1.380931342228506e-05, "loss": 0.1582, "step": 23450 }, { "epoch": 0.34572392159172916, "grad_norm": 1.2479220628738403, "learning_rate": 1.3824040528997676e-05, "loss": 0.1732, "step": 23475 }, { "epoch": 0.34609210468181617, "grad_norm": 1.3718407154083252, "learning_rate": 1.3838767635710289e-05, "loss": 0.1826, "step": 23500 }, { "epoch": 0.3464602877719032, "grad_norm": 1.2314164638519287, "learning_rate": 1.3853494742422903e-05, "loss": 0.1717, "step": 23525 }, { "epoch": 0.34682847086199026, "grad_norm": 1.364166498184204, "learning_rate": 1.386822184913552e-05, "loss": 0.1761, "step": 23550 }, { "epoch": 0.34719665395207727, "grad_norm": 1.3473548889160156, "learning_rate": 1.3882948955848134e-05, "loss": 0.169, "step": 23575 }, { "epoch": 0.34756483704216434, "grad_norm": 1.6499969959259033, "learning_rate": 1.3897676062560749e-05, "loss": 0.1673, "step": 23600 }, { "epoch": 0.34793302013225136, "grad_norm": 1.2165099382400513, "learning_rate": 1.3912403169273365e-05, "loss": 0.1699, "step": 23625 }, { "epoch": 0.34830120322233843, "grad_norm": 1.2824857234954834, "learning_rate": 1.392713027598598e-05, "loss": 0.173, "step": 23650 }, { "epoch": 0.34866938631242544, "grad_norm": 1.24893319606781, "learning_rate": 1.3941857382698596e-05, "loss": 0.1644, "step": 23675 }, { "epoch": 0.34903756940251246, "grad_norm": 1.295711636543274, "learning_rate": 1.3956584489411211e-05, "loss": 0.1757, "step": 23700 }, { "epoch": 0.34940575249259953, "grad_norm": 1.4278984069824219, "learning_rate": 1.3971311596123827e-05, "loss": 0.1616, "step": 23725 }, { "epoch": 0.34977393558268655, "grad_norm": 1.3779631853103638, "learning_rate": 1.3986038702836442e-05, "loss": 0.1764, "step": 23750 }, { "epoch": 0.3501421186727736, "grad_norm": 1.279738426208496, "learning_rate": 1.4000765809549057e-05, "loss": 0.1674, "step": 23775 }, { "epoch": 0.35051030176286063, "grad_norm": 1.2752861976623535, "learning_rate": 1.4015492916261673e-05, "loss": 0.1622, "step": 23800 }, { "epoch": 0.35087848485294765, "grad_norm": 1.615437626838684, "learning_rate": 1.4030220022974288e-05, "loss": 0.1751, "step": 23825 }, { "epoch": 0.3512466679430347, "grad_norm": 1.3879810571670532, "learning_rate": 1.4044947129686904e-05, "loss": 0.166, "step": 23850 }, { "epoch": 0.35161485103312173, "grad_norm": 1.3716177940368652, "learning_rate": 1.4059674236399519e-05, "loss": 0.1669, "step": 23875 }, { "epoch": 0.3519830341232088, "grad_norm": 1.2520028352737427, "learning_rate": 1.4074401343112133e-05, "loss": 0.1728, "step": 23900 }, { "epoch": 0.3523512172132958, "grad_norm": 1.7196623086929321, "learning_rate": 1.408912844982475e-05, "loss": 0.1766, "step": 23925 }, { "epoch": 0.3527194003033829, "grad_norm": 1.1781913042068481, "learning_rate": 1.4103855556537364e-05, "loss": 0.1736, "step": 23950 }, { "epoch": 0.3530875833934699, "grad_norm": 1.1760460138320923, "learning_rate": 1.411858266324998e-05, "loss": 0.1637, "step": 23975 }, { "epoch": 0.3534557664835569, "grad_norm": 1.2736172676086426, "learning_rate": 1.4133309769962595e-05, "loss": 0.1588, "step": 24000 }, { "epoch": 0.353823949573644, "grad_norm": 1.2447682619094849, "learning_rate": 1.4148036876675208e-05, "loss": 0.1635, "step": 24025 }, { "epoch": 0.354192132663731, "grad_norm": 1.9752280712127686, "learning_rate": 1.4162763983387825e-05, "loss": 0.1662, "step": 24050 }, { "epoch": 0.3545603157538181, "grad_norm": 1.1681700944900513, "learning_rate": 1.417749109010044e-05, "loss": 0.1707, "step": 24075 }, { "epoch": 0.3549284988439051, "grad_norm": 1.2221779823303223, "learning_rate": 1.4192218196813054e-05, "loss": 0.1748, "step": 24100 }, { "epoch": 0.3552966819339921, "grad_norm": 1.6404857635498047, "learning_rate": 1.420694530352567e-05, "loss": 0.1741, "step": 24125 }, { "epoch": 0.3556648650240792, "grad_norm": 1.340167760848999, "learning_rate": 1.4221672410238285e-05, "loss": 0.1669, "step": 24150 }, { "epoch": 0.3560330481141662, "grad_norm": 1.8098108768463135, "learning_rate": 1.4236399516950901e-05, "loss": 0.1582, "step": 24175 }, { "epoch": 0.35640123120425327, "grad_norm": 1.1971865892410278, "learning_rate": 1.4251126623663516e-05, "loss": 0.1637, "step": 24200 }, { "epoch": 0.3567694142943403, "grad_norm": 1.2659186124801636, "learning_rate": 1.426585373037613e-05, "loss": 0.1708, "step": 24225 }, { "epoch": 0.35713759738442735, "grad_norm": 1.5661580562591553, "learning_rate": 1.4280580837088747e-05, "loss": 0.1704, "step": 24250 }, { "epoch": 0.35750578047451437, "grad_norm": 1.0712963342666626, "learning_rate": 1.4295307943801362e-05, "loss": 0.1687, "step": 24275 }, { "epoch": 0.3578739635646014, "grad_norm": 1.2966463565826416, "learning_rate": 1.4310035050513978e-05, "loss": 0.1694, "step": 24300 }, { "epoch": 0.35824214665468845, "grad_norm": 1.4097758531570435, "learning_rate": 1.4324762157226593e-05, "loss": 0.1677, "step": 24325 }, { "epoch": 0.35861032974477547, "grad_norm": 1.2268353700637817, "learning_rate": 1.4339489263939207e-05, "loss": 0.1694, "step": 24350 }, { "epoch": 0.35897851283486254, "grad_norm": 1.4043735265731812, "learning_rate": 1.4354216370651824e-05, "loss": 0.1647, "step": 24375 }, { "epoch": 0.35934669592494956, "grad_norm": 1.1261810064315796, "learning_rate": 1.4368943477364438e-05, "loss": 0.168, "step": 24400 }, { "epoch": 0.35971487901503657, "grad_norm": 1.194634199142456, "learning_rate": 1.4383670584077055e-05, "loss": 0.1615, "step": 24425 }, { "epoch": 0.36008306210512364, "grad_norm": 1.2855753898620605, "learning_rate": 1.439839769078967e-05, "loss": 0.1694, "step": 24450 }, { "epoch": 0.36045124519521066, "grad_norm": 1.473142147064209, "learning_rate": 1.4413124797502284e-05, "loss": 0.1655, "step": 24475 }, { "epoch": 0.3608194282852977, "grad_norm": 1.2650806903839111, "learning_rate": 1.44278519042149e-05, "loss": 0.1721, "step": 24500 }, { "epoch": 0.36118761137538474, "grad_norm": 1.557308554649353, "learning_rate": 1.4442579010927515e-05, "loss": 0.1631, "step": 24525 }, { "epoch": 0.3615557944654718, "grad_norm": 2.086210012435913, "learning_rate": 1.4457306117640128e-05, "loss": 0.165, "step": 24550 }, { "epoch": 0.36192397755555883, "grad_norm": 1.1287481784820557, "learning_rate": 1.4472033224352744e-05, "loss": 0.1694, "step": 24575 }, { "epoch": 0.36229216064564584, "grad_norm": 1.2974869012832642, "learning_rate": 1.4486760331065359e-05, "loss": 0.1605, "step": 24600 }, { "epoch": 0.3626603437357329, "grad_norm": 1.2801960706710815, "learning_rate": 1.4501487437777975e-05, "loss": 0.168, "step": 24625 }, { "epoch": 0.36302852682581993, "grad_norm": 1.556368350982666, "learning_rate": 1.451621454449059e-05, "loss": 0.1715, "step": 24650 }, { "epoch": 0.363396709915907, "grad_norm": 1.3484870195388794, "learning_rate": 1.4530941651203205e-05, "loss": 0.16, "step": 24675 }, { "epoch": 0.363764893005994, "grad_norm": 1.9718481302261353, "learning_rate": 1.4545668757915821e-05, "loss": 0.1669, "step": 24700 }, { "epoch": 0.36413307609608103, "grad_norm": 1.3649804592132568, "learning_rate": 1.4560395864628436e-05, "loss": 0.165, "step": 24725 }, { "epoch": 0.3645012591861681, "grad_norm": 1.5112513303756714, "learning_rate": 1.4575122971341052e-05, "loss": 0.1633, "step": 24750 }, { "epoch": 0.3648694422762551, "grad_norm": 1.6509013175964355, "learning_rate": 1.4589850078053667e-05, "loss": 0.1677, "step": 24775 }, { "epoch": 0.3652376253663422, "grad_norm": 1.242371916770935, "learning_rate": 1.4604577184766283e-05, "loss": 0.1639, "step": 24800 }, { "epoch": 0.3656058084564292, "grad_norm": 1.3893762826919556, "learning_rate": 1.4619304291478898e-05, "loss": 0.1641, "step": 24825 }, { "epoch": 0.3659739915465163, "grad_norm": 1.8748382329940796, "learning_rate": 1.4633442313923007e-05, "loss": 0.1677, "step": 24850 }, { "epoch": 0.3663421746366033, "grad_norm": 1.4344875812530518, "learning_rate": 1.4648169420635623e-05, "loss": 0.1639, "step": 24875 }, { "epoch": 0.3667103577266903, "grad_norm": 1.229787826538086, "learning_rate": 1.4662896527348238e-05, "loss": 0.1669, "step": 24900 }, { "epoch": 0.3670785408167774, "grad_norm": 1.7821155786514282, "learning_rate": 1.4677623634060854e-05, "loss": 0.1694, "step": 24925 }, { "epoch": 0.3674467239068644, "grad_norm": 1.4540947675704956, "learning_rate": 1.4692350740773469e-05, "loss": 0.1632, "step": 24950 }, { "epoch": 0.36781490699695146, "grad_norm": 1.3375898599624634, "learning_rate": 1.4707077847486085e-05, "loss": 0.1614, "step": 24975 }, { "epoch": 0.3681830900870385, "grad_norm": 1.3651843070983887, "learning_rate": 1.47218049541987e-05, "loss": 0.1579, "step": 25000 }, { "epoch": 0.3685512731771255, "grad_norm": 1.2038607597351074, "learning_rate": 1.4736532060911314e-05, "loss": 0.1684, "step": 25025 }, { "epoch": 0.36891945626721256, "grad_norm": 1.362025260925293, "learning_rate": 1.475125916762393e-05, "loss": 0.155, "step": 25050 }, { "epoch": 0.3692876393572996, "grad_norm": 1.3822096586227417, "learning_rate": 1.4765986274336545e-05, "loss": 0.1624, "step": 25075 }, { "epoch": 0.36965582244738665, "grad_norm": 2.2989706993103027, "learning_rate": 1.4780713381049162e-05, "loss": 0.1651, "step": 25100 }, { "epoch": 0.37002400553747367, "grad_norm": 1.4963966608047485, "learning_rate": 1.4795440487761776e-05, "loss": 0.1634, "step": 25125 }, { "epoch": 0.37039218862756074, "grad_norm": 1.811550498008728, "learning_rate": 1.4810167594474391e-05, "loss": 0.1666, "step": 25150 }, { "epoch": 0.37076037171764775, "grad_norm": 1.3327304124832153, "learning_rate": 1.4824894701187007e-05, "loss": 0.1567, "step": 25175 }, { "epoch": 0.37112855480773477, "grad_norm": 1.1280299425125122, "learning_rate": 1.483962180789962e-05, "loss": 0.162, "step": 25200 }, { "epoch": 0.37149673789782184, "grad_norm": 1.8525338172912598, "learning_rate": 1.4854348914612235e-05, "loss": 0.1699, "step": 25225 }, { "epoch": 0.37186492098790885, "grad_norm": 1.4508144855499268, "learning_rate": 1.4869076021324851e-05, "loss": 0.1653, "step": 25250 }, { "epoch": 0.3722331040779959, "grad_norm": 1.5556083917617798, "learning_rate": 1.4883803128037466e-05, "loss": 0.1578, "step": 25275 }, { "epoch": 0.37260128716808294, "grad_norm": 1.4834439754486084, "learning_rate": 1.4898530234750082e-05, "loss": 0.1652, "step": 25300 }, { "epoch": 0.37296947025816996, "grad_norm": 1.4071025848388672, "learning_rate": 1.4913257341462697e-05, "loss": 0.1585, "step": 25325 }, { "epoch": 0.373337653348257, "grad_norm": 1.3072419166564941, "learning_rate": 1.4927984448175312e-05, "loss": 0.1604, "step": 25350 }, { "epoch": 0.37370583643834404, "grad_norm": 1.2802237272262573, "learning_rate": 1.4942711554887928e-05, "loss": 0.1575, "step": 25375 }, { "epoch": 0.3740740195284311, "grad_norm": 1.0141359567642212, "learning_rate": 1.4957438661600543e-05, "loss": 0.151, "step": 25400 }, { "epoch": 0.3744422026185181, "grad_norm": 1.3200985193252563, "learning_rate": 1.4972165768313159e-05, "loss": 0.1611, "step": 25425 }, { "epoch": 0.3748103857086052, "grad_norm": 1.478265643119812, "learning_rate": 1.4986892875025774e-05, "loss": 0.1681, "step": 25450 }, { "epoch": 0.3751785687986922, "grad_norm": 1.1375505924224854, "learning_rate": 1.5001619981738388e-05, "loss": 0.1581, "step": 25475 }, { "epoch": 0.37554675188877923, "grad_norm": 1.9286413192749023, "learning_rate": 1.5016347088451005e-05, "loss": 0.1711, "step": 25500 }, { "epoch": 0.3759149349788663, "grad_norm": 1.6296018362045288, "learning_rate": 1.503107419516362e-05, "loss": 0.1616, "step": 25525 }, { "epoch": 0.3762831180689533, "grad_norm": 1.482069492340088, "learning_rate": 1.5045801301876236e-05, "loss": 0.1619, "step": 25550 }, { "epoch": 0.3766513011590404, "grad_norm": 1.1911407709121704, "learning_rate": 1.506052840858885e-05, "loss": 0.1602, "step": 25575 }, { "epoch": 0.3770194842491274, "grad_norm": 1.1659958362579346, "learning_rate": 1.5075255515301465e-05, "loss": 0.1619, "step": 25600 }, { "epoch": 0.37738766733921447, "grad_norm": 1.192456841468811, "learning_rate": 1.5089982622014081e-05, "loss": 0.1591, "step": 25625 }, { "epoch": 0.3777558504293015, "grad_norm": 1.5322376489639282, "learning_rate": 1.5104709728726696e-05, "loss": 0.1645, "step": 25650 }, { "epoch": 0.3781240335193885, "grad_norm": 1.208443522453308, "learning_rate": 1.5119436835439312e-05, "loss": 0.1559, "step": 25675 }, { "epoch": 0.3784922166094756, "grad_norm": 1.7743768692016602, "learning_rate": 1.5134163942151927e-05, "loss": 0.1589, "step": 25700 }, { "epoch": 0.3788603996995626, "grad_norm": 1.4697054624557495, "learning_rate": 1.514889104886454e-05, "loss": 0.1511, "step": 25725 }, { "epoch": 0.37922858278964966, "grad_norm": 1.455379605293274, "learning_rate": 1.5163618155577156e-05, "loss": 0.165, "step": 25750 }, { "epoch": 0.3795967658797367, "grad_norm": 1.2325196266174316, "learning_rate": 1.5178345262289771e-05, "loss": 0.1546, "step": 25775 }, { "epoch": 0.3799649489698237, "grad_norm": 1.4367910623550415, "learning_rate": 1.5193072369002386e-05, "loss": 0.1687, "step": 25800 }, { "epoch": 0.38033313205991076, "grad_norm": 1.4400395154953003, "learning_rate": 1.5207799475715002e-05, "loss": 0.1519, "step": 25825 }, { "epoch": 0.3807013151499978, "grad_norm": 1.612979769706726, "learning_rate": 1.5222526582427617e-05, "loss": 0.1584, "step": 25850 }, { "epoch": 0.38106949824008485, "grad_norm": 1.6632273197174072, "learning_rate": 1.5237253689140233e-05, "loss": 0.159, "step": 25875 }, { "epoch": 0.38143768133017186, "grad_norm": 1.35444974899292, "learning_rate": 1.5251980795852848e-05, "loss": 0.1652, "step": 25900 }, { "epoch": 0.38180586442025893, "grad_norm": 1.7284669876098633, "learning_rate": 1.5266707902565464e-05, "loss": 0.1544, "step": 25925 }, { "epoch": 0.38217404751034595, "grad_norm": 1.4676893949508667, "learning_rate": 1.528143500927808e-05, "loss": 0.1624, "step": 25950 }, { "epoch": 0.38254223060043296, "grad_norm": 1.2698357105255127, "learning_rate": 1.5296162115990693e-05, "loss": 0.1599, "step": 25975 }, { "epoch": 0.38291041369052004, "grad_norm": 1.423574686050415, "learning_rate": 1.5310889222703308e-05, "loss": 0.1616, "step": 26000 }, { "epoch": 0.38327859678060705, "grad_norm": 1.264630913734436, "learning_rate": 1.5325616329415926e-05, "loss": 0.1554, "step": 26025 }, { "epoch": 0.3836467798706941, "grad_norm": 1.2193396091461182, "learning_rate": 1.534034343612854e-05, "loss": 0.1567, "step": 26050 }, { "epoch": 0.38401496296078114, "grad_norm": 1.2291114330291748, "learning_rate": 1.5355070542841155e-05, "loss": 0.16, "step": 26075 }, { "epoch": 0.38438314605086815, "grad_norm": 1.4714957475662231, "learning_rate": 1.536979764955377e-05, "loss": 0.1691, "step": 26100 }, { "epoch": 0.3847513291409552, "grad_norm": 1.1252244710922241, "learning_rate": 1.5384524756266385e-05, "loss": 0.1541, "step": 26125 }, { "epoch": 0.38511951223104224, "grad_norm": 1.645963430404663, "learning_rate": 1.5399251862979003e-05, "loss": 0.1505, "step": 26150 }, { "epoch": 0.3854876953211293, "grad_norm": 1.2302244901657104, "learning_rate": 1.5413978969691617e-05, "loss": 0.1609, "step": 26175 }, { "epoch": 0.3858558784112163, "grad_norm": 1.4861561059951782, "learning_rate": 1.5428706076404232e-05, "loss": 0.1588, "step": 26200 }, { "epoch": 0.3862240615013034, "grad_norm": 1.3448596000671387, "learning_rate": 1.5443433183116847e-05, "loss": 0.1595, "step": 26225 }, { "epoch": 0.3865922445913904, "grad_norm": 2.0465972423553467, "learning_rate": 1.545816028982946e-05, "loss": 0.1557, "step": 26250 }, { "epoch": 0.3869604276814774, "grad_norm": 1.3477450609207153, "learning_rate": 1.5472887396542076e-05, "loss": 0.1481, "step": 26275 }, { "epoch": 0.3873286107715645, "grad_norm": 1.526447057723999, "learning_rate": 1.548761450325469e-05, "loss": 0.1608, "step": 26300 }, { "epoch": 0.3876967938616515, "grad_norm": 1.4494584798812866, "learning_rate": 1.5502341609967305e-05, "loss": 0.1599, "step": 26325 }, { "epoch": 0.3880649769517386, "grad_norm": 1.4448504447937012, "learning_rate": 1.5517068716679923e-05, "loss": 0.1495, "step": 26350 }, { "epoch": 0.3884331600418256, "grad_norm": 1.3479735851287842, "learning_rate": 1.5531795823392538e-05, "loss": 0.1506, "step": 26375 }, { "epoch": 0.3888013431319126, "grad_norm": 2.5692548751831055, "learning_rate": 1.5546522930105152e-05, "loss": 0.1544, "step": 26400 }, { "epoch": 0.3891695262219997, "grad_norm": 1.4863978624343872, "learning_rate": 1.5561250036817767e-05, "loss": 0.1487, "step": 26425 }, { "epoch": 0.3895377093120867, "grad_norm": 1.1599884033203125, "learning_rate": 1.5575977143530382e-05, "loss": 0.1602, "step": 26450 }, { "epoch": 0.38990589240217377, "grad_norm": 1.4004433155059814, "learning_rate": 1.5590704250243e-05, "loss": 0.1558, "step": 26475 }, { "epoch": 0.3902740754922608, "grad_norm": 1.9732985496520996, "learning_rate": 1.5605431356955614e-05, "loss": 0.1543, "step": 26500 }, { "epoch": 0.39064225858234786, "grad_norm": 1.709584355354309, "learning_rate": 1.562015846366823e-05, "loss": 0.1523, "step": 26525 }, { "epoch": 0.39101044167243487, "grad_norm": 1.5066332817077637, "learning_rate": 1.5634885570380844e-05, "loss": 0.1413, "step": 26550 }, { "epoch": 0.3913786247625219, "grad_norm": 1.2560043334960938, "learning_rate": 1.564961267709346e-05, "loss": 0.1554, "step": 26575 }, { "epoch": 0.39174680785260896, "grad_norm": 1.3362622261047363, "learning_rate": 1.5664339783806076e-05, "loss": 0.1583, "step": 26600 }, { "epoch": 0.392114990942696, "grad_norm": 1.3887149095535278, "learning_rate": 1.567906689051869e-05, "loss": 0.1544, "step": 26625 }, { "epoch": 0.39248317403278304, "grad_norm": 1.496191143989563, "learning_rate": 1.5693793997231306e-05, "loss": 0.1525, "step": 26650 }, { "epoch": 0.39285135712287006, "grad_norm": 1.2821321487426758, "learning_rate": 1.570852110394392e-05, "loss": 0.1475, "step": 26675 }, { "epoch": 0.3932195402129571, "grad_norm": 1.2387585639953613, "learning_rate": 1.5723248210656535e-05, "loss": 0.1463, "step": 26700 }, { "epoch": 0.39358772330304415, "grad_norm": 1.3753855228424072, "learning_rate": 1.5737975317369153e-05, "loss": 0.1505, "step": 26725 }, { "epoch": 0.39395590639313116, "grad_norm": 1.6069930791854858, "learning_rate": 1.5752702424081768e-05, "loss": 0.1492, "step": 26750 }, { "epoch": 0.39432408948321823, "grad_norm": 1.6889283657073975, "learning_rate": 1.576742953079438e-05, "loss": 0.1538, "step": 26775 }, { "epoch": 0.39469227257330525, "grad_norm": 1.3301491737365723, "learning_rate": 1.5782156637506997e-05, "loss": 0.1477, "step": 26800 }, { "epoch": 0.3950604556633923, "grad_norm": 1.5237812995910645, "learning_rate": 1.5796883744219612e-05, "loss": 0.1395, "step": 26825 }, { "epoch": 0.39542863875347933, "grad_norm": 1.6008751392364502, "learning_rate": 1.5811610850932226e-05, "loss": 0.1577, "step": 26850 }, { "epoch": 0.39579682184356635, "grad_norm": 1.2232062816619873, "learning_rate": 1.5825748873376337e-05, "loss": 0.151, "step": 26875 }, { "epoch": 0.3961650049336534, "grad_norm": 1.3552442789077759, "learning_rate": 1.5840475980088955e-05, "loss": 0.1639, "step": 26900 }, { "epoch": 0.39653318802374043, "grad_norm": 1.5702840089797974, "learning_rate": 1.5855203086801567e-05, "loss": 0.156, "step": 26925 }, { "epoch": 0.3969013711138275, "grad_norm": 1.4246914386749268, "learning_rate": 1.586993019351418e-05, "loss": 0.1561, "step": 26950 }, { "epoch": 0.3972695542039145, "grad_norm": 1.04789137840271, "learning_rate": 1.58846573002268e-05, "loss": 0.1556, "step": 26975 }, { "epoch": 0.39763773729400154, "grad_norm": 1.5424654483795166, "learning_rate": 1.5899384406939414e-05, "loss": 0.1585, "step": 27000 }, { "epoch": 0.3980059203840886, "grad_norm": 1.3209010362625122, "learning_rate": 1.591411151365203e-05, "loss": 0.1482, "step": 27025 }, { "epoch": 0.3983741034741756, "grad_norm": 1.437687873840332, "learning_rate": 1.5928838620364643e-05, "loss": 0.1491, "step": 27050 }, { "epoch": 0.3987422865642627, "grad_norm": 1.3784931898117065, "learning_rate": 1.5943565727077258e-05, "loss": 0.1544, "step": 27075 }, { "epoch": 0.3991104696543497, "grad_norm": 1.365294098854065, "learning_rate": 1.5958292833789876e-05, "loss": 0.1527, "step": 27100 }, { "epoch": 0.3994786527444368, "grad_norm": 1.4586961269378662, "learning_rate": 1.597301994050249e-05, "loss": 0.1513, "step": 27125 }, { "epoch": 0.3998468358345238, "grad_norm": 1.2528156042099, "learning_rate": 1.5987747047215105e-05, "loss": 0.1491, "step": 27150 }, { "epoch": 0.4002150189246108, "grad_norm": 1.2851122617721558, "learning_rate": 1.600247415392772e-05, "loss": 0.1498, "step": 27175 }, { "epoch": 0.4005832020146979, "grad_norm": 1.3917914628982544, "learning_rate": 1.6017201260640335e-05, "loss": 0.1576, "step": 27200 }, { "epoch": 0.4009513851047849, "grad_norm": 1.2627981901168823, "learning_rate": 1.6031928367352953e-05, "loss": 0.1529, "step": 27225 }, { "epoch": 0.40131956819487197, "grad_norm": 1.2389428615570068, "learning_rate": 1.6046655474065567e-05, "loss": 0.1585, "step": 27250 }, { "epoch": 0.401687751284959, "grad_norm": 1.2482959032058716, "learning_rate": 1.6061382580778182e-05, "loss": 0.141, "step": 27275 }, { "epoch": 0.402055934375046, "grad_norm": 1.5105633735656738, "learning_rate": 1.6076109687490797e-05, "loss": 0.1356, "step": 27300 }, { "epoch": 0.40242411746513307, "grad_norm": 1.5747884511947632, "learning_rate": 1.609083679420341e-05, "loss": 0.1487, "step": 27325 }, { "epoch": 0.4027923005552201, "grad_norm": 1.2051383256912231, "learning_rate": 1.610556390091603e-05, "loss": 0.1556, "step": 27350 }, { "epoch": 0.40316048364530715, "grad_norm": 1.313347339630127, "learning_rate": 1.6120291007628644e-05, "loss": 0.1528, "step": 27375 }, { "epoch": 0.40352866673539417, "grad_norm": 1.3574309349060059, "learning_rate": 1.613501811434126e-05, "loss": 0.1531, "step": 27400 }, { "epoch": 0.40389684982548124, "grad_norm": 1.1611549854278564, "learning_rate": 1.6149745221053873e-05, "loss": 0.1355, "step": 27425 }, { "epoch": 0.40426503291556826, "grad_norm": 1.5017125606536865, "learning_rate": 1.6164472327766488e-05, "loss": 0.146, "step": 27450 }, { "epoch": 0.40463321600565527, "grad_norm": 2.055236577987671, "learning_rate": 1.6179199434479102e-05, "loss": 0.1408, "step": 27475 }, { "epoch": 0.40500139909574234, "grad_norm": 1.3194584846496582, "learning_rate": 1.6193926541191717e-05, "loss": 0.1505, "step": 27500 }, { "epoch": 0.40536958218582936, "grad_norm": 1.3802636861801147, "learning_rate": 1.6208653647904332e-05, "loss": 0.1548, "step": 27525 }, { "epoch": 0.40573776527591643, "grad_norm": 1.34214186668396, "learning_rate": 1.622338075461695e-05, "loss": 0.1363, "step": 27550 }, { "epoch": 0.40610594836600344, "grad_norm": 1.2384545803070068, "learning_rate": 1.6238107861329564e-05, "loss": 0.1528, "step": 27575 }, { "epoch": 0.40647413145609046, "grad_norm": 1.8981614112854004, "learning_rate": 1.625283496804218e-05, "loss": 0.1479, "step": 27600 }, { "epoch": 0.40684231454617753, "grad_norm": 1.6245635747909546, "learning_rate": 1.6267562074754794e-05, "loss": 0.1519, "step": 27625 }, { "epoch": 0.40721049763626455, "grad_norm": 1.3965609073638916, "learning_rate": 1.628228918146741e-05, "loss": 0.1538, "step": 27650 }, { "epoch": 0.4075786807263516, "grad_norm": 1.1727200746536255, "learning_rate": 1.6297016288180026e-05, "loss": 0.1458, "step": 27675 }, { "epoch": 0.40794686381643863, "grad_norm": 1.342296838760376, "learning_rate": 1.631174339489264e-05, "loss": 0.1502, "step": 27700 }, { "epoch": 0.4083150469065257, "grad_norm": 1.6224783658981323, "learning_rate": 1.6326470501605256e-05, "loss": 0.1457, "step": 27725 }, { "epoch": 0.4086832299966127, "grad_norm": 1.3962500095367432, "learning_rate": 1.634119760831787e-05, "loss": 0.1497, "step": 27750 }, { "epoch": 0.40905141308669973, "grad_norm": 1.6830774545669556, "learning_rate": 1.6355924715030485e-05, "loss": 0.1431, "step": 27775 }, { "epoch": 0.4094195961767868, "grad_norm": 1.2840871810913086, "learning_rate": 1.6370651821743103e-05, "loss": 0.1545, "step": 27800 }, { "epoch": 0.4097877792668738, "grad_norm": 1.1633930206298828, "learning_rate": 1.6385378928455718e-05, "loss": 0.1487, "step": 27825 }, { "epoch": 0.4101559623569609, "grad_norm": 1.2747087478637695, "learning_rate": 1.6400106035168332e-05, "loss": 0.1436, "step": 27850 }, { "epoch": 0.4105241454470479, "grad_norm": 1.332680344581604, "learning_rate": 1.6414833141880947e-05, "loss": 0.1437, "step": 27875 }, { "epoch": 0.4108923285371349, "grad_norm": 1.2956751585006714, "learning_rate": 1.6429560248593565e-05, "loss": 0.1411, "step": 27900 }, { "epoch": 0.411260511627222, "grad_norm": 1.2702312469482422, "learning_rate": 1.644428735530618e-05, "loss": 0.1435, "step": 27925 }, { "epoch": 0.411628694717309, "grad_norm": 1.5242955684661865, "learning_rate": 1.6459014462018794e-05, "loss": 0.1499, "step": 27950 }, { "epoch": 0.4119968778073961, "grad_norm": 1.279181718826294, "learning_rate": 1.6473741568731406e-05, "loss": 0.1473, "step": 27975 }, { "epoch": 0.4123650608974831, "grad_norm": 1.2853556871414185, "learning_rate": 1.6488468675444024e-05, "loss": 0.1484, "step": 28000 }, { "epoch": 0.41273324398757016, "grad_norm": 1.3675915002822876, "learning_rate": 1.650319578215664e-05, "loss": 0.1502, "step": 28025 }, { "epoch": 0.4131014270776572, "grad_norm": 1.477524757385254, "learning_rate": 1.6517922888869253e-05, "loss": 0.1409, "step": 28050 }, { "epoch": 0.4134696101677442, "grad_norm": 1.4106136560440063, "learning_rate": 1.6532649995581868e-05, "loss": 0.1453, "step": 28075 }, { "epoch": 0.41383779325783127, "grad_norm": 1.4092079401016235, "learning_rate": 1.6547377102294482e-05, "loss": 0.1502, "step": 28100 }, { "epoch": 0.4142059763479183, "grad_norm": 1.4397227764129639, "learning_rate": 1.65621042090071e-05, "loss": 0.146, "step": 28125 }, { "epoch": 0.41457415943800535, "grad_norm": 1.4936139583587646, "learning_rate": 1.6576831315719715e-05, "loss": 0.144, "step": 28150 }, { "epoch": 0.41494234252809237, "grad_norm": 1.263158917427063, "learning_rate": 1.659155842243233e-05, "loss": 0.1464, "step": 28175 }, { "epoch": 0.4153105256181794, "grad_norm": 1.577727198600769, "learning_rate": 1.6606285529144944e-05, "loss": 0.146, "step": 28200 }, { "epoch": 0.41567870870826645, "grad_norm": 1.1452049016952515, "learning_rate": 1.6621012635857562e-05, "loss": 0.1421, "step": 28225 }, { "epoch": 0.41604689179835347, "grad_norm": 1.2748959064483643, "learning_rate": 1.6635739742570177e-05, "loss": 0.1412, "step": 28250 }, { "epoch": 0.41641507488844054, "grad_norm": 1.3492980003356934, "learning_rate": 1.665046684928279e-05, "loss": 0.1524, "step": 28275 }, { "epoch": 0.41678325797852755, "grad_norm": 1.2692748308181763, "learning_rate": 1.6665193955995406e-05, "loss": 0.1499, "step": 28300 }, { "epoch": 0.4171514410686146, "grad_norm": 1.4279532432556152, "learning_rate": 1.667992106270802e-05, "loss": 0.1481, "step": 28325 }, { "epoch": 0.41751962415870164, "grad_norm": 1.5554064512252808, "learning_rate": 1.669464816942064e-05, "loss": 0.1521, "step": 28350 }, { "epoch": 0.41788780724878866, "grad_norm": 1.4604122638702393, "learning_rate": 1.6709375276133254e-05, "loss": 0.1502, "step": 28375 }, { "epoch": 0.4182559903388757, "grad_norm": 1.6504544019699097, "learning_rate": 1.672410238284587e-05, "loss": 0.1439, "step": 28400 }, { "epoch": 0.41862417342896274, "grad_norm": 1.6239420175552368, "learning_rate": 1.6738829489558483e-05, "loss": 0.1497, "step": 28425 }, { "epoch": 0.4189923565190498, "grad_norm": 1.3682606220245361, "learning_rate": 1.6753556596271098e-05, "loss": 0.1514, "step": 28450 }, { "epoch": 0.41936053960913683, "grad_norm": 1.132874846458435, "learning_rate": 1.6768283702983716e-05, "loss": 0.1442, "step": 28475 }, { "epoch": 0.41972872269922384, "grad_norm": 1.3476136922836304, "learning_rate": 1.6783010809696327e-05, "loss": 0.146, "step": 28500 }, { "epoch": 0.4200969057893109, "grad_norm": 1.2397880554199219, "learning_rate": 1.679773791640894e-05, "loss": 0.1438, "step": 28525 }, { "epoch": 0.42046508887939793, "grad_norm": 1.3864375352859497, "learning_rate": 1.681246502312156e-05, "loss": 0.1456, "step": 28550 }, { "epoch": 0.420833271969485, "grad_norm": 1.5281440019607544, "learning_rate": 1.6827192129834174e-05, "loss": 0.1536, "step": 28575 }, { "epoch": 0.421201455059572, "grad_norm": 1.2818355560302734, "learning_rate": 1.684191923654679e-05, "loss": 0.1432, "step": 28600 }, { "epoch": 0.4215696381496591, "grad_norm": 1.3338851928710938, "learning_rate": 1.6856646343259404e-05, "loss": 0.1395, "step": 28625 }, { "epoch": 0.4219378212397461, "grad_norm": 1.4486943483352661, "learning_rate": 1.6871373449972018e-05, "loss": 0.1412, "step": 28650 }, { "epoch": 0.4223060043298331, "grad_norm": 1.2234041690826416, "learning_rate": 1.6886100556684636e-05, "loss": 0.1526, "step": 28675 }, { "epoch": 0.4226741874199202, "grad_norm": 1.268106460571289, "learning_rate": 1.690082766339725e-05, "loss": 0.1439, "step": 28700 }, { "epoch": 0.4230423705100072, "grad_norm": 1.360742211341858, "learning_rate": 1.6915554770109866e-05, "loss": 0.1513, "step": 28725 }, { "epoch": 0.4234105536000943, "grad_norm": 1.2657285928726196, "learning_rate": 1.693028187682248e-05, "loss": 0.1518, "step": 28750 }, { "epoch": 0.4237787366901813, "grad_norm": 1.2721775770187378, "learning_rate": 1.6945008983535095e-05, "loss": 0.1415, "step": 28775 }, { "epoch": 0.4241469197802683, "grad_norm": 1.7750695943832397, "learning_rate": 1.6959736090247713e-05, "loss": 0.1366, "step": 28800 }, { "epoch": 0.4245151028703554, "grad_norm": 1.446153998374939, "learning_rate": 1.6974463196960328e-05, "loss": 0.1384, "step": 28825 }, { "epoch": 0.4248832859604424, "grad_norm": 1.5607752799987793, "learning_rate": 1.6989190303672942e-05, "loss": 0.1461, "step": 28850 }, { "epoch": 0.42525146905052946, "grad_norm": 1.1609091758728027, "learning_rate": 1.7003328326117053e-05, "loss": 0.1462, "step": 28875 }, { "epoch": 0.4256196521406165, "grad_norm": 1.1347569227218628, "learning_rate": 1.7018055432829668e-05, "loss": 0.14, "step": 28900 }, { "epoch": 0.42598783523070355, "grad_norm": 1.4356962442398071, "learning_rate": 1.7032782539542282e-05, "loss": 0.1479, "step": 28925 }, { "epoch": 0.42635601832079056, "grad_norm": 1.5667729377746582, "learning_rate": 1.7047509646254897e-05, "loss": 0.1491, "step": 28950 }, { "epoch": 0.4267242014108776, "grad_norm": 1.314302682876587, "learning_rate": 1.7062236752967515e-05, "loss": 0.1437, "step": 28975 }, { "epoch": 0.42709238450096465, "grad_norm": 1.2139393091201782, "learning_rate": 1.707696385968013e-05, "loss": 0.1379, "step": 29000 }, { "epoch": 0.42746056759105167, "grad_norm": 1.203644871711731, "learning_rate": 1.7091690966392744e-05, "loss": 0.1431, "step": 29025 }, { "epoch": 0.42782875068113874, "grad_norm": 1.437659740447998, "learning_rate": 1.710641807310536e-05, "loss": 0.1443, "step": 29050 }, { "epoch": 0.42819693377122575, "grad_norm": 1.3156840801239014, "learning_rate": 1.7121145179817974e-05, "loss": 0.1504, "step": 29075 }, { "epoch": 0.42856511686131277, "grad_norm": 1.430440068244934, "learning_rate": 1.7135872286530592e-05, "loss": 0.1444, "step": 29100 }, { "epoch": 0.42893329995139984, "grad_norm": 1.129550814628601, "learning_rate": 1.7150599393243206e-05, "loss": 0.1445, "step": 29125 }, { "epoch": 0.42930148304148685, "grad_norm": 1.8005977869033813, "learning_rate": 1.716532649995582e-05, "loss": 0.1463, "step": 29150 }, { "epoch": 0.4296696661315739, "grad_norm": 1.3299059867858887, "learning_rate": 1.7180053606668436e-05, "loss": 0.1461, "step": 29175 }, { "epoch": 0.43003784922166094, "grad_norm": 1.46122407913208, "learning_rate": 1.719478071338105e-05, "loss": 0.143, "step": 29200 }, { "epoch": 0.430406032311748, "grad_norm": 1.190050721168518, "learning_rate": 1.7209507820093665e-05, "loss": 0.1341, "step": 29225 }, { "epoch": 0.430774215401835, "grad_norm": 1.300050973892212, "learning_rate": 1.722423492680628e-05, "loss": 0.1422, "step": 29250 }, { "epoch": 0.43114239849192204, "grad_norm": 1.2624523639678955, "learning_rate": 1.7238962033518894e-05, "loss": 0.1407, "step": 29275 }, { "epoch": 0.4315105815820091, "grad_norm": 1.954111099243164, "learning_rate": 1.7253689140231512e-05, "loss": 0.1437, "step": 29300 }, { "epoch": 0.4318787646720961, "grad_norm": 1.4629451036453247, "learning_rate": 1.7268416246944127e-05, "loss": 0.1362, "step": 29325 }, { "epoch": 0.4322469477621832, "grad_norm": 1.2829498052597046, "learning_rate": 1.728314335365674e-05, "loss": 0.1434, "step": 29350 }, { "epoch": 0.4326151308522702, "grad_norm": 1.4281407594680786, "learning_rate": 1.7297870460369356e-05, "loss": 0.1308, "step": 29375 }, { "epoch": 0.43298331394235723, "grad_norm": 1.4115138053894043, "learning_rate": 1.731259756708197e-05, "loss": 0.1375, "step": 29400 }, { "epoch": 0.4333514970324443, "grad_norm": 1.5216060876846313, "learning_rate": 1.732732467379459e-05, "loss": 0.1405, "step": 29425 }, { "epoch": 0.4337196801225313, "grad_norm": 1.1875633001327515, "learning_rate": 1.7342051780507204e-05, "loss": 0.1456, "step": 29450 }, { "epoch": 0.4340878632126184, "grad_norm": 1.9678187370300293, "learning_rate": 1.735677888721982e-05, "loss": 0.1366, "step": 29475 }, { "epoch": 0.4344560463027054, "grad_norm": 1.3012348413467407, "learning_rate": 1.7371505993932433e-05, "loss": 0.1445, "step": 29500 }, { "epoch": 0.43482422939279247, "grad_norm": 1.2349598407745361, "learning_rate": 1.7386233100645048e-05, "loss": 0.1376, "step": 29525 }, { "epoch": 0.4351924124828795, "grad_norm": 1.517460584640503, "learning_rate": 1.7400960207357666e-05, "loss": 0.1379, "step": 29550 }, { "epoch": 0.4355605955729665, "grad_norm": 1.4243059158325195, "learning_rate": 1.741568731407028e-05, "loss": 0.1337, "step": 29575 }, { "epoch": 0.4359287786630536, "grad_norm": 1.931024432182312, "learning_rate": 1.7430414420782895e-05, "loss": 0.1378, "step": 29600 }, { "epoch": 0.4362969617531406, "grad_norm": 1.5100048780441284, "learning_rate": 1.744514152749551e-05, "loss": 0.1337, "step": 29625 }, { "epoch": 0.43666514484322766, "grad_norm": 1.2116131782531738, "learning_rate": 1.7459868634208124e-05, "loss": 0.1435, "step": 29650 }, { "epoch": 0.4370333279333147, "grad_norm": 1.585527777671814, "learning_rate": 1.7474595740920742e-05, "loss": 0.1425, "step": 29675 }, { "epoch": 0.4374015110234017, "grad_norm": 1.3392324447631836, "learning_rate": 1.7489322847633354e-05, "loss": 0.1447, "step": 29700 }, { "epoch": 0.43776969411348876, "grad_norm": 1.6167775392532349, "learning_rate": 1.7504049954345968e-05, "loss": 0.1507, "step": 29725 }, { "epoch": 0.4381378772035758, "grad_norm": 1.5940417051315308, "learning_rate": 1.7518777061058586e-05, "loss": 0.1463, "step": 29750 }, { "epoch": 0.43850606029366285, "grad_norm": 1.433103084564209, "learning_rate": 1.75335041677712e-05, "loss": 0.1474, "step": 29775 }, { "epoch": 0.43887424338374986, "grad_norm": 1.6639951467514038, "learning_rate": 1.7548231274483816e-05, "loss": 0.1386, "step": 29800 }, { "epoch": 0.43924242647383693, "grad_norm": 1.7384814023971558, "learning_rate": 1.756295838119643e-05, "loss": 0.1409, "step": 29825 }, { "epoch": 0.43961060956392395, "grad_norm": 1.450372576713562, "learning_rate": 1.7577685487909045e-05, "loss": 0.1461, "step": 29850 }, { "epoch": 0.43997879265401096, "grad_norm": 1.1789109706878662, "learning_rate": 1.7592412594621663e-05, "loss": 0.1322, "step": 29875 }, { "epoch": 0.44034697574409803, "grad_norm": 1.5792953968048096, "learning_rate": 1.7607139701334278e-05, "loss": 0.1403, "step": 29900 }, { "epoch": 0.44071515883418505, "grad_norm": 1.680922031402588, "learning_rate": 1.7621866808046892e-05, "loss": 0.1394, "step": 29925 }, { "epoch": 0.4410833419242721, "grad_norm": 1.2849785089492798, "learning_rate": 1.7636593914759507e-05, "loss": 0.1407, "step": 29950 }, { "epoch": 0.44145152501435914, "grad_norm": 1.3818351030349731, "learning_rate": 1.765132102147212e-05, "loss": 0.1399, "step": 29975 }, { "epoch": 0.44181970810444615, "grad_norm": 1.699486255645752, "learning_rate": 1.766604812818474e-05, "loss": 0.1446, "step": 30000 }, { "epoch": 0.4421878911945332, "grad_norm": 1.5980112552642822, "learning_rate": 1.7680775234897354e-05, "loss": 0.1363, "step": 30025 }, { "epoch": 0.44255607428462024, "grad_norm": 1.4802896976470947, "learning_rate": 1.769550234160997e-05, "loss": 0.1386, "step": 30050 }, { "epoch": 0.4429242573747073, "grad_norm": 1.1918233633041382, "learning_rate": 1.7710229448322584e-05, "loss": 0.1421, "step": 30075 }, { "epoch": 0.4432924404647943, "grad_norm": 1.561808466911316, "learning_rate": 1.7724956555035198e-05, "loss": 0.1468, "step": 30100 }, { "epoch": 0.4436606235548814, "grad_norm": 1.0813902616500854, "learning_rate": 1.7739683661747816e-05, "loss": 0.1427, "step": 30125 }, { "epoch": 0.4440288066449684, "grad_norm": 1.9768576622009277, "learning_rate": 1.775441076846043e-05, "loss": 0.1409, "step": 30150 }, { "epoch": 0.4443969897350554, "grad_norm": 1.3829275369644165, "learning_rate": 1.7769137875173046e-05, "loss": 0.1305, "step": 30175 }, { "epoch": 0.4447651728251425, "grad_norm": 1.5168200731277466, "learning_rate": 1.778386498188566e-05, "loss": 0.1402, "step": 30200 }, { "epoch": 0.4451333559152295, "grad_norm": 1.5185489654541016, "learning_rate": 1.7798592088598275e-05, "loss": 0.1445, "step": 30225 }, { "epoch": 0.4455015390053166, "grad_norm": 1.5915287733078003, "learning_rate": 1.781331919531089e-05, "loss": 0.1434, "step": 30250 }, { "epoch": 0.4458697220954036, "grad_norm": 1.6269174814224243, "learning_rate": 1.7828046302023504e-05, "loss": 0.1421, "step": 30275 }, { "epoch": 0.4462379051854906, "grad_norm": 2.000884771347046, "learning_rate": 1.784277340873612e-05, "loss": 0.138, "step": 30300 }, { "epoch": 0.4466060882755777, "grad_norm": 1.268903374671936, "learning_rate": 1.7857500515448737e-05, "loss": 0.1373, "step": 30325 }, { "epoch": 0.4469742713656647, "grad_norm": 1.5966827869415283, "learning_rate": 1.787222762216135e-05, "loss": 0.134, "step": 30350 }, { "epoch": 0.44734245445575177, "grad_norm": 1.3796306848526, "learning_rate": 1.7886954728873966e-05, "loss": 0.132, "step": 30375 }, { "epoch": 0.4477106375458388, "grad_norm": 1.3174057006835938, "learning_rate": 1.790168183558658e-05, "loss": 0.1447, "step": 30400 }, { "epoch": 0.44807882063592586, "grad_norm": 1.365686058998108, "learning_rate": 1.7916408942299195e-05, "loss": 0.1416, "step": 30425 }, { "epoch": 0.44844700372601287, "grad_norm": 1.4339334964752197, "learning_rate": 1.7931136049011814e-05, "loss": 0.131, "step": 30450 }, { "epoch": 0.4488151868160999, "grad_norm": 1.3043586015701294, "learning_rate": 1.7945863155724428e-05, "loss": 0.139, "step": 30475 }, { "epoch": 0.44918336990618696, "grad_norm": 1.7352689504623413, "learning_rate": 1.7960590262437043e-05, "loss": 0.1429, "step": 30500 }, { "epoch": 0.449551552996274, "grad_norm": 1.3499093055725098, "learning_rate": 1.7975317369149657e-05, "loss": 0.1322, "step": 30525 }, { "epoch": 0.44991973608636104, "grad_norm": 1.353043794631958, "learning_rate": 1.7990044475862276e-05, "loss": 0.1352, "step": 30550 }, { "epoch": 0.45028791917644806, "grad_norm": 1.6418284177780151, "learning_rate": 1.800477158257489e-05, "loss": 0.1377, "step": 30575 }, { "epoch": 0.45065610226653513, "grad_norm": 1.2903937101364136, "learning_rate": 1.8019498689287505e-05, "loss": 0.137, "step": 30600 }, { "epoch": 0.45102428535662215, "grad_norm": 1.2895569801330566, "learning_rate": 1.803422579600012e-05, "loss": 0.1354, "step": 30625 }, { "epoch": 0.45139246844670916, "grad_norm": 1.5892612934112549, "learning_rate": 1.8048952902712734e-05, "loss": 0.1364, "step": 30650 }, { "epoch": 0.45176065153679623, "grad_norm": 1.3265799283981323, "learning_rate": 1.8063680009425352e-05, "loss": 0.1387, "step": 30675 }, { "epoch": 0.45212883462688325, "grad_norm": 1.2973653078079224, "learning_rate": 1.8078407116137967e-05, "loss": 0.1317, "step": 30700 }, { "epoch": 0.4524970177169703, "grad_norm": 1.486402988433838, "learning_rate": 1.809313422285058e-05, "loss": 0.1358, "step": 30725 }, { "epoch": 0.45286520080705733, "grad_norm": 1.2491552829742432, "learning_rate": 1.8107861329563193e-05, "loss": 0.1383, "step": 30750 }, { "epoch": 0.45323338389714435, "grad_norm": 1.4163362979888916, "learning_rate": 1.812258843627581e-05, "loss": 0.1325, "step": 30775 }, { "epoch": 0.4536015669872314, "grad_norm": 1.8074849843978882, "learning_rate": 1.8137315542988425e-05, "loss": 0.1387, "step": 30800 }, { "epoch": 0.45396975007731843, "grad_norm": 1.5657782554626465, "learning_rate": 1.815204264970104e-05, "loss": 0.1315, "step": 30825 }, { "epoch": 0.4543379331674055, "grad_norm": 1.5195591449737549, "learning_rate": 1.8166769756413655e-05, "loss": 0.1306, "step": 30850 }, { "epoch": 0.4547061162574925, "grad_norm": 1.4572436809539795, "learning_rate": 1.8181496863126273e-05, "loss": 0.1458, "step": 30875 }, { "epoch": 0.4550742993475796, "grad_norm": 1.7552560567855835, "learning_rate": 1.819563488557038e-05, "loss": 0.1451, "step": 30900 }, { "epoch": 0.4554424824376666, "grad_norm": 1.2025951147079468, "learning_rate": 1.8210361992282998e-05, "loss": 0.1422, "step": 30925 }, { "epoch": 0.4558106655277536, "grad_norm": 1.4719462394714355, "learning_rate": 1.8225089098995613e-05, "loss": 0.1476, "step": 30950 }, { "epoch": 0.4561788486178407, "grad_norm": 1.5610344409942627, "learning_rate": 1.8239816205708228e-05, "loss": 0.1325, "step": 30975 }, { "epoch": 0.4565470317079277, "grad_norm": 1.1036796569824219, "learning_rate": 1.8254543312420842e-05, "loss": 0.1328, "step": 31000 }, { "epoch": 0.4569152147980148, "grad_norm": 1.2756305932998657, "learning_rate": 1.8269270419133457e-05, "loss": 0.1353, "step": 31025 }, { "epoch": 0.4572833978881018, "grad_norm": 1.3201026916503906, "learning_rate": 1.8283997525846075e-05, "loss": 0.1375, "step": 31050 }, { "epoch": 0.4576515809781888, "grad_norm": 1.4888620376586914, "learning_rate": 1.829872463255869e-05, "loss": 0.1423, "step": 31075 }, { "epoch": 0.4580197640682759, "grad_norm": 1.075620174407959, "learning_rate": 1.8313451739271304e-05, "loss": 0.1311, "step": 31100 }, { "epoch": 0.4583879471583629, "grad_norm": 1.1998538970947266, "learning_rate": 1.832817884598392e-05, "loss": 0.1335, "step": 31125 }, { "epoch": 0.45875613024844997, "grad_norm": 1.4449012279510498, "learning_rate": 1.8342905952696534e-05, "loss": 0.1344, "step": 31150 }, { "epoch": 0.459124313338537, "grad_norm": 1.598140835762024, "learning_rate": 1.835763305940915e-05, "loss": 0.1299, "step": 31175 }, { "epoch": 0.45949249642862405, "grad_norm": 1.2417925596237183, "learning_rate": 1.8372360166121766e-05, "loss": 0.1306, "step": 31200 }, { "epoch": 0.45986067951871107, "grad_norm": 1.6606899499893188, "learning_rate": 1.838708727283438e-05, "loss": 0.1334, "step": 31225 }, { "epoch": 0.4602288626087981, "grad_norm": 1.806178331375122, "learning_rate": 1.8401814379546996e-05, "loss": 0.1347, "step": 31250 }, { "epoch": 0.46059704569888515, "grad_norm": 1.320526361465454, "learning_rate": 1.841654148625961e-05, "loss": 0.1263, "step": 31275 }, { "epoch": 0.46096522878897217, "grad_norm": 1.7611198425292969, "learning_rate": 1.8431268592972228e-05, "loss": 0.1317, "step": 31300 }, { "epoch": 0.46133341187905924, "grad_norm": 1.796420693397522, "learning_rate": 1.8445995699684843e-05, "loss": 0.1355, "step": 31325 }, { "epoch": 0.46170159496914626, "grad_norm": 1.1251814365386963, "learning_rate": 1.8460722806397458e-05, "loss": 0.1225, "step": 31350 }, { "epoch": 0.46206977805923327, "grad_norm": 1.2082396745681763, "learning_rate": 1.8475449913110072e-05, "loss": 0.1419, "step": 31375 }, { "epoch": 0.46243796114932034, "grad_norm": 1.1356067657470703, "learning_rate": 1.8490177019822687e-05, "loss": 0.1277, "step": 31400 }, { "epoch": 0.46280614423940736, "grad_norm": 1.736794114112854, "learning_rate": 1.85049041265353e-05, "loss": 0.1285, "step": 31425 }, { "epoch": 0.46317432732949443, "grad_norm": 1.3372302055358887, "learning_rate": 1.8519631233247916e-05, "loss": 0.1392, "step": 31450 }, { "epoch": 0.46354251041958144, "grad_norm": 1.379860281944275, "learning_rate": 1.853435833996053e-05, "loss": 0.1362, "step": 31475 }, { "epoch": 0.4639106935096685, "grad_norm": 1.103456974029541, "learning_rate": 1.854908544667315e-05, "loss": 0.1299, "step": 31500 }, { "epoch": 0.46427887659975553, "grad_norm": 1.44719660282135, "learning_rate": 1.8563812553385764e-05, "loss": 0.1327, "step": 31525 }, { "epoch": 0.46464705968984255, "grad_norm": 1.378851294517517, "learning_rate": 1.8578539660098378e-05, "loss": 0.1323, "step": 31550 }, { "epoch": 0.4650152427799296, "grad_norm": 1.592476725578308, "learning_rate": 1.8593266766810993e-05, "loss": 0.1354, "step": 31575 }, { "epoch": 0.46538342587001663, "grad_norm": 1.4160147905349731, "learning_rate": 1.8607993873523607e-05, "loss": 0.1398, "step": 31600 }, { "epoch": 0.4657516089601037, "grad_norm": 1.1927225589752197, "learning_rate": 1.8622720980236226e-05, "loss": 0.1355, "step": 31625 }, { "epoch": 0.4661197920501907, "grad_norm": 1.4182437658309937, "learning_rate": 1.863744808694884e-05, "loss": 0.1315, "step": 31650 }, { "epoch": 0.46648797514027773, "grad_norm": 1.1829264163970947, "learning_rate": 1.8652175193661455e-05, "loss": 0.128, "step": 31675 }, { "epoch": 0.4668561582303648, "grad_norm": 1.4739160537719727, "learning_rate": 1.866690230037407e-05, "loss": 0.1281, "step": 31700 }, { "epoch": 0.4672243413204518, "grad_norm": 1.5583876371383667, "learning_rate": 1.8681629407086684e-05, "loss": 0.129, "step": 31725 }, { "epoch": 0.4675925244105389, "grad_norm": 1.4014379978179932, "learning_rate": 1.8696356513799302e-05, "loss": 0.135, "step": 31750 }, { "epoch": 0.4679607075006259, "grad_norm": 1.5254454612731934, "learning_rate": 1.8711083620511917e-05, "loss": 0.1316, "step": 31775 }, { "epoch": 0.468328890590713, "grad_norm": 1.4662139415740967, "learning_rate": 1.872581072722453e-05, "loss": 0.1335, "step": 31800 }, { "epoch": 0.4686970736808, "grad_norm": 1.3167014122009277, "learning_rate": 1.8740537833937146e-05, "loss": 0.1356, "step": 31825 }, { "epoch": 0.469065256770887, "grad_norm": 1.8181610107421875, "learning_rate": 1.875526494064976e-05, "loss": 0.137, "step": 31850 }, { "epoch": 0.4694334398609741, "grad_norm": 2.111936569213867, "learning_rate": 1.876999204736238e-05, "loss": 0.1347, "step": 31875 }, { "epoch": 0.4698016229510611, "grad_norm": 1.3071465492248535, "learning_rate": 1.8784719154074993e-05, "loss": 0.1395, "step": 31900 }, { "epoch": 0.47016980604114816, "grad_norm": 1.4029223918914795, "learning_rate": 1.8799446260787608e-05, "loss": 0.1362, "step": 31925 }, { "epoch": 0.4705379891312352, "grad_norm": 1.134627342224121, "learning_rate": 1.8814173367500223e-05, "loss": 0.1413, "step": 31950 }, { "epoch": 0.4709061722213222, "grad_norm": 1.1709901094436646, "learning_rate": 1.8828900474212837e-05, "loss": 0.1465, "step": 31975 }, { "epoch": 0.47127435531140927, "grad_norm": 1.2714532613754272, "learning_rate": 1.8843627580925452e-05, "loss": 0.1293, "step": 32000 }, { "epoch": 0.4716425384014963, "grad_norm": 1.1055426597595215, "learning_rate": 1.8858354687638067e-05, "loss": 0.1339, "step": 32025 }, { "epoch": 0.47201072149158335, "grad_norm": 1.716193437576294, "learning_rate": 1.887308179435068e-05, "loss": 0.1314, "step": 32050 }, { "epoch": 0.47237890458167037, "grad_norm": 1.7784597873687744, "learning_rate": 1.88878089010633e-05, "loss": 0.1343, "step": 32075 }, { "epoch": 0.47274708767175744, "grad_norm": 1.379332184791565, "learning_rate": 1.8902536007775914e-05, "loss": 0.13, "step": 32100 }, { "epoch": 0.47311527076184445, "grad_norm": 1.1867659091949463, "learning_rate": 1.891726311448853e-05, "loss": 0.1302, "step": 32125 }, { "epoch": 0.47348345385193147, "grad_norm": 1.303497314453125, "learning_rate": 1.8931990221201143e-05, "loss": 0.1169, "step": 32150 }, { "epoch": 0.47385163694201854, "grad_norm": 1.1655243635177612, "learning_rate": 1.8946717327913758e-05, "loss": 0.1317, "step": 32175 }, { "epoch": 0.47421982003210555, "grad_norm": 1.1606842279434204, "learning_rate": 1.8961444434626376e-05, "loss": 0.1255, "step": 32200 }, { "epoch": 0.4745880031221926, "grad_norm": 1.7602474689483643, "learning_rate": 1.897617154133899e-05, "loss": 0.1335, "step": 32225 }, { "epoch": 0.47495618621227964, "grad_norm": 1.8369065523147583, "learning_rate": 1.8990898648051605e-05, "loss": 0.1306, "step": 32250 }, { "epoch": 0.47532436930236666, "grad_norm": 1.317317247390747, "learning_rate": 1.900562575476422e-05, "loss": 0.123, "step": 32275 }, { "epoch": 0.4756925523924537, "grad_norm": 1.2774357795715332, "learning_rate": 1.9020352861476835e-05, "loss": 0.1307, "step": 32300 }, { "epoch": 0.47606073548254074, "grad_norm": 1.5253177881240845, "learning_rate": 1.9035079968189453e-05, "loss": 0.1326, "step": 32325 }, { "epoch": 0.4764289185726278, "grad_norm": 1.3377888202667236, "learning_rate": 1.9049807074902067e-05, "loss": 0.1253, "step": 32350 }, { "epoch": 0.47679710166271483, "grad_norm": 1.155227541923523, "learning_rate": 1.9064534181614682e-05, "loss": 0.1298, "step": 32375 }, { "epoch": 0.4771652847528019, "grad_norm": 1.1564701795578003, "learning_rate": 1.9079261288327297e-05, "loss": 0.1376, "step": 32400 }, { "epoch": 0.4775334678428889, "grad_norm": 1.2230069637298584, "learning_rate": 1.909398839503991e-05, "loss": 0.136, "step": 32425 }, { "epoch": 0.47790165093297593, "grad_norm": 1.498449444770813, "learning_rate": 1.910871550175253e-05, "loss": 0.124, "step": 32450 }, { "epoch": 0.478269834023063, "grad_norm": 1.4574726819992065, "learning_rate": 1.912344260846514e-05, "loss": 0.1262, "step": 32475 }, { "epoch": 0.47863801711315, "grad_norm": 1.7008484601974487, "learning_rate": 1.9138169715177755e-05, "loss": 0.1342, "step": 32500 }, { "epoch": 0.4790062002032371, "grad_norm": 1.3751330375671387, "learning_rate": 1.9152896821890373e-05, "loss": 0.1321, "step": 32525 }, { "epoch": 0.4793743832933241, "grad_norm": 1.2620458602905273, "learning_rate": 1.9167623928602988e-05, "loss": 0.128, "step": 32550 }, { "epoch": 0.4797425663834111, "grad_norm": 1.7352402210235596, "learning_rate": 1.9182351035315603e-05, "loss": 0.1294, "step": 32575 }, { "epoch": 0.4801107494734982, "grad_norm": 1.4355820417404175, "learning_rate": 1.9197078142028217e-05, "loss": 0.1292, "step": 32600 }, { "epoch": 0.4804789325635852, "grad_norm": 1.8195645809173584, "learning_rate": 1.9211805248740832e-05, "loss": 0.1385, "step": 32625 }, { "epoch": 0.4808471156536723, "grad_norm": 1.464348554611206, "learning_rate": 1.922653235545345e-05, "loss": 0.1327, "step": 32650 }, { "epoch": 0.4812152987437593, "grad_norm": 1.9561387300491333, "learning_rate": 1.9241259462166065e-05, "loss": 0.1349, "step": 32675 }, { "epoch": 0.48158348183384636, "grad_norm": 1.7696665525436401, "learning_rate": 1.925598656887868e-05, "loss": 0.143, "step": 32700 }, { "epoch": 0.4819516649239334, "grad_norm": 1.2237669229507446, "learning_rate": 1.9270713675591294e-05, "loss": 0.1333, "step": 32725 }, { "epoch": 0.4823198480140204, "grad_norm": 1.3987947702407837, "learning_rate": 1.928544078230391e-05, "loss": 0.1387, "step": 32750 }, { "epoch": 0.48268803110410746, "grad_norm": 1.272199273109436, "learning_rate": 1.9300167889016527e-05, "loss": 0.1315, "step": 32775 }, { "epoch": 0.4830562141941945, "grad_norm": 1.6304975748062134, "learning_rate": 1.931489499572914e-05, "loss": 0.1261, "step": 32800 }, { "epoch": 0.48342439728428155, "grad_norm": 1.2250480651855469, "learning_rate": 1.9329622102441756e-05, "loss": 0.1343, "step": 32825 }, { "epoch": 0.48379258037436856, "grad_norm": 1.7616349458694458, "learning_rate": 1.934434920915437e-05, "loss": 0.1247, "step": 32850 }, { "epoch": 0.4841607634644556, "grad_norm": 1.8865598440170288, "learning_rate": 1.935907631586699e-05, "loss": 0.1284, "step": 32875 }, { "epoch": 0.48452894655454265, "grad_norm": 1.4966330528259277, "learning_rate": 1.9373214338311096e-05, "loss": 0.1444, "step": 32900 }, { "epoch": 0.48489712964462967, "grad_norm": 1.5111415386199951, "learning_rate": 1.938794144502371e-05, "loss": 0.128, "step": 32925 }, { "epoch": 0.48526531273471674, "grad_norm": 1.2859002351760864, "learning_rate": 1.940266855173633e-05, "loss": 0.1303, "step": 32950 }, { "epoch": 0.48563349582480375, "grad_norm": 1.6504935026168823, "learning_rate": 1.9417395658448943e-05, "loss": 0.1244, "step": 32975 }, { "epoch": 0.4860016789148908, "grad_norm": 1.4879761934280396, "learning_rate": 1.9432122765161558e-05, "loss": 0.135, "step": 33000 }, { "epoch": 0.48636986200497784, "grad_norm": 1.4184648990631104, "learning_rate": 1.9446849871874173e-05, "loss": 0.1276, "step": 33025 }, { "epoch": 0.48673804509506485, "grad_norm": 1.7261390686035156, "learning_rate": 1.946157697858679e-05, "loss": 0.1285, "step": 33050 }, { "epoch": 0.4871062281851519, "grad_norm": 1.2350839376449585, "learning_rate": 1.9476304085299405e-05, "loss": 0.129, "step": 33075 }, { "epoch": 0.48747441127523894, "grad_norm": 1.2649427652359009, "learning_rate": 1.949103119201202e-05, "loss": 0.1131, "step": 33100 }, { "epoch": 0.487842594365326, "grad_norm": 1.2869795560836792, "learning_rate": 1.9505758298724635e-05, "loss": 0.1276, "step": 33125 }, { "epoch": 0.488210777455413, "grad_norm": 1.3838227987289429, "learning_rate": 1.952048540543725e-05, "loss": 0.123, "step": 33150 }, { "epoch": 0.48857896054550004, "grad_norm": 1.4343010187149048, "learning_rate": 1.9535212512149864e-05, "loss": 0.1311, "step": 33175 }, { "epoch": 0.4889471436355871, "grad_norm": 1.1730332374572754, "learning_rate": 1.954993961886248e-05, "loss": 0.1315, "step": 33200 }, { "epoch": 0.4893153267256741, "grad_norm": 1.2627941370010376, "learning_rate": 1.9564666725575093e-05, "loss": 0.1333, "step": 33225 }, { "epoch": 0.4896835098157612, "grad_norm": 1.5235569477081299, "learning_rate": 1.9579393832287708e-05, "loss": 0.1262, "step": 33250 }, { "epoch": 0.4900516929058482, "grad_norm": 1.6173646450042725, "learning_rate": 1.9594120939000326e-05, "loss": 0.1284, "step": 33275 }, { "epoch": 0.4904198759959353, "grad_norm": 2.243269205093384, "learning_rate": 1.960884804571294e-05, "loss": 0.1232, "step": 33300 }, { "epoch": 0.4907880590860223, "grad_norm": 1.573286771774292, "learning_rate": 1.9623575152425555e-05, "loss": 0.1235, "step": 33325 }, { "epoch": 0.4911562421761093, "grad_norm": 1.7263425588607788, "learning_rate": 1.963830225913817e-05, "loss": 0.1243, "step": 33350 }, { "epoch": 0.4915244252661964, "grad_norm": 1.7113111019134521, "learning_rate": 1.9653029365850788e-05, "loss": 0.134, "step": 33375 }, { "epoch": 0.4918926083562834, "grad_norm": 1.3466795682907104, "learning_rate": 1.9667756472563403e-05, "loss": 0.1298, "step": 33400 }, { "epoch": 0.49226079144637047, "grad_norm": 1.4281026124954224, "learning_rate": 1.9682483579276017e-05, "loss": 0.119, "step": 33425 }, { "epoch": 0.4926289745364575, "grad_norm": 1.7018014192581177, "learning_rate": 1.9697210685988632e-05, "loss": 0.1324, "step": 33450 }, { "epoch": 0.4929971576265445, "grad_norm": 1.7589343786239624, "learning_rate": 1.9711937792701247e-05, "loss": 0.128, "step": 33475 }, { "epoch": 0.4933653407166316, "grad_norm": 2.133329153060913, "learning_rate": 1.9726664899413865e-05, "loss": 0.125, "step": 33500 }, { "epoch": 0.4937335238067186, "grad_norm": 1.4540302753448486, "learning_rate": 1.974139200612648e-05, "loss": 0.127, "step": 33525 }, { "epoch": 0.49410170689680566, "grad_norm": 1.419734239578247, "learning_rate": 1.9756119112839094e-05, "loss": 0.1274, "step": 33550 }, { "epoch": 0.4944698899868927, "grad_norm": 1.654998779296875, "learning_rate": 1.977084621955171e-05, "loss": 0.1298, "step": 33575 }, { "epoch": 0.49483807307697975, "grad_norm": 1.9749138355255127, "learning_rate": 1.9785573326264323e-05, "loss": 0.1231, "step": 33600 }, { "epoch": 0.49520625616706676, "grad_norm": 1.4517003297805786, "learning_rate": 1.980030043297694e-05, "loss": 0.1294, "step": 33625 }, { "epoch": 0.4955744392571538, "grad_norm": 1.6205573081970215, "learning_rate": 1.9815027539689556e-05, "loss": 0.1378, "step": 33650 }, { "epoch": 0.49594262234724085, "grad_norm": 1.3572132587432861, "learning_rate": 1.9829754646402167e-05, "loss": 0.132, "step": 33675 }, { "epoch": 0.49631080543732786, "grad_norm": 1.6994099617004395, "learning_rate": 1.9844481753114785e-05, "loss": 0.1294, "step": 33700 }, { "epoch": 0.49667898852741493, "grad_norm": 1.5217084884643555, "learning_rate": 1.98592088598274e-05, "loss": 0.1239, "step": 33725 }, { "epoch": 0.49704717161750195, "grad_norm": 1.607015609741211, "learning_rate": 1.9873935966540015e-05, "loss": 0.1259, "step": 33750 }, { "epoch": 0.49741535470758896, "grad_norm": 1.6565525531768799, "learning_rate": 1.988866307325263e-05, "loss": 0.121, "step": 33775 }, { "epoch": 0.49778353779767603, "grad_norm": 1.2221335172653198, "learning_rate": 1.9903390179965244e-05, "loss": 0.1261, "step": 33800 }, { "epoch": 0.49815172088776305, "grad_norm": 1.582075595855713, "learning_rate": 1.9918117286677862e-05, "loss": 0.1249, "step": 33825 }, { "epoch": 0.4985199039778501, "grad_norm": 1.5876294374465942, "learning_rate": 1.9932844393390477e-05, "loss": 0.125, "step": 33850 }, { "epoch": 0.49888808706793714, "grad_norm": 1.7459077835083008, "learning_rate": 1.994757150010309e-05, "loss": 0.1276, "step": 33875 }, { "epoch": 0.4992562701580242, "grad_norm": 1.3217989206314087, "learning_rate": 1.9962298606815706e-05, "loss": 0.1277, "step": 33900 }, { "epoch": 0.4996244532481112, "grad_norm": 1.583194375038147, "learning_rate": 1.997702571352832e-05, "loss": 0.1263, "step": 33925 }, { "epoch": 0.49999263633819824, "grad_norm": 1.1127156019210815, "learning_rate": 1.999175282024094e-05, "loss": 0.1203, "step": 33950 }, { "epoch": 0.5003608194282853, "grad_norm": 1.244364619255066, "learning_rate": 1.9999279996334528e-05, "loss": 0.1366, "step": 33975 }, { "epoch": 0.5007290025183724, "grad_norm": 1.5604734420776367, "learning_rate": 1.9997643624367546e-05, "loss": 0.1304, "step": 34000 }, { "epoch": 0.5010971856084594, "grad_norm": 1.4373990297317505, "learning_rate": 1.999600725240056e-05, "loss": 0.1344, "step": 34025 }, { "epoch": 0.5014653686985464, "grad_norm": 1.628875970840454, "learning_rate": 1.9994370880433575e-05, "loss": 0.1258, "step": 34050 }, { "epoch": 0.5018335517886334, "grad_norm": 1.3895543813705444, "learning_rate": 1.999273450846659e-05, "loss": 0.1268, "step": 34075 }, { "epoch": 0.5022017348787204, "grad_norm": 1.3760876655578613, "learning_rate": 1.9991098136499604e-05, "loss": 0.1273, "step": 34100 }, { "epoch": 0.5025699179688076, "grad_norm": 1.3263821601867676, "learning_rate": 1.998946176453262e-05, "loss": 0.1279, "step": 34125 }, { "epoch": 0.5029381010588946, "grad_norm": 1.43476402759552, "learning_rate": 1.9987825392565636e-05, "loss": 0.1205, "step": 34150 }, { "epoch": 0.5033062841489816, "grad_norm": 1.4423213005065918, "learning_rate": 1.998618902059865e-05, "loss": 0.122, "step": 34175 }, { "epoch": 0.5036744672390686, "grad_norm": 1.5728139877319336, "learning_rate": 1.9984552648631668e-05, "loss": 0.1245, "step": 34200 }, { "epoch": 0.5040426503291556, "grad_norm": 1.6493754386901855, "learning_rate": 1.9982916276664683e-05, "loss": 0.1212, "step": 34225 }, { "epoch": 0.5044108334192428, "grad_norm": 1.5477712154388428, "learning_rate": 1.99812799046977e-05, "loss": 0.124, "step": 34250 }, { "epoch": 0.5047790165093298, "grad_norm": 1.4639281034469604, "learning_rate": 1.9979643532730715e-05, "loss": 0.1305, "step": 34275 }, { "epoch": 0.5051471995994168, "grad_norm": 1.7540819644927979, "learning_rate": 1.997800716076373e-05, "loss": 0.1218, "step": 34300 }, { "epoch": 0.5055153826895038, "grad_norm": 1.3233895301818848, "learning_rate": 1.9976370788796744e-05, "loss": 0.1277, "step": 34325 }, { "epoch": 0.5058835657795909, "grad_norm": 1.4739441871643066, "learning_rate": 1.997473441682976e-05, "loss": 0.1324, "step": 34350 }, { "epoch": 0.5062517488696779, "grad_norm": 1.3000911474227905, "learning_rate": 1.9973098044862776e-05, "loss": 0.1283, "step": 34375 }, { "epoch": 0.506619931959765, "grad_norm": 1.398351788520813, "learning_rate": 1.997146167289579e-05, "loss": 0.1205, "step": 34400 }, { "epoch": 0.506988115049852, "grad_norm": 1.6885526180267334, "learning_rate": 1.9969825300928805e-05, "loss": 0.1243, "step": 34425 }, { "epoch": 0.507356298139939, "grad_norm": 1.6618565320968628, "learning_rate": 1.9968188928961823e-05, "loss": 0.1235, "step": 34450 }, { "epoch": 0.5077244812300261, "grad_norm": 1.8180230855941772, "learning_rate": 1.9966552556994837e-05, "loss": 0.13, "step": 34475 }, { "epoch": 0.5080926643201131, "grad_norm": 1.3617326021194458, "learning_rate": 1.9964916185027852e-05, "loss": 0.1236, "step": 34500 }, { "epoch": 0.5084608474102001, "grad_norm": 1.6490955352783203, "learning_rate": 1.9963279813060866e-05, "loss": 0.1202, "step": 34525 }, { "epoch": 0.5088290305002872, "grad_norm": 1.1779961585998535, "learning_rate": 1.9961643441093884e-05, "loss": 0.1271, "step": 34550 }, { "epoch": 0.5091972135903742, "grad_norm": 1.934651494026184, "learning_rate": 1.99600070691269e-05, "loss": 0.121, "step": 34575 }, { "epoch": 0.5095653966804613, "grad_norm": 1.6155422925949097, "learning_rate": 1.9958370697159913e-05, "loss": 0.1213, "step": 34600 }, { "epoch": 0.5099335797705483, "grad_norm": 1.213598370552063, "learning_rate": 1.995673432519293e-05, "loss": 0.123, "step": 34625 }, { "epoch": 0.5103017628606353, "grad_norm": 1.6968715190887451, "learning_rate": 1.9955097953225945e-05, "loss": 0.1187, "step": 34650 }, { "epoch": 0.5106699459507223, "grad_norm": 1.2820351123809814, "learning_rate": 1.995346158125896e-05, "loss": 0.118, "step": 34675 }, { "epoch": 0.5110381290408094, "grad_norm": 1.6911979913711548, "learning_rate": 1.9951825209291978e-05, "loss": 0.1292, "step": 34700 }, { "epoch": 0.5114063121308965, "grad_norm": 1.2066638469696045, "learning_rate": 1.9950188837324992e-05, "loss": 0.1208, "step": 34725 }, { "epoch": 0.5117744952209835, "grad_norm": 2.2320609092712402, "learning_rate": 1.9948552465358007e-05, "loss": 0.1215, "step": 34750 }, { "epoch": 0.5121426783110705, "grad_norm": 1.6644694805145264, "learning_rate": 1.994691609339102e-05, "loss": 0.1247, "step": 34775 }, { "epoch": 0.5125108614011575, "grad_norm": 1.2053136825561523, "learning_rate": 1.994527972142404e-05, "loss": 0.1217, "step": 34800 }, { "epoch": 0.5128790444912446, "grad_norm": 1.4280292987823486, "learning_rate": 1.9943643349457053e-05, "loss": 0.1215, "step": 34825 }, { "epoch": 0.5132472275813317, "grad_norm": 1.1937901973724365, "learning_rate": 1.9942006977490068e-05, "loss": 0.1311, "step": 34850 }, { "epoch": 0.5136154106714187, "grad_norm": 1.4878497123718262, "learning_rate": 1.9940370605523086e-05, "loss": 0.1215, "step": 34875 }, { "epoch": 0.5139835937615057, "grad_norm": 1.559525966644287, "learning_rate": 1.99387342335561e-05, "loss": 0.1175, "step": 34900 }, { "epoch": 0.5143517768515927, "grad_norm": 1.5552871227264404, "learning_rate": 1.9937097861589115e-05, "loss": 0.1291, "step": 34925 }, { "epoch": 0.5147199599416798, "grad_norm": 1.4817222356796265, "learning_rate": 1.993546148962213e-05, "loss": 0.1179, "step": 34950 }, { "epoch": 0.5150881430317669, "grad_norm": 1.1439144611358643, "learning_rate": 1.9933890572533826e-05, "loss": 0.1237, "step": 34975 }, { "epoch": 0.5154563261218539, "grad_norm": 1.3597592115402222, "learning_rate": 1.993225420056684e-05, "loss": 0.1226, "step": 35000 }, { "epoch": 0.5158245092119409, "grad_norm": 1.6552609205245972, "learning_rate": 1.9930617828599855e-05, "loss": 0.1275, "step": 35025 }, { "epoch": 0.5161926923020279, "grad_norm": 1.4046581983566284, "learning_rate": 1.9928981456632873e-05, "loss": 0.1303, "step": 35050 }, { "epoch": 0.516560875392115, "grad_norm": 1.3805830478668213, "learning_rate": 1.9927345084665888e-05, "loss": 0.1208, "step": 35075 }, { "epoch": 0.516929058482202, "grad_norm": 1.6304214000701904, "learning_rate": 1.9925708712698902e-05, "loss": 0.1273, "step": 35100 }, { "epoch": 0.5172972415722891, "grad_norm": 1.558475136756897, "learning_rate": 1.9924072340731917e-05, "loss": 0.1203, "step": 35125 }, { "epoch": 0.5176654246623761, "grad_norm": 2.1353535652160645, "learning_rate": 1.9922435968764934e-05, "loss": 0.1229, "step": 35150 }, { "epoch": 0.5180336077524631, "grad_norm": 1.4988514184951782, "learning_rate": 1.992079959679795e-05, "loss": 0.1369, "step": 35175 }, { "epoch": 0.5184017908425502, "grad_norm": 1.7591900825500488, "learning_rate": 1.9919163224830963e-05, "loss": 0.1262, "step": 35200 }, { "epoch": 0.5187699739326372, "grad_norm": 1.1921674013137817, "learning_rate": 1.991752685286398e-05, "loss": 0.1204, "step": 35225 }, { "epoch": 0.5191381570227243, "grad_norm": 1.6571495532989502, "learning_rate": 1.9915890480896996e-05, "loss": 0.1244, "step": 35250 }, { "epoch": 0.5195063401128113, "grad_norm": 1.442743182182312, "learning_rate": 1.991425410893001e-05, "loss": 0.1264, "step": 35275 }, { "epoch": 0.5198745232028983, "grad_norm": 1.2768187522888184, "learning_rate": 1.9912617736963028e-05, "loss": 0.1311, "step": 35300 }, { "epoch": 0.5202427062929854, "grad_norm": 1.5320920944213867, "learning_rate": 1.9910981364996042e-05, "loss": 0.1273, "step": 35325 }, { "epoch": 0.5206108893830724, "grad_norm": 1.7867873907089233, "learning_rate": 1.9909344993029057e-05, "loss": 0.1244, "step": 35350 }, { "epoch": 0.5209790724731594, "grad_norm": 2.1170566082000732, "learning_rate": 1.990770862106207e-05, "loss": 0.1271, "step": 35375 }, { "epoch": 0.5213472555632465, "grad_norm": 1.2551827430725098, "learning_rate": 1.990607224909509e-05, "loss": 0.1083, "step": 35400 }, { "epoch": 0.5217154386533335, "grad_norm": 1.4103198051452637, "learning_rate": 1.9904435877128104e-05, "loss": 0.1198, "step": 35425 }, { "epoch": 0.5220836217434206, "grad_norm": 1.889062762260437, "learning_rate": 1.9902799505161118e-05, "loss": 0.1156, "step": 35450 }, { "epoch": 0.5224518048335076, "grad_norm": 1.1291379928588867, "learning_rate": 1.9901163133194136e-05, "loss": 0.1194, "step": 35475 }, { "epoch": 0.5228199879235946, "grad_norm": 1.7137246131896973, "learning_rate": 1.989952676122715e-05, "loss": 0.1299, "step": 35500 }, { "epoch": 0.5231881710136816, "grad_norm": 1.4984245300292969, "learning_rate": 1.9897890389260165e-05, "loss": 0.126, "step": 35525 }, { "epoch": 0.5235563541037688, "grad_norm": 1.8383368253707886, "learning_rate": 1.989625401729318e-05, "loss": 0.1202, "step": 35550 }, { "epoch": 0.5239245371938558, "grad_norm": 1.216751217842102, "learning_rate": 1.9894617645326194e-05, "loss": 0.1217, "step": 35575 }, { "epoch": 0.5242927202839428, "grad_norm": 1.2445589303970337, "learning_rate": 1.989298127335921e-05, "loss": 0.1271, "step": 35600 }, { "epoch": 0.5246609033740298, "grad_norm": 1.3670048713684082, "learning_rate": 1.9891344901392226e-05, "loss": 0.123, "step": 35625 }, { "epoch": 0.5250290864641168, "grad_norm": 1.6851133108139038, "learning_rate": 1.9889708529425244e-05, "loss": 0.1248, "step": 35650 }, { "epoch": 0.525397269554204, "grad_norm": 1.3160358667373657, "learning_rate": 1.9888072157458258e-05, "loss": 0.1241, "step": 35675 }, { "epoch": 0.525765452644291, "grad_norm": 1.607149362564087, "learning_rate": 1.9886435785491273e-05, "loss": 0.1278, "step": 35700 }, { "epoch": 0.526133635734378, "grad_norm": 1.6950891017913818, "learning_rate": 1.988479941352429e-05, "loss": 0.1233, "step": 35725 }, { "epoch": 0.526501818824465, "grad_norm": 1.227681040763855, "learning_rate": 1.9883163041557305e-05, "loss": 0.1216, "step": 35750 }, { "epoch": 0.526870001914552, "grad_norm": 1.3604323863983154, "learning_rate": 1.988152666959032e-05, "loss": 0.1126, "step": 35775 }, { "epoch": 0.5272381850046391, "grad_norm": 1.1941028833389282, "learning_rate": 1.9879890297623334e-05, "loss": 0.1206, "step": 35800 }, { "epoch": 0.5276063680947262, "grad_norm": 1.280265212059021, "learning_rate": 1.987825392565635e-05, "loss": 0.1251, "step": 35825 }, { "epoch": 0.5279745511848132, "grad_norm": 1.2790465354919434, "learning_rate": 1.9876617553689366e-05, "loss": 0.1224, "step": 35850 }, { "epoch": 0.5283427342749002, "grad_norm": 1.8109391927719116, "learning_rate": 1.987498118172238e-05, "loss": 0.1248, "step": 35875 }, { "epoch": 0.5287109173649872, "grad_norm": 1.5231170654296875, "learning_rate": 1.98733448097554e-05, "loss": 0.1223, "step": 35900 }, { "epoch": 0.5290791004550743, "grad_norm": 1.469890832901001, "learning_rate": 1.9871708437788413e-05, "loss": 0.1143, "step": 35925 }, { "epoch": 0.5294472835451614, "grad_norm": 1.472482442855835, "learning_rate": 1.9870072065821427e-05, "loss": 0.1156, "step": 35950 }, { "epoch": 0.5298154666352484, "grad_norm": 1.5676894187927246, "learning_rate": 1.9868435693854442e-05, "loss": 0.1184, "step": 35975 }, { "epoch": 0.5301836497253354, "grad_norm": 1.5137412548065186, "learning_rate": 1.9866799321887456e-05, "loss": 0.1203, "step": 36000 }, { "epoch": 0.5305518328154225, "grad_norm": 1.3111070394515991, "learning_rate": 1.9865162949920474e-05, "loss": 0.1177, "step": 36025 }, { "epoch": 0.5309200159055095, "grad_norm": 1.5890854597091675, "learning_rate": 1.986352657795349e-05, "loss": 0.1261, "step": 36050 }, { "epoch": 0.5312881989955965, "grad_norm": 0.9979857802391052, "learning_rate": 1.9861890205986507e-05, "loss": 0.1285, "step": 36075 }, { "epoch": 0.5316563820856836, "grad_norm": 1.7929030656814575, "learning_rate": 1.986025383401952e-05, "loss": 0.1184, "step": 36100 }, { "epoch": 0.5320245651757706, "grad_norm": 1.296306848526001, "learning_rate": 1.9858617462052535e-05, "loss": 0.114, "step": 36125 }, { "epoch": 0.5323927482658577, "grad_norm": 1.580112099647522, "learning_rate": 1.9856981090085553e-05, "loss": 0.1228, "step": 36150 }, { "epoch": 0.5327609313559447, "grad_norm": 1.4098597764968872, "learning_rate": 1.9855344718118568e-05, "loss": 0.1223, "step": 36175 }, { "epoch": 0.5331291144460317, "grad_norm": 1.383877158164978, "learning_rate": 1.9853708346151582e-05, "loss": 0.1224, "step": 36200 }, { "epoch": 0.5334972975361187, "grad_norm": 1.6452189683914185, "learning_rate": 1.9852071974184597e-05, "loss": 0.1195, "step": 36225 }, { "epoch": 0.5338654806262058, "grad_norm": 1.42545485496521, "learning_rate": 1.985043560221761e-05, "loss": 0.1148, "step": 36250 }, { "epoch": 0.5342336637162929, "grad_norm": 1.2486947774887085, "learning_rate": 1.984879923025063e-05, "loss": 0.1247, "step": 36275 }, { "epoch": 0.5346018468063799, "grad_norm": 1.4072372913360596, "learning_rate": 1.9847162858283643e-05, "loss": 0.1255, "step": 36300 }, { "epoch": 0.5349700298964669, "grad_norm": 1.4645627737045288, "learning_rate": 1.984552648631666e-05, "loss": 0.1229, "step": 36325 }, { "epoch": 0.5353382129865539, "grad_norm": 1.3357573747634888, "learning_rate": 1.9843890114349676e-05, "loss": 0.115, "step": 36350 }, { "epoch": 0.535706396076641, "grad_norm": 1.6706513166427612, "learning_rate": 1.984225374238269e-05, "loss": 0.1136, "step": 36375 }, { "epoch": 0.5360745791667281, "grad_norm": 1.2168182134628296, "learning_rate": 1.9840617370415705e-05, "loss": 0.1192, "step": 36400 }, { "epoch": 0.5364427622568151, "grad_norm": 1.2111717462539673, "learning_rate": 1.983898099844872e-05, "loss": 0.1244, "step": 36425 }, { "epoch": 0.5368109453469021, "grad_norm": 1.5248171091079712, "learning_rate": 1.9837344626481737e-05, "loss": 0.1286, "step": 36450 }, { "epoch": 0.5371791284369891, "grad_norm": 2.1165120601654053, "learning_rate": 1.983570825451475e-05, "loss": 0.1296, "step": 36475 }, { "epoch": 0.5375473115270761, "grad_norm": 1.4845212697982788, "learning_rate": 1.9834071882547766e-05, "loss": 0.1204, "step": 36500 }, { "epoch": 0.5379154946171633, "grad_norm": 1.6977769136428833, "learning_rate": 1.9832435510580784e-05, "loss": 0.1177, "step": 36525 }, { "epoch": 0.5382836777072503, "grad_norm": 1.3811136484146118, "learning_rate": 1.9830799138613798e-05, "loss": 0.124, "step": 36550 }, { "epoch": 0.5386518607973373, "grad_norm": 1.491607427597046, "learning_rate": 1.9829162766646816e-05, "loss": 0.1225, "step": 36575 }, { "epoch": 0.5390200438874243, "grad_norm": 1.7326167821884155, "learning_rate": 1.9827526394679827e-05, "loss": 0.1231, "step": 36600 }, { "epoch": 0.5393882269775114, "grad_norm": 1.3634510040283203, "learning_rate": 1.9825890022712845e-05, "loss": 0.1171, "step": 36625 }, { "epoch": 0.5397564100675984, "grad_norm": 1.6107808351516724, "learning_rate": 1.982425365074586e-05, "loss": 0.1237, "step": 36650 }, { "epoch": 0.5401245931576855, "grad_norm": 1.855141282081604, "learning_rate": 1.9822617278778874e-05, "loss": 0.1161, "step": 36675 }, { "epoch": 0.5404927762477725, "grad_norm": 1.6258479356765747, "learning_rate": 1.9820980906811892e-05, "loss": 0.118, "step": 36700 }, { "epoch": 0.5408609593378595, "grad_norm": 1.2577881813049316, "learning_rate": 1.9819344534844906e-05, "loss": 0.1243, "step": 36725 }, { "epoch": 0.5412291424279466, "grad_norm": 1.2613511085510254, "learning_rate": 1.981770816287792e-05, "loss": 0.1191, "step": 36750 }, { "epoch": 0.5415973255180336, "grad_norm": 1.5669910907745361, "learning_rate": 1.981607179091094e-05, "loss": 0.1271, "step": 36775 }, { "epoch": 0.5419655086081206, "grad_norm": 1.261261224746704, "learning_rate": 1.9814435418943953e-05, "loss": 0.1093, "step": 36800 }, { "epoch": 0.5423336916982077, "grad_norm": 1.5597432851791382, "learning_rate": 1.9812799046976967e-05, "loss": 0.1168, "step": 36825 }, { "epoch": 0.5427018747882947, "grad_norm": 1.5948351621627808, "learning_rate": 1.9811162675009982e-05, "loss": 0.1232, "step": 36850 }, { "epoch": 0.5430700578783818, "grad_norm": 1.305525779724121, "learning_rate": 1.9809526303043e-05, "loss": 0.1186, "step": 36875 }, { "epoch": 0.5434382409684688, "grad_norm": 1.4193530082702637, "learning_rate": 1.9807889931076014e-05, "loss": 0.1246, "step": 36900 }, { "epoch": 0.5438064240585558, "grad_norm": 1.3771121501922607, "learning_rate": 1.980625355910903e-05, "loss": 0.1221, "step": 36925 }, { "epoch": 0.5441746071486429, "grad_norm": 1.1653172969818115, "learning_rate": 1.9804617187142046e-05, "loss": 0.1185, "step": 36950 }, { "epoch": 0.5445427902387299, "grad_norm": 1.3549933433532715, "learning_rate": 1.980298081517506e-05, "loss": 0.1199, "step": 36975 }, { "epoch": 0.544910973328817, "grad_norm": 1.6777881383895874, "learning_rate": 1.9801344443208075e-05, "loss": 0.129, "step": 37000 }, { "epoch": 0.545279156418904, "grad_norm": 1.0790871381759644, "learning_rate": 1.979977352611977e-05, "loss": 0.1126, "step": 37025 }, { "epoch": 0.545647339508991, "grad_norm": 1.8433680534362793, "learning_rate": 1.9798137154152787e-05, "loss": 0.1162, "step": 37050 }, { "epoch": 0.546015522599078, "grad_norm": 2.001495838165283, "learning_rate": 1.97965007821858e-05, "loss": 0.121, "step": 37075 }, { "epoch": 0.5463837056891651, "grad_norm": 1.0270895957946777, "learning_rate": 1.9794864410218816e-05, "loss": 0.118, "step": 37100 }, { "epoch": 0.5467518887792522, "grad_norm": 1.172116756439209, "learning_rate": 1.9793228038251834e-05, "loss": 0.116, "step": 37125 }, { "epoch": 0.5471200718693392, "grad_norm": 1.4375494718551636, "learning_rate": 1.9791591666284848e-05, "loss": 0.1208, "step": 37150 }, { "epoch": 0.5474882549594262, "grad_norm": 1.8035261631011963, "learning_rate": 1.9789955294317863e-05, "loss": 0.125, "step": 37175 }, { "epoch": 0.5478564380495132, "grad_norm": 1.0941935777664185, "learning_rate": 1.978831892235088e-05, "loss": 0.1255, "step": 37200 }, { "epoch": 0.5482246211396004, "grad_norm": 1.4023436307907104, "learning_rate": 1.9786682550383895e-05, "loss": 0.1149, "step": 37225 }, { "epoch": 0.5485928042296874, "grad_norm": 1.587387204170227, "learning_rate": 1.978504617841691e-05, "loss": 0.1218, "step": 37250 }, { "epoch": 0.5489609873197744, "grad_norm": 1.603115439414978, "learning_rate": 1.9783409806449924e-05, "loss": 0.1238, "step": 37275 }, { "epoch": 0.5493291704098614, "grad_norm": 1.7399941682815552, "learning_rate": 1.9781773434482942e-05, "loss": 0.1158, "step": 37300 }, { "epoch": 0.5496973534999484, "grad_norm": 1.4176956415176392, "learning_rate": 1.9780137062515956e-05, "loss": 0.12, "step": 37325 }, { "epoch": 0.5500655365900355, "grad_norm": 1.8631614446640015, "learning_rate": 1.977850069054897e-05, "loss": 0.1212, "step": 37350 }, { "epoch": 0.5504337196801226, "grad_norm": 1.2380393743515015, "learning_rate": 1.977686431858199e-05, "loss": 0.1178, "step": 37375 }, { "epoch": 0.5508019027702096, "grad_norm": 1.3811695575714111, "learning_rate": 1.9775227946615003e-05, "loss": 0.1184, "step": 37400 }, { "epoch": 0.5511700858602966, "grad_norm": 1.2970376014709473, "learning_rate": 1.9773591574648017e-05, "loss": 0.1135, "step": 37425 }, { "epoch": 0.5515382689503836, "grad_norm": 1.7509592771530151, "learning_rate": 1.9771955202681032e-05, "loss": 0.1177, "step": 37450 }, { "epoch": 0.5519064520404707, "grad_norm": 1.4221574068069458, "learning_rate": 1.977031883071405e-05, "loss": 0.111, "step": 37475 }, { "epoch": 0.5522746351305577, "grad_norm": 1.5943177938461304, "learning_rate": 1.9768682458747064e-05, "loss": 0.1286, "step": 37500 }, { "epoch": 0.5526428182206448, "grad_norm": 1.7850430011749268, "learning_rate": 1.976704608678008e-05, "loss": 0.1244, "step": 37525 }, { "epoch": 0.5530110013107318, "grad_norm": 1.545997142791748, "learning_rate": 1.9765409714813097e-05, "loss": 0.1292, "step": 37550 }, { "epoch": 0.5533791844008188, "grad_norm": 2.268383264541626, "learning_rate": 1.976377334284611e-05, "loss": 0.1286, "step": 37575 }, { "epoch": 0.5537473674909059, "grad_norm": 1.4364897012710571, "learning_rate": 1.9762136970879125e-05, "loss": 0.1189, "step": 37600 }, { "epoch": 0.5541155505809929, "grad_norm": 1.4818049669265747, "learning_rate": 1.9760500598912143e-05, "loss": 0.1208, "step": 37625 }, { "epoch": 0.55448373367108, "grad_norm": 1.2418051958084106, "learning_rate": 1.9758864226945158e-05, "loss": 0.1205, "step": 37650 }, { "epoch": 0.554851916761167, "grad_norm": 1.3513381481170654, "learning_rate": 1.9757227854978172e-05, "loss": 0.121, "step": 37675 }, { "epoch": 0.555220099851254, "grad_norm": 1.4748746156692505, "learning_rate": 1.9755591483011187e-05, "loss": 0.1122, "step": 37700 }, { "epoch": 0.5555882829413411, "grad_norm": 1.6285781860351562, "learning_rate": 1.9753955111044205e-05, "loss": 0.1217, "step": 37725 }, { "epoch": 0.5559564660314281, "grad_norm": 1.184108853340149, "learning_rate": 1.975231873907722e-05, "loss": 0.1116, "step": 37750 }, { "epoch": 0.5563246491215151, "grad_norm": 1.7575715780258179, "learning_rate": 1.9750682367110233e-05, "loss": 0.114, "step": 37775 }, { "epoch": 0.5566928322116022, "grad_norm": 1.788015365600586, "learning_rate": 1.974904599514325e-05, "loss": 0.1172, "step": 37800 }, { "epoch": 0.5570610153016893, "grad_norm": 1.2830640077590942, "learning_rate": 1.9747409623176266e-05, "loss": 0.1234, "step": 37825 }, { "epoch": 0.5574291983917763, "grad_norm": 1.3260087966918945, "learning_rate": 1.974577325120928e-05, "loss": 0.1166, "step": 37850 }, { "epoch": 0.5577973814818633, "grad_norm": 1.5391265153884888, "learning_rate": 1.9744136879242295e-05, "loss": 0.124, "step": 37875 }, { "epoch": 0.5581655645719503, "grad_norm": 1.5043689012527466, "learning_rate": 1.974250050727531e-05, "loss": 0.1142, "step": 37900 }, { "epoch": 0.5585337476620373, "grad_norm": 1.7274752855300903, "learning_rate": 1.9740864135308327e-05, "loss": 0.1195, "step": 37925 }, { "epoch": 0.5589019307521245, "grad_norm": 1.496433973312378, "learning_rate": 1.973922776334134e-05, "loss": 0.1108, "step": 37950 }, { "epoch": 0.5592701138422115, "grad_norm": 1.4745001792907715, "learning_rate": 1.973759139137436e-05, "loss": 0.1136, "step": 37975 }, { "epoch": 0.5596382969322985, "grad_norm": 1.769690990447998, "learning_rate": 1.9735955019407374e-05, "loss": 0.1151, "step": 38000 }, { "epoch": 0.5600064800223855, "grad_norm": 1.2575122117996216, "learning_rate": 1.9734318647440388e-05, "loss": 0.1192, "step": 38025 }, { "epoch": 0.5603746631124725, "grad_norm": 1.5350185632705688, "learning_rate": 1.9732682275473406e-05, "loss": 0.1184, "step": 38050 }, { "epoch": 0.5607428462025597, "grad_norm": 1.7502778768539429, "learning_rate": 1.973104590350642e-05, "loss": 0.1367, "step": 38075 }, { "epoch": 0.5611110292926467, "grad_norm": 1.618236780166626, "learning_rate": 1.9729409531539435e-05, "loss": 0.114, "step": 38100 }, { "epoch": 0.5614792123827337, "grad_norm": 1.5210295915603638, "learning_rate": 1.972777315957245e-05, "loss": 0.1084, "step": 38125 }, { "epoch": 0.5618473954728207, "grad_norm": 1.673702597618103, "learning_rate": 1.9726136787605464e-05, "loss": 0.1231, "step": 38150 }, { "epoch": 0.5622155785629077, "grad_norm": 1.2953345775604248, "learning_rate": 1.9724500415638482e-05, "loss": 0.1099, "step": 38175 }, { "epoch": 0.5625837616529948, "grad_norm": 1.6855987310409546, "learning_rate": 1.9722864043671496e-05, "loss": 0.1179, "step": 38200 }, { "epoch": 0.5629519447430819, "grad_norm": 1.5913565158843994, "learning_rate": 1.9721227671704514e-05, "loss": 0.1143, "step": 38225 }, { "epoch": 0.5633201278331689, "grad_norm": 1.6520943641662598, "learning_rate": 1.971959129973753e-05, "loss": 0.1201, "step": 38250 }, { "epoch": 0.5636883109232559, "grad_norm": 1.2099671363830566, "learning_rate": 1.9717954927770543e-05, "loss": 0.1135, "step": 38275 }, { "epoch": 0.5640564940133429, "grad_norm": 1.32758629322052, "learning_rate": 1.9716318555803557e-05, "loss": 0.1127, "step": 38300 }, { "epoch": 0.56442467710343, "grad_norm": 1.3274176120758057, "learning_rate": 1.9714682183836572e-05, "loss": 0.1121, "step": 38325 }, { "epoch": 0.564792860193517, "grad_norm": 1.2761715650558472, "learning_rate": 1.971304581186959e-05, "loss": 0.1192, "step": 38350 }, { "epoch": 0.5651610432836041, "grad_norm": 1.3949676752090454, "learning_rate": 1.9711409439902604e-05, "loss": 0.1139, "step": 38375 }, { "epoch": 0.5655292263736911, "grad_norm": 1.7472232580184937, "learning_rate": 1.970977306793562e-05, "loss": 0.1234, "step": 38400 }, { "epoch": 0.5658974094637782, "grad_norm": 1.031726598739624, "learning_rate": 1.9708136695968636e-05, "loss": 0.1161, "step": 38425 }, { "epoch": 0.5662655925538652, "grad_norm": 1.216566562652588, "learning_rate": 1.970650032400165e-05, "loss": 0.1218, "step": 38450 }, { "epoch": 0.5666337756439522, "grad_norm": 1.6059186458587646, "learning_rate": 1.970486395203467e-05, "loss": 0.1126, "step": 38475 }, { "epoch": 0.5670019587340392, "grad_norm": 1.2862287759780884, "learning_rate": 1.970322758006768e-05, "loss": 0.1151, "step": 38500 }, { "epoch": 0.5673701418241263, "grad_norm": 1.1815801858901978, "learning_rate": 1.9701591208100698e-05, "loss": 0.1145, "step": 38525 }, { "epoch": 0.5677383249142134, "grad_norm": 1.4452340602874756, "learning_rate": 1.9699954836133712e-05, "loss": 0.1203, "step": 38550 }, { "epoch": 0.5681065080043004, "grad_norm": 1.355634331703186, "learning_rate": 1.9698318464166727e-05, "loss": 0.114, "step": 38575 }, { "epoch": 0.5684746910943874, "grad_norm": 1.4844880104064941, "learning_rate": 1.9696682092199744e-05, "loss": 0.1172, "step": 38600 }, { "epoch": 0.5688428741844744, "grad_norm": 2.0493764877319336, "learning_rate": 1.969504572023276e-05, "loss": 0.1178, "step": 38625 }, { "epoch": 0.5692110572745614, "grad_norm": 1.5760899782180786, "learning_rate": 1.9693409348265777e-05, "loss": 0.1172, "step": 38650 }, { "epoch": 0.5695792403646486, "grad_norm": 1.7074697017669678, "learning_rate": 1.969177297629879e-05, "loss": 0.1114, "step": 38675 }, { "epoch": 0.5699474234547356, "grad_norm": 1.3330516815185547, "learning_rate": 1.9690136604331806e-05, "loss": 0.1207, "step": 38700 }, { "epoch": 0.5703156065448226, "grad_norm": 1.3043917417526245, "learning_rate": 1.968850023236482e-05, "loss": 0.1239, "step": 38725 }, { "epoch": 0.5706837896349096, "grad_norm": 1.8919068574905396, "learning_rate": 1.9686863860397835e-05, "loss": 0.1256, "step": 38750 }, { "epoch": 0.5710519727249966, "grad_norm": 1.4486078023910522, "learning_rate": 1.9685227488430852e-05, "loss": 0.1255, "step": 38775 }, { "epoch": 0.5714201558150838, "grad_norm": 1.5534908771514893, "learning_rate": 1.9683591116463867e-05, "loss": 0.1135, "step": 38800 }, { "epoch": 0.5717883389051708, "grad_norm": 1.693839192390442, "learning_rate": 1.968195474449688e-05, "loss": 0.1161, "step": 38825 }, { "epoch": 0.5721565219952578, "grad_norm": 1.4074006080627441, "learning_rate": 1.96803183725299e-05, "loss": 0.1145, "step": 38850 }, { "epoch": 0.5725247050853448, "grad_norm": 1.6323904991149902, "learning_rate": 1.9678682000562914e-05, "loss": 0.1113, "step": 38875 }, { "epoch": 0.5728928881754318, "grad_norm": 1.2794816493988037, "learning_rate": 1.967704562859593e-05, "loss": 0.1098, "step": 38900 }, { "epoch": 0.573261071265519, "grad_norm": 1.4640345573425293, "learning_rate": 1.9675409256628943e-05, "loss": 0.1205, "step": 38925 }, { "epoch": 0.573629254355606, "grad_norm": 1.3224183320999146, "learning_rate": 1.967377288466196e-05, "loss": 0.1095, "step": 38950 }, { "epoch": 0.573997437445693, "grad_norm": 1.985844373703003, "learning_rate": 1.9672136512694975e-05, "loss": 0.1127, "step": 38975 }, { "epoch": 0.57436562053578, "grad_norm": 1.6131892204284668, "learning_rate": 1.967050014072799e-05, "loss": 0.1194, "step": 39000 }, { "epoch": 0.5747338036258671, "grad_norm": 1.5700842142105103, "learning_rate": 1.9668863768761007e-05, "loss": 0.1233, "step": 39025 }, { "epoch": 0.5751019867159541, "grad_norm": 1.1362309455871582, "learning_rate": 1.966722739679402e-05, "loss": 0.1098, "step": 39050 }, { "epoch": 0.5754701698060412, "grad_norm": 1.4811644554138184, "learning_rate": 1.9665656479705715e-05, "loss": 0.1199, "step": 39075 }, { "epoch": 0.5758383528961282, "grad_norm": 1.5162842273712158, "learning_rate": 1.9664020107738733e-05, "loss": 0.1218, "step": 39100 }, { "epoch": 0.5762065359862152, "grad_norm": 1.2654857635498047, "learning_rate": 1.9662383735771748e-05, "loss": 0.1098, "step": 39125 }, { "epoch": 0.5765747190763023, "grad_norm": 1.4312494993209839, "learning_rate": 1.9660747363804762e-05, "loss": 0.1145, "step": 39150 }, { "epoch": 0.5769429021663893, "grad_norm": 1.5518170595169067, "learning_rate": 1.9659110991837777e-05, "loss": 0.1133, "step": 39175 }, { "epoch": 0.5773110852564763, "grad_norm": 1.2797623872756958, "learning_rate": 1.9657474619870795e-05, "loss": 0.1138, "step": 39200 }, { "epoch": 0.5776792683465634, "grad_norm": 1.4248781204223633, "learning_rate": 1.965583824790381e-05, "loss": 0.1207, "step": 39225 }, { "epoch": 0.5780474514366504, "grad_norm": 2.2524561882019043, "learning_rate": 1.9654201875936823e-05, "loss": 0.1107, "step": 39250 }, { "epoch": 0.5784156345267375, "grad_norm": 1.7344497442245483, "learning_rate": 1.965256550396984e-05, "loss": 0.1127, "step": 39275 }, { "epoch": 0.5787838176168245, "grad_norm": 1.7456754446029663, "learning_rate": 1.9650929132002856e-05, "loss": 0.1112, "step": 39300 }, { "epoch": 0.5791520007069115, "grad_norm": 1.370923399925232, "learning_rate": 1.964929276003587e-05, "loss": 0.1093, "step": 39325 }, { "epoch": 0.5795201837969985, "grad_norm": 1.7654410600662231, "learning_rate": 1.9647656388068885e-05, "loss": 0.119, "step": 39350 }, { "epoch": 0.5798883668870856, "grad_norm": 1.5445888042449951, "learning_rate": 1.9646020016101903e-05, "loss": 0.1118, "step": 39375 }, { "epoch": 0.5802565499771727, "grad_norm": 1.3537577390670776, "learning_rate": 1.9644383644134917e-05, "loss": 0.1156, "step": 39400 }, { "epoch": 0.5806247330672597, "grad_norm": 1.781290054321289, "learning_rate": 1.964274727216793e-05, "loss": 0.11, "step": 39425 }, { "epoch": 0.5809929161573467, "grad_norm": 1.2838529348373413, "learning_rate": 1.964111090020095e-05, "loss": 0.1211, "step": 39450 }, { "epoch": 0.5813610992474337, "grad_norm": 1.7609925270080566, "learning_rate": 1.9639474528233964e-05, "loss": 0.1171, "step": 39475 }, { "epoch": 0.5817292823375207, "grad_norm": 1.424841046333313, "learning_rate": 1.9637838156266978e-05, "loss": 0.1137, "step": 39500 }, { "epoch": 0.5820974654276079, "grad_norm": 1.388629674911499, "learning_rate": 1.9636201784299996e-05, "loss": 0.1136, "step": 39525 }, { "epoch": 0.5824656485176949, "grad_norm": 1.6771466732025146, "learning_rate": 1.963456541233301e-05, "loss": 0.1054, "step": 39550 }, { "epoch": 0.5828338316077819, "grad_norm": 1.0926169157028198, "learning_rate": 1.9632929040366025e-05, "loss": 0.1115, "step": 39575 }, { "epoch": 0.5832020146978689, "grad_norm": 1.7161418199539185, "learning_rate": 1.963129266839904e-05, "loss": 0.1063, "step": 39600 }, { "epoch": 0.583570197787956, "grad_norm": 1.692179560661316, "learning_rate": 1.9629656296432057e-05, "loss": 0.114, "step": 39625 }, { "epoch": 0.5839383808780431, "grad_norm": 1.1478818655014038, "learning_rate": 1.9628019924465072e-05, "loss": 0.1219, "step": 39650 }, { "epoch": 0.5843065639681301, "grad_norm": 1.1141635179519653, "learning_rate": 1.9626383552498086e-05, "loss": 0.1134, "step": 39675 }, { "epoch": 0.5846747470582171, "grad_norm": 1.1304652690887451, "learning_rate": 1.9624747180531104e-05, "loss": 0.1189, "step": 39700 }, { "epoch": 0.5850429301483041, "grad_norm": 1.385841965675354, "learning_rate": 1.962311080856412e-05, "loss": 0.1131, "step": 39725 }, { "epoch": 0.5854111132383912, "grad_norm": 1.639119267463684, "learning_rate": 1.9621474436597133e-05, "loss": 0.1084, "step": 39750 }, { "epoch": 0.5857792963284782, "grad_norm": 1.5052881240844727, "learning_rate": 1.9619838064630147e-05, "loss": 0.1125, "step": 39775 }, { "epoch": 0.5861474794185653, "grad_norm": 1.258682131767273, "learning_rate": 1.9618201692663165e-05, "loss": 0.1214, "step": 39800 }, { "epoch": 0.5865156625086523, "grad_norm": 1.3739937543869019, "learning_rate": 1.961656532069618e-05, "loss": 0.1092, "step": 39825 }, { "epoch": 0.5868838455987393, "grad_norm": 1.277521014213562, "learning_rate": 1.9614928948729194e-05, "loss": 0.11, "step": 39850 }, { "epoch": 0.5872520286888264, "grad_norm": 2.2052865028381348, "learning_rate": 1.9613292576762212e-05, "loss": 0.1056, "step": 39875 }, { "epoch": 0.5876202117789134, "grad_norm": 1.716227412223816, "learning_rate": 1.9611656204795226e-05, "loss": 0.1065, "step": 39900 }, { "epoch": 0.5879883948690005, "grad_norm": 1.6410953998565674, "learning_rate": 1.961001983282824e-05, "loss": 0.1105, "step": 39925 }, { "epoch": 0.5883565779590875, "grad_norm": 1.2135493755340576, "learning_rate": 1.960838346086126e-05, "loss": 0.1213, "step": 39950 }, { "epoch": 0.5887247610491745, "grad_norm": 1.3212488889694214, "learning_rate": 1.960674708889427e-05, "loss": 0.1124, "step": 39975 }, { "epoch": 0.5890929441392616, "grad_norm": 1.5297878980636597, "learning_rate": 1.9605110716927288e-05, "loss": 0.1174, "step": 40000 }, { "epoch": 0.5894611272293486, "grad_norm": 1.4061139822006226, "learning_rate": 1.9603474344960302e-05, "loss": 0.1116, "step": 40025 }, { "epoch": 0.5898293103194356, "grad_norm": 1.1281315088272095, "learning_rate": 1.960183797299332e-05, "loss": 0.1169, "step": 40050 }, { "epoch": 0.5901974934095227, "grad_norm": 0.9868475794792175, "learning_rate": 1.9600201601026334e-05, "loss": 0.1128, "step": 40075 }, { "epoch": 0.5905656764996097, "grad_norm": 1.150622010231018, "learning_rate": 1.959856522905935e-05, "loss": 0.1105, "step": 40100 }, { "epoch": 0.5909338595896968, "grad_norm": 1.3219376802444458, "learning_rate": 1.9596928857092367e-05, "loss": 0.1072, "step": 40125 }, { "epoch": 0.5913020426797838, "grad_norm": 1.5164532661437988, "learning_rate": 1.959529248512538e-05, "loss": 0.1051, "step": 40150 }, { "epoch": 0.5916702257698708, "grad_norm": 1.4182822704315186, "learning_rate": 1.9593656113158396e-05, "loss": 0.1102, "step": 40175 }, { "epoch": 0.5920384088599578, "grad_norm": 1.4331551790237427, "learning_rate": 1.959201974119141e-05, "loss": 0.1224, "step": 40200 }, { "epoch": 0.592406591950045, "grad_norm": 1.3870913982391357, "learning_rate": 1.9590383369224425e-05, "loss": 0.1127, "step": 40225 }, { "epoch": 0.592774775040132, "grad_norm": 1.5275180339813232, "learning_rate": 1.9588746997257442e-05, "loss": 0.1148, "step": 40250 }, { "epoch": 0.593142958130219, "grad_norm": 1.587823510169983, "learning_rate": 1.9587110625290457e-05, "loss": 0.1141, "step": 40275 }, { "epoch": 0.593511141220306, "grad_norm": 0.9925780296325684, "learning_rate": 1.9585474253323475e-05, "loss": 0.118, "step": 40300 }, { "epoch": 0.593879324310393, "grad_norm": 1.0631872415542603, "learning_rate": 1.958383788135649e-05, "loss": 0.1128, "step": 40325 }, { "epoch": 0.5942475074004802, "grad_norm": 0.9391284584999084, "learning_rate": 1.9582201509389504e-05, "loss": 0.1134, "step": 40350 }, { "epoch": 0.5946156904905672, "grad_norm": 1.4708532094955444, "learning_rate": 1.958056513742252e-05, "loss": 0.1144, "step": 40375 }, { "epoch": 0.5949838735806542, "grad_norm": 1.280452847480774, "learning_rate": 1.9578928765455533e-05, "loss": 0.1134, "step": 40400 }, { "epoch": 0.5953520566707412, "grad_norm": 1.3945668935775757, "learning_rate": 1.957729239348855e-05, "loss": 0.1072, "step": 40425 }, { "epoch": 0.5957202397608282, "grad_norm": 1.4512226581573486, "learning_rate": 1.9575656021521565e-05, "loss": 0.1103, "step": 40450 }, { "epoch": 0.5960884228509153, "grad_norm": 1.2649255990982056, "learning_rate": 1.957401964955458e-05, "loss": 0.1174, "step": 40475 }, { "epoch": 0.5964566059410024, "grad_norm": 1.0409470796585083, "learning_rate": 1.9572383277587597e-05, "loss": 0.1145, "step": 40500 }, { "epoch": 0.5968247890310894, "grad_norm": 1.2053929567337036, "learning_rate": 1.957074690562061e-05, "loss": 0.1086, "step": 40525 }, { "epoch": 0.5971929721211764, "grad_norm": 1.389586091041565, "learning_rate": 1.956911053365363e-05, "loss": 0.1137, "step": 40550 }, { "epoch": 0.5975611552112634, "grad_norm": 1.229688048362732, "learning_rate": 1.9567474161686644e-05, "loss": 0.1157, "step": 40575 }, { "epoch": 0.5979293383013505, "grad_norm": 1.4029161930084229, "learning_rate": 1.956583778971966e-05, "loss": 0.1122, "step": 40600 }, { "epoch": 0.5982975213914375, "grad_norm": 1.3531118631362915, "learning_rate": 1.9564201417752673e-05, "loss": 0.1301, "step": 40625 }, { "epoch": 0.5986657044815246, "grad_norm": 1.492425560951233, "learning_rate": 1.9562565045785687e-05, "loss": 0.1067, "step": 40650 }, { "epoch": 0.5990338875716116, "grad_norm": 1.325297236442566, "learning_rate": 1.9560928673818705e-05, "loss": 0.1117, "step": 40675 }, { "epoch": 0.5994020706616987, "grad_norm": 1.4290852546691895, "learning_rate": 1.955929230185172e-05, "loss": 0.1132, "step": 40700 }, { "epoch": 0.5997702537517857, "grad_norm": 1.2368844747543335, "learning_rate": 1.9557655929884734e-05, "loss": 0.1104, "step": 40725 }, { "epoch": 0.6001384368418727, "grad_norm": 1.4443602561950684, "learning_rate": 1.9556019557917752e-05, "loss": 0.1171, "step": 40750 }, { "epoch": 0.6005066199319598, "grad_norm": 1.6459391117095947, "learning_rate": 1.9554383185950766e-05, "loss": 0.1158, "step": 40775 }, { "epoch": 0.6008748030220468, "grad_norm": 1.3893733024597168, "learning_rate": 1.9552746813983784e-05, "loss": 0.1063, "step": 40800 }, { "epoch": 0.6012429861121339, "grad_norm": 1.834304690361023, "learning_rate": 1.9551110442016795e-05, "loss": 0.1095, "step": 40825 }, { "epoch": 0.6016111692022209, "grad_norm": 1.7188986539840698, "learning_rate": 1.9549474070049813e-05, "loss": 0.1129, "step": 40850 }, { "epoch": 0.6019793522923079, "grad_norm": 1.3190722465515137, "learning_rate": 1.9547837698082828e-05, "loss": 0.1164, "step": 40875 }, { "epoch": 0.6023475353823949, "grad_norm": 1.718002200126648, "learning_rate": 1.9546201326115842e-05, "loss": 0.1153, "step": 40900 }, { "epoch": 0.602715718472482, "grad_norm": 1.3596863746643066, "learning_rate": 1.954456495414886e-05, "loss": 0.109, "step": 40925 }, { "epoch": 0.6030839015625691, "grad_norm": 1.914109230041504, "learning_rate": 1.9542928582181874e-05, "loss": 0.1141, "step": 40950 }, { "epoch": 0.6034520846526561, "grad_norm": 1.6325136423110962, "learning_rate": 1.9541292210214892e-05, "loss": 0.1193, "step": 40975 }, { "epoch": 0.6038202677427431, "grad_norm": 1.3778468370437622, "learning_rate": 1.9539655838247907e-05, "loss": 0.1121, "step": 41000 }, { "epoch": 0.6041884508328301, "grad_norm": 1.5594487190246582, "learning_rate": 1.953801946628092e-05, "loss": 0.1136, "step": 41025 }, { "epoch": 0.6045566339229171, "grad_norm": 1.9369747638702393, "learning_rate": 1.9536383094313936e-05, "loss": 0.11, "step": 41050 }, { "epoch": 0.6049248170130043, "grad_norm": 1.337169885635376, "learning_rate": 1.953474672234695e-05, "loss": 0.1077, "step": 41075 }, { "epoch": 0.6052930001030913, "grad_norm": 1.4679465293884277, "learning_rate": 1.9533110350379968e-05, "loss": 0.118, "step": 41100 }, { "epoch": 0.6056611831931783, "grad_norm": 1.0459349155426025, "learning_rate": 1.9531473978412982e-05, "loss": 0.1052, "step": 41125 }, { "epoch": 0.6060293662832653, "grad_norm": 1.3841328620910645, "learning_rate": 1.9529837606445997e-05, "loss": 0.1085, "step": 41150 }, { "epoch": 0.6063975493733523, "grad_norm": 1.6530568599700928, "learning_rate": 1.9528201234479015e-05, "loss": 0.1141, "step": 41175 }, { "epoch": 0.6067657324634395, "grad_norm": 1.4696321487426758, "learning_rate": 1.952656486251203e-05, "loss": 0.1042, "step": 41200 }, { "epoch": 0.6071339155535265, "grad_norm": 1.3700885772705078, "learning_rate": 1.9524928490545047e-05, "loss": 0.1157, "step": 41225 }, { "epoch": 0.6075020986436135, "grad_norm": 1.6159257888793945, "learning_rate": 1.9523292118578058e-05, "loss": 0.1095, "step": 41250 }, { "epoch": 0.6078702817337005, "grad_norm": 1.4576144218444824, "learning_rate": 1.9521655746611076e-05, "loss": 0.1094, "step": 41275 }, { "epoch": 0.6082384648237876, "grad_norm": 1.3515383005142212, "learning_rate": 1.952001937464409e-05, "loss": 0.1105, "step": 41300 }, { "epoch": 0.6086066479138746, "grad_norm": 2.048415184020996, "learning_rate": 1.9518383002677105e-05, "loss": 0.1126, "step": 41325 }, { "epoch": 0.6089748310039617, "grad_norm": 1.4868552684783936, "learning_rate": 1.9516746630710123e-05, "loss": 0.1131, "step": 41350 }, { "epoch": 0.6093430140940487, "grad_norm": 1.4398176670074463, "learning_rate": 1.9515110258743137e-05, "loss": 0.1109, "step": 41375 }, { "epoch": 0.6097111971841357, "grad_norm": 1.5747963190078735, "learning_rate": 1.951347388677615e-05, "loss": 0.1237, "step": 41400 }, { "epoch": 0.6100793802742228, "grad_norm": 1.4340903759002686, "learning_rate": 1.951183751480917e-05, "loss": 0.113, "step": 41425 }, { "epoch": 0.6104475633643098, "grad_norm": 1.3397008180618286, "learning_rate": 1.9510201142842184e-05, "loss": 0.1166, "step": 41450 }, { "epoch": 0.6108157464543968, "grad_norm": 1.5803062915802002, "learning_rate": 1.9508564770875198e-05, "loss": 0.1028, "step": 41475 }, { "epoch": 0.6111839295444839, "grad_norm": 1.4686222076416016, "learning_rate": 1.9506928398908213e-05, "loss": 0.1124, "step": 41500 }, { "epoch": 0.6115521126345709, "grad_norm": 1.6495213508605957, "learning_rate": 1.950529202694123e-05, "loss": 0.1091, "step": 41525 }, { "epoch": 0.611920295724658, "grad_norm": 1.6780121326446533, "learning_rate": 1.9503721109852924e-05, "loss": 0.1079, "step": 41550 }, { "epoch": 0.612288478814745, "grad_norm": 1.4636390209197998, "learning_rate": 1.950208473788594e-05, "loss": 0.1078, "step": 41575 }, { "epoch": 0.612656661904832, "grad_norm": 1.2767215967178345, "learning_rate": 1.9500448365918957e-05, "loss": 0.1039, "step": 41600 }, { "epoch": 0.613024844994919, "grad_norm": 1.3839603662490845, "learning_rate": 1.949881199395197e-05, "loss": 0.1116, "step": 41625 }, { "epoch": 0.6133930280850061, "grad_norm": 1.48479163646698, "learning_rate": 1.9497175621984986e-05, "loss": 0.1145, "step": 41650 }, { "epoch": 0.6137612111750932, "grad_norm": 1.3524249792099, "learning_rate": 1.9495539250018e-05, "loss": 0.1125, "step": 41675 }, { "epoch": 0.6141293942651802, "grad_norm": 1.472377061843872, "learning_rate": 1.9493902878051018e-05, "loss": 0.1158, "step": 41700 }, { "epoch": 0.6144975773552672, "grad_norm": 1.097522258758545, "learning_rate": 1.9492266506084032e-05, "loss": 0.1025, "step": 41725 }, { "epoch": 0.6148657604453542, "grad_norm": 1.3611419200897217, "learning_rate": 1.9490630134117047e-05, "loss": 0.1135, "step": 41750 }, { "epoch": 0.6152339435354413, "grad_norm": 1.1177406311035156, "learning_rate": 1.9488993762150065e-05, "loss": 0.1124, "step": 41775 }, { "epoch": 0.6156021266255284, "grad_norm": 1.864156723022461, "learning_rate": 1.948735739018308e-05, "loss": 0.1108, "step": 41800 }, { "epoch": 0.6159703097156154, "grad_norm": 1.9078421592712402, "learning_rate": 1.9485721018216094e-05, "loss": 0.1115, "step": 41825 }, { "epoch": 0.6163384928057024, "grad_norm": 1.9030513763427734, "learning_rate": 1.948408464624911e-05, "loss": 0.1153, "step": 41850 }, { "epoch": 0.6167066758957894, "grad_norm": 1.1852060556411743, "learning_rate": 1.9482448274282123e-05, "loss": 0.1108, "step": 41875 }, { "epoch": 0.6170748589858766, "grad_norm": 1.758246660232544, "learning_rate": 1.948081190231514e-05, "loss": 0.1056, "step": 41900 }, { "epoch": 0.6174430420759636, "grad_norm": 1.5357226133346558, "learning_rate": 1.9479175530348155e-05, "loss": 0.1132, "step": 41925 }, { "epoch": 0.6178112251660506, "grad_norm": 1.4268290996551514, "learning_rate": 1.9477539158381173e-05, "loss": 0.116, "step": 41950 }, { "epoch": 0.6181794082561376, "grad_norm": 1.5423840284347534, "learning_rate": 1.9475902786414187e-05, "loss": 0.1027, "step": 41975 }, { "epoch": 0.6185475913462246, "grad_norm": 1.2127351760864258, "learning_rate": 1.94742664144472e-05, "loss": 0.1033, "step": 42000 }, { "epoch": 0.6189157744363117, "grad_norm": 1.6671496629714966, "learning_rate": 1.947263004248022e-05, "loss": 0.1077, "step": 42025 }, { "epoch": 0.6192839575263988, "grad_norm": 1.988416314125061, "learning_rate": 1.9470993670513234e-05, "loss": 0.1102, "step": 42050 }, { "epoch": 0.6196521406164858, "grad_norm": 1.5522608757019043, "learning_rate": 1.946935729854625e-05, "loss": 0.1188, "step": 42075 }, { "epoch": 0.6200203237065728, "grad_norm": 1.4972020387649536, "learning_rate": 1.9467720926579263e-05, "loss": 0.1041, "step": 42100 }, { "epoch": 0.6203885067966598, "grad_norm": 1.4773261547088623, "learning_rate": 1.9466084554612277e-05, "loss": 0.1054, "step": 42125 }, { "epoch": 0.6207566898867469, "grad_norm": 1.4228787422180176, "learning_rate": 1.9464448182645295e-05, "loss": 0.1071, "step": 42150 }, { "epoch": 0.6211248729768339, "grad_norm": 2.1490607261657715, "learning_rate": 1.946281181067831e-05, "loss": 0.1134, "step": 42175 }, { "epoch": 0.621493056066921, "grad_norm": 1.4083261489868164, "learning_rate": 1.9461175438711327e-05, "loss": 0.1183, "step": 42200 }, { "epoch": 0.621861239157008, "grad_norm": 1.4812012910842896, "learning_rate": 1.9459539066744342e-05, "loss": 0.1102, "step": 42225 }, { "epoch": 0.622229422247095, "grad_norm": 1.5146194696426392, "learning_rate": 1.9457902694777356e-05, "loss": 0.1146, "step": 42250 }, { "epoch": 0.6225976053371821, "grad_norm": 1.5088244676589966, "learning_rate": 1.9456266322810374e-05, "loss": 0.1137, "step": 42275 }, { "epoch": 0.6229657884272691, "grad_norm": 1.7292484045028687, "learning_rate": 1.9454629950843385e-05, "loss": 0.1018, "step": 42300 }, { "epoch": 0.6233339715173561, "grad_norm": 1.335016131401062, "learning_rate": 1.9452993578876403e-05, "loss": 0.11, "step": 42325 }, { "epoch": 0.6237021546074432, "grad_norm": 1.0359456539154053, "learning_rate": 1.9451357206909418e-05, "loss": 0.1155, "step": 42350 }, { "epoch": 0.6240703376975302, "grad_norm": 1.4816323518753052, "learning_rate": 1.9449720834942435e-05, "loss": 0.1079, "step": 42375 }, { "epoch": 0.6244385207876173, "grad_norm": 1.6909608840942383, "learning_rate": 1.944808446297545e-05, "loss": 0.1039, "step": 42400 }, { "epoch": 0.6248067038777043, "grad_norm": 1.2284351587295532, "learning_rate": 1.9446448091008464e-05, "loss": 0.1059, "step": 42425 }, { "epoch": 0.6251748869677913, "grad_norm": 1.3990721702575684, "learning_rate": 1.9444811719041482e-05, "loss": 0.109, "step": 42450 }, { "epoch": 0.6255430700578783, "grad_norm": 2.34006929397583, "learning_rate": 1.9443175347074497e-05, "loss": 0.106, "step": 42475 }, { "epoch": 0.6259112531479655, "grad_norm": 1.3233106136322021, "learning_rate": 1.944153897510751e-05, "loss": 0.1112, "step": 42500 }, { "epoch": 0.6262794362380525, "grad_norm": 1.436488151550293, "learning_rate": 1.9439902603140526e-05, "loss": 0.1114, "step": 42525 }, { "epoch": 0.6266476193281395, "grad_norm": 1.1722458600997925, "learning_rate": 1.943826623117354e-05, "loss": 0.1062, "step": 42550 }, { "epoch": 0.6270158024182265, "grad_norm": 1.8988996744155884, "learning_rate": 1.9436629859206558e-05, "loss": 0.1202, "step": 42575 }, { "epoch": 0.6273839855083135, "grad_norm": 1.3709423542022705, "learning_rate": 1.9434993487239572e-05, "loss": 0.1151, "step": 42600 }, { "epoch": 0.6277521685984007, "grad_norm": 1.8680797815322876, "learning_rate": 1.943335711527259e-05, "loss": 0.1145, "step": 42625 }, { "epoch": 0.6281203516884877, "grad_norm": 1.2792071104049683, "learning_rate": 1.9431720743305605e-05, "loss": 0.1071, "step": 42650 }, { "epoch": 0.6284885347785747, "grad_norm": 1.495862364768982, "learning_rate": 1.943008437133862e-05, "loss": 0.1039, "step": 42675 }, { "epoch": 0.6288567178686617, "grad_norm": 1.2855013608932495, "learning_rate": 1.9428447999371637e-05, "loss": 0.1123, "step": 42700 }, { "epoch": 0.6292249009587487, "grad_norm": 1.80340576171875, "learning_rate": 1.9426811627404648e-05, "loss": 0.1017, "step": 42725 }, { "epoch": 0.6295930840488358, "grad_norm": 1.610538125038147, "learning_rate": 1.9425175255437666e-05, "loss": 0.1106, "step": 42750 }, { "epoch": 0.6299612671389229, "grad_norm": 1.4962716102600098, "learning_rate": 1.942353888347068e-05, "loss": 0.1064, "step": 42775 }, { "epoch": 0.6303294502290099, "grad_norm": 1.2373082637786865, "learning_rate": 1.9421902511503695e-05, "loss": 0.1069, "step": 42800 }, { "epoch": 0.6306976333190969, "grad_norm": 1.5830014944076538, "learning_rate": 1.9420266139536713e-05, "loss": 0.1079, "step": 42825 }, { "epoch": 0.6310658164091839, "grad_norm": 1.6252222061157227, "learning_rate": 1.9418629767569727e-05, "loss": 0.1062, "step": 42850 }, { "epoch": 0.631433999499271, "grad_norm": 1.5839202404022217, "learning_rate": 1.9416993395602745e-05, "loss": 0.1035, "step": 42875 }, { "epoch": 0.631802182589358, "grad_norm": 1.857698917388916, "learning_rate": 1.941535702363576e-05, "loss": 0.1091, "step": 42900 }, { "epoch": 0.6321703656794451, "grad_norm": 1.4146044254302979, "learning_rate": 1.9413720651668774e-05, "loss": 0.1031, "step": 42925 }, { "epoch": 0.6325385487695321, "grad_norm": 1.1775039434432983, "learning_rate": 1.9412084279701788e-05, "loss": 0.1153, "step": 42950 }, { "epoch": 0.6329067318596191, "grad_norm": 1.484903335571289, "learning_rate": 1.9410447907734803e-05, "loss": 0.1088, "step": 42975 }, { "epoch": 0.6332749149497062, "grad_norm": 1.5222270488739014, "learning_rate": 1.940881153576782e-05, "loss": 0.1147, "step": 43000 }, { "epoch": 0.6336430980397932, "grad_norm": 1.775303602218628, "learning_rate": 1.9407175163800835e-05, "loss": 0.0978, "step": 43025 }, { "epoch": 0.6340112811298803, "grad_norm": 1.9845086336135864, "learning_rate": 1.940553879183385e-05, "loss": 0.1163, "step": 43050 }, { "epoch": 0.6343794642199673, "grad_norm": 1.6547818183898926, "learning_rate": 1.9403902419866867e-05, "loss": 0.1053, "step": 43075 }, { "epoch": 0.6347476473100544, "grad_norm": 1.240424633026123, "learning_rate": 1.9402266047899882e-05, "loss": 0.0989, "step": 43100 }, { "epoch": 0.6351158304001414, "grad_norm": 1.1651711463928223, "learning_rate": 1.94006296759329e-05, "loss": 0.1086, "step": 43125 }, { "epoch": 0.6354840134902284, "grad_norm": 1.3533746004104614, "learning_rate": 1.939899330396591e-05, "loss": 0.1, "step": 43150 }, { "epoch": 0.6358521965803154, "grad_norm": 1.5015009641647339, "learning_rate": 1.939735693199893e-05, "loss": 0.1121, "step": 43175 }, { "epoch": 0.6362203796704025, "grad_norm": 1.7772409915924072, "learning_rate": 1.9395720560031943e-05, "loss": 0.1094, "step": 43200 }, { "epoch": 0.6365885627604896, "grad_norm": 1.4367293119430542, "learning_rate": 1.9394084188064957e-05, "loss": 0.1031, "step": 43225 }, { "epoch": 0.6369567458505766, "grad_norm": 1.2458570003509521, "learning_rate": 1.9392447816097975e-05, "loss": 0.1058, "step": 43250 }, { "epoch": 0.6373249289406636, "grad_norm": 1.1537954807281494, "learning_rate": 1.939081144413099e-05, "loss": 0.1131, "step": 43275 }, { "epoch": 0.6376931120307506, "grad_norm": 1.1713804006576538, "learning_rate": 1.9389175072164008e-05, "loss": 0.1085, "step": 43300 }, { "epoch": 0.6380612951208376, "grad_norm": 1.6194653511047363, "learning_rate": 1.9387538700197022e-05, "loss": 0.1141, "step": 43325 }, { "epoch": 0.6384294782109248, "grad_norm": 2.1495585441589355, "learning_rate": 1.9385902328230037e-05, "loss": 0.1127, "step": 43350 }, { "epoch": 0.6387976613010118, "grad_norm": 1.8170406818389893, "learning_rate": 1.938426595626305e-05, "loss": 0.1092, "step": 43375 }, { "epoch": 0.6391658443910988, "grad_norm": 1.5318728685379028, "learning_rate": 1.9382629584296065e-05, "loss": 0.1059, "step": 43400 }, { "epoch": 0.6395340274811858, "grad_norm": 1.771115779876709, "learning_rate": 1.9380993212329083e-05, "loss": 0.109, "step": 43425 }, { "epoch": 0.6399022105712728, "grad_norm": 1.321984887123108, "learning_rate": 1.9379356840362098e-05, "loss": 0.1135, "step": 43450 }, { "epoch": 0.64027039366136, "grad_norm": 1.3801411390304565, "learning_rate": 1.9377720468395112e-05, "loss": 0.1083, "step": 43475 }, { "epoch": 0.640638576751447, "grad_norm": 1.4694974422454834, "learning_rate": 1.937608409642813e-05, "loss": 0.1127, "step": 43500 }, { "epoch": 0.641006759841534, "grad_norm": 1.6573680639266968, "learning_rate": 1.9374447724461144e-05, "loss": 0.1092, "step": 43525 }, { "epoch": 0.641374942931621, "grad_norm": 1.3211015462875366, "learning_rate": 1.9372811352494162e-05, "loss": 0.1069, "step": 43550 }, { "epoch": 0.641743126021708, "grad_norm": 1.2467752695083618, "learning_rate": 1.9371174980527173e-05, "loss": 0.1052, "step": 43575 }, { "epoch": 0.6421113091117951, "grad_norm": 1.2694215774536133, "learning_rate": 1.936953860856019e-05, "loss": 0.1136, "step": 43600 }, { "epoch": 0.6424794922018822, "grad_norm": 1.6949684619903564, "learning_rate": 1.9367902236593206e-05, "loss": 0.1119, "step": 43625 }, { "epoch": 0.6428476752919692, "grad_norm": 1.5692588090896606, "learning_rate": 1.936626586462622e-05, "loss": 0.1033, "step": 43650 }, { "epoch": 0.6432158583820562, "grad_norm": 1.5174171924591064, "learning_rate": 1.9364694947537917e-05, "loss": 0.1063, "step": 43675 }, { "epoch": 0.6435840414721433, "grad_norm": 1.9864505529403687, "learning_rate": 1.9363058575570932e-05, "loss": 0.111, "step": 43700 }, { "epoch": 0.6439522245622303, "grad_norm": 1.7174837589263916, "learning_rate": 1.9361422203603946e-05, "loss": 0.114, "step": 43725 }, { "epoch": 0.6443204076523174, "grad_norm": 1.2995622158050537, "learning_rate": 1.9359785831636964e-05, "loss": 0.1177, "step": 43750 }, { "epoch": 0.6446885907424044, "grad_norm": 1.2390779256820679, "learning_rate": 1.935814945966998e-05, "loss": 0.1078, "step": 43775 }, { "epoch": 0.6450567738324914, "grad_norm": 1.2460603713989258, "learning_rate": 1.9356513087702993e-05, "loss": 0.1135, "step": 43800 }, { "epoch": 0.6454249569225785, "grad_norm": 1.3848339319229126, "learning_rate": 1.9354876715736008e-05, "loss": 0.1114, "step": 43825 }, { "epoch": 0.6457931400126655, "grad_norm": 1.4496217966079712, "learning_rate": 1.9353240343769025e-05, "loss": 0.1001, "step": 43850 }, { "epoch": 0.6461613231027525, "grad_norm": 1.855474591255188, "learning_rate": 1.935160397180204e-05, "loss": 0.1069, "step": 43875 }, { "epoch": 0.6465295061928396, "grad_norm": 1.4969669580459595, "learning_rate": 1.9349967599835054e-05, "loss": 0.1193, "step": 43900 }, { "epoch": 0.6468976892829266, "grad_norm": 1.3219048976898193, "learning_rate": 1.9348331227868072e-05, "loss": 0.0964, "step": 43925 }, { "epoch": 0.6472658723730137, "grad_norm": 1.2522467374801636, "learning_rate": 1.9346694855901087e-05, "loss": 0.0986, "step": 43950 }, { "epoch": 0.6476340554631007, "grad_norm": 1.5771262645721436, "learning_rate": 1.93450584839341e-05, "loss": 0.1143, "step": 43975 }, { "epoch": 0.6480022385531877, "grad_norm": 1.211543321609497, "learning_rate": 1.9343422111967116e-05, "loss": 0.1013, "step": 44000 }, { "epoch": 0.6483704216432747, "grad_norm": 1.8445452451705933, "learning_rate": 1.9341785740000133e-05, "loss": 0.1195, "step": 44025 }, { "epoch": 0.6487386047333618, "grad_norm": 1.344293475151062, "learning_rate": 1.9340149368033148e-05, "loss": 0.1063, "step": 44050 }, { "epoch": 0.6491067878234489, "grad_norm": 1.3796851634979248, "learning_rate": 1.9338512996066162e-05, "loss": 0.106, "step": 44075 }, { "epoch": 0.6494749709135359, "grad_norm": 1.2502880096435547, "learning_rate": 1.933687662409918e-05, "loss": 0.1091, "step": 44100 }, { "epoch": 0.6498431540036229, "grad_norm": 1.391850233078003, "learning_rate": 1.9335240252132195e-05, "loss": 0.1119, "step": 44125 }, { "epoch": 0.6502113370937099, "grad_norm": 1.270440936088562, "learning_rate": 1.933360388016521e-05, "loss": 0.1073, "step": 44150 }, { "epoch": 0.6505795201837969, "grad_norm": 1.4724406003952026, "learning_rate": 1.9331967508198227e-05, "loss": 0.1171, "step": 44175 }, { "epoch": 0.6509477032738841, "grad_norm": 0.945414662361145, "learning_rate": 1.9330331136231238e-05, "loss": 0.103, "step": 44200 }, { "epoch": 0.6513158863639711, "grad_norm": 1.268459439277649, "learning_rate": 1.9328694764264256e-05, "loss": 0.1011, "step": 44225 }, { "epoch": 0.6516840694540581, "grad_norm": 1.4114058017730713, "learning_rate": 1.932705839229727e-05, "loss": 0.1106, "step": 44250 }, { "epoch": 0.6520522525441451, "grad_norm": 1.3198155164718628, "learning_rate": 1.9325422020330288e-05, "loss": 0.0992, "step": 44275 }, { "epoch": 0.6524204356342322, "grad_norm": 1.4559319019317627, "learning_rate": 1.9323785648363303e-05, "loss": 0.1134, "step": 44300 }, { "epoch": 0.6527886187243193, "grad_norm": 1.640488624572754, "learning_rate": 1.9322149276396317e-05, "loss": 0.107, "step": 44325 }, { "epoch": 0.6531568018144063, "grad_norm": 1.2860337495803833, "learning_rate": 1.9320512904429335e-05, "loss": 0.1009, "step": 44350 }, { "epoch": 0.6535249849044933, "grad_norm": 1.3136343955993652, "learning_rate": 1.931887653246235e-05, "loss": 0.1106, "step": 44375 }, { "epoch": 0.6538931679945803, "grad_norm": 1.3006441593170166, "learning_rate": 1.9317240160495364e-05, "loss": 0.1101, "step": 44400 }, { "epoch": 0.6542613510846674, "grad_norm": 1.223717212677002, "learning_rate": 1.9315603788528378e-05, "loss": 0.1096, "step": 44425 }, { "epoch": 0.6546295341747544, "grad_norm": 2.2580654621124268, "learning_rate": 1.9313967416561393e-05, "loss": 0.1163, "step": 44450 }, { "epoch": 0.6549977172648415, "grad_norm": 1.3434696197509766, "learning_rate": 1.931233104459441e-05, "loss": 0.1031, "step": 44475 }, { "epoch": 0.6553659003549285, "grad_norm": 1.7967331409454346, "learning_rate": 1.9310694672627425e-05, "loss": 0.1071, "step": 44500 }, { "epoch": 0.6557340834450155, "grad_norm": 1.4282450675964355, "learning_rate": 1.9309058300660443e-05, "loss": 0.1068, "step": 44525 }, { "epoch": 0.6561022665351026, "grad_norm": 1.3770360946655273, "learning_rate": 1.9307421928693457e-05, "loss": 0.1076, "step": 44550 }, { "epoch": 0.6564704496251896, "grad_norm": 1.8066473007202148, "learning_rate": 1.9305785556726472e-05, "loss": 0.1057, "step": 44575 }, { "epoch": 0.6568386327152766, "grad_norm": 1.4721416234970093, "learning_rate": 1.930414918475949e-05, "loss": 0.1073, "step": 44600 }, { "epoch": 0.6572068158053637, "grad_norm": 1.9056334495544434, "learning_rate": 1.93025128127925e-05, "loss": 0.1033, "step": 44625 }, { "epoch": 0.6575749988954507, "grad_norm": 1.0975908041000366, "learning_rate": 1.930087644082552e-05, "loss": 0.1067, "step": 44650 }, { "epoch": 0.6579431819855378, "grad_norm": 2.040506362915039, "learning_rate": 1.9299240068858533e-05, "loss": 0.1071, "step": 44675 }, { "epoch": 0.6583113650756248, "grad_norm": 1.5115771293640137, "learning_rate": 1.929760369689155e-05, "loss": 0.1092, "step": 44700 }, { "epoch": 0.6586795481657118, "grad_norm": 1.4625195264816284, "learning_rate": 1.9295967324924565e-05, "loss": 0.1043, "step": 44725 }, { "epoch": 0.6590477312557989, "grad_norm": 1.707213282585144, "learning_rate": 1.929433095295758e-05, "loss": 0.1064, "step": 44750 }, { "epoch": 0.6594159143458859, "grad_norm": 1.4760093688964844, "learning_rate": 1.9292694580990598e-05, "loss": 0.1084, "step": 44775 }, { "epoch": 0.659784097435973, "grad_norm": 1.9389700889587402, "learning_rate": 1.9291058209023612e-05, "loss": 0.1023, "step": 44800 }, { "epoch": 0.66015228052606, "grad_norm": 1.11986243724823, "learning_rate": 1.9289421837056627e-05, "loss": 0.0964, "step": 44825 }, { "epoch": 0.660520463616147, "grad_norm": 1.4513438940048218, "learning_rate": 1.928778546508964e-05, "loss": 0.1132, "step": 44850 }, { "epoch": 0.660888646706234, "grad_norm": 1.8370686769485474, "learning_rate": 1.9286149093122655e-05, "loss": 0.1034, "step": 44875 }, { "epoch": 0.6612568297963212, "grad_norm": 1.4491779804229736, "learning_rate": 1.9284512721155673e-05, "loss": 0.1071, "step": 44900 }, { "epoch": 0.6616250128864082, "grad_norm": 1.2620149850845337, "learning_rate": 1.9282876349188688e-05, "loss": 0.1063, "step": 44925 }, { "epoch": 0.6619931959764952, "grad_norm": 1.2185320854187012, "learning_rate": 1.9281239977221706e-05, "loss": 0.1048, "step": 44950 }, { "epoch": 0.6623613790665822, "grad_norm": 1.1284700632095337, "learning_rate": 1.927960360525472e-05, "loss": 0.1087, "step": 44975 }, { "epoch": 0.6627295621566692, "grad_norm": 1.401711344718933, "learning_rate": 1.9277967233287735e-05, "loss": 0.1018, "step": 45000 }, { "epoch": 0.6630977452467564, "grad_norm": 1.722172737121582, "learning_rate": 1.9276330861320752e-05, "loss": 0.1013, "step": 45025 }, { "epoch": 0.6634659283368434, "grad_norm": 1.3857054710388184, "learning_rate": 1.9274694489353763e-05, "loss": 0.1008, "step": 45050 }, { "epoch": 0.6638341114269304, "grad_norm": 1.1007604598999023, "learning_rate": 1.927305811738678e-05, "loss": 0.103, "step": 45075 }, { "epoch": 0.6642022945170174, "grad_norm": 1.9387741088867188, "learning_rate": 1.9271421745419796e-05, "loss": 0.1052, "step": 45100 }, { "epoch": 0.6645704776071044, "grad_norm": 1.5912046432495117, "learning_rate": 1.926978537345281e-05, "loss": 0.1066, "step": 45125 }, { "epoch": 0.6649386606971915, "grad_norm": 1.3904691934585571, "learning_rate": 1.9268149001485828e-05, "loss": 0.1111, "step": 45150 }, { "epoch": 0.6653068437872786, "grad_norm": 1.140342354774475, "learning_rate": 1.9266512629518842e-05, "loss": 0.0987, "step": 45175 }, { "epoch": 0.6656750268773656, "grad_norm": 1.0333884954452515, "learning_rate": 1.926487625755186e-05, "loss": 0.1088, "step": 45200 }, { "epoch": 0.6660432099674526, "grad_norm": 1.5406895875930786, "learning_rate": 1.9263239885584875e-05, "loss": 0.1052, "step": 45225 }, { "epoch": 0.6664113930575396, "grad_norm": 1.225435495376587, "learning_rate": 1.926160351361789e-05, "loss": 0.0954, "step": 45250 }, { "epoch": 0.6667795761476267, "grad_norm": 1.4176855087280273, "learning_rate": 1.9259967141650904e-05, "loss": 0.0989, "step": 45275 }, { "epoch": 0.6671477592377137, "grad_norm": 1.8527737855911255, "learning_rate": 1.9258330769683918e-05, "loss": 0.1098, "step": 45300 }, { "epoch": 0.6675159423278008, "grad_norm": 1.5954591035842896, "learning_rate": 1.9256694397716936e-05, "loss": 0.1018, "step": 45325 }, { "epoch": 0.6678841254178878, "grad_norm": 1.2099926471710205, "learning_rate": 1.925505802574995e-05, "loss": 0.1029, "step": 45350 }, { "epoch": 0.6682523085079748, "grad_norm": 1.6647226810455322, "learning_rate": 1.9253421653782965e-05, "loss": 0.1101, "step": 45375 }, { "epoch": 0.6686204915980619, "grad_norm": 1.2445528507232666, "learning_rate": 1.9251785281815983e-05, "loss": 0.1045, "step": 45400 }, { "epoch": 0.6689886746881489, "grad_norm": 1.1713930368423462, "learning_rate": 1.9250148909848997e-05, "loss": 0.1051, "step": 45425 }, { "epoch": 0.669356857778236, "grad_norm": 1.7021634578704834, "learning_rate": 1.9248512537882015e-05, "loss": 0.0968, "step": 45450 }, { "epoch": 0.669725040868323, "grad_norm": 1.8190854787826538, "learning_rate": 1.9246876165915026e-05, "loss": 0.1001, "step": 45475 }, { "epoch": 0.6700932239584101, "grad_norm": 1.7377605438232422, "learning_rate": 1.9245239793948044e-05, "loss": 0.1045, "step": 45500 }, { "epoch": 0.6704614070484971, "grad_norm": 1.0050832033157349, "learning_rate": 1.924360342198106e-05, "loss": 0.1011, "step": 45525 }, { "epoch": 0.6708295901385841, "grad_norm": 1.3286410570144653, "learning_rate": 1.9241967050014073e-05, "loss": 0.116, "step": 45550 }, { "epoch": 0.6711977732286711, "grad_norm": 1.4806352853775024, "learning_rate": 1.924033067804709e-05, "loss": 0.1095, "step": 45575 }, { "epoch": 0.6715659563187582, "grad_norm": 1.5563225746154785, "learning_rate": 1.9238694306080105e-05, "loss": 0.1004, "step": 45600 }, { "epoch": 0.6719341394088453, "grad_norm": 1.7315696477890015, "learning_rate": 1.9237057934113123e-05, "loss": 0.0998, "step": 45625 }, { "epoch": 0.6723023224989323, "grad_norm": 1.3105772733688354, "learning_rate": 1.9235421562146137e-05, "loss": 0.0992, "step": 45650 }, { "epoch": 0.6726705055890193, "grad_norm": 1.2332693338394165, "learning_rate": 1.9233785190179152e-05, "loss": 0.1004, "step": 45675 }, { "epoch": 0.6730386886791063, "grad_norm": 1.423226237297058, "learning_rate": 1.9232148818212166e-05, "loss": 0.1114, "step": 45700 }, { "epoch": 0.6734068717691933, "grad_norm": 1.5766901969909668, "learning_rate": 1.923057790112386e-05, "loss": 0.1077, "step": 45725 }, { "epoch": 0.6737750548592805, "grad_norm": 1.4316579103469849, "learning_rate": 1.9228941529156878e-05, "loss": 0.1067, "step": 45750 }, { "epoch": 0.6741432379493675, "grad_norm": 1.2268834114074707, "learning_rate": 1.9227305157189893e-05, "loss": 0.1103, "step": 45775 }, { "epoch": 0.6745114210394545, "grad_norm": 1.6540658473968506, "learning_rate": 1.9225668785222907e-05, "loss": 0.1024, "step": 45800 }, { "epoch": 0.6748796041295415, "grad_norm": 1.6553481817245483, "learning_rate": 1.9224032413255925e-05, "loss": 0.1014, "step": 45825 }, { "epoch": 0.6752477872196285, "grad_norm": 1.6704182624816895, "learning_rate": 1.922239604128894e-05, "loss": 0.1028, "step": 45850 }, { "epoch": 0.6756159703097157, "grad_norm": 1.7540957927703857, "learning_rate": 1.9220759669321957e-05, "loss": 0.111, "step": 45875 }, { "epoch": 0.6759841533998027, "grad_norm": 1.5921074151992798, "learning_rate": 1.9219123297354968e-05, "loss": 0.1012, "step": 45900 }, { "epoch": 0.6763523364898897, "grad_norm": 1.7300814390182495, "learning_rate": 1.9217486925387986e-05, "loss": 0.102, "step": 45925 }, { "epoch": 0.6767205195799767, "grad_norm": 1.493948221206665, "learning_rate": 1.9215850553421e-05, "loss": 0.1036, "step": 45950 }, { "epoch": 0.6770887026700638, "grad_norm": 1.2351493835449219, "learning_rate": 1.9214214181454015e-05, "loss": 0.1038, "step": 45975 }, { "epoch": 0.6774568857601508, "grad_norm": 1.4516537189483643, "learning_rate": 1.9212577809487033e-05, "loss": 0.1038, "step": 46000 }, { "epoch": 0.6778250688502379, "grad_norm": 1.8888888359069824, "learning_rate": 1.9210941437520047e-05, "loss": 0.1044, "step": 46025 }, { "epoch": 0.6781932519403249, "grad_norm": 1.982967495918274, "learning_rate": 1.9209305065553062e-05, "loss": 0.0951, "step": 46050 }, { "epoch": 0.6785614350304119, "grad_norm": 1.325954794883728, "learning_rate": 1.920766869358608e-05, "loss": 0.1011, "step": 46075 }, { "epoch": 0.678929618120499, "grad_norm": 1.3796824216842651, "learning_rate": 1.9206032321619094e-05, "loss": 0.1051, "step": 46100 }, { "epoch": 0.679297801210586, "grad_norm": 1.883231520652771, "learning_rate": 1.920439594965211e-05, "loss": 0.1092, "step": 46125 }, { "epoch": 0.679665984300673, "grad_norm": 1.543330430984497, "learning_rate": 1.9202759577685123e-05, "loss": 0.1077, "step": 46150 }, { "epoch": 0.6800341673907601, "grad_norm": 1.438488245010376, "learning_rate": 1.920112320571814e-05, "loss": 0.1017, "step": 46175 }, { "epoch": 0.6804023504808471, "grad_norm": 1.5786123275756836, "learning_rate": 1.9199486833751155e-05, "loss": 0.1038, "step": 46200 }, { "epoch": 0.6807705335709342, "grad_norm": 1.271911382675171, "learning_rate": 1.919785046178417e-05, "loss": 0.1069, "step": 46225 }, { "epoch": 0.6811387166610212, "grad_norm": 1.5244625806808472, "learning_rate": 1.9196214089817188e-05, "loss": 0.0964, "step": 46250 }, { "epoch": 0.6815068997511082, "grad_norm": 1.725836157798767, "learning_rate": 1.9194577717850202e-05, "loss": 0.1022, "step": 46275 }, { "epoch": 0.6818750828411952, "grad_norm": 1.7017706632614136, "learning_rate": 1.9192941345883217e-05, "loss": 0.0946, "step": 46300 }, { "epoch": 0.6822432659312823, "grad_norm": 1.0990180969238281, "learning_rate": 1.919130497391623e-05, "loss": 0.0988, "step": 46325 }, { "epoch": 0.6826114490213694, "grad_norm": 1.1162875890731812, "learning_rate": 1.918966860194925e-05, "loss": 0.1001, "step": 46350 }, { "epoch": 0.6829796321114564, "grad_norm": 1.4849499464035034, "learning_rate": 1.9188032229982263e-05, "loss": 0.0977, "step": 46375 }, { "epoch": 0.6833478152015434, "grad_norm": 1.6886072158813477, "learning_rate": 1.9186395858015278e-05, "loss": 0.1129, "step": 46400 }, { "epoch": 0.6837159982916304, "grad_norm": 1.665539264678955, "learning_rate": 1.9184759486048296e-05, "loss": 0.1016, "step": 46425 }, { "epoch": 0.6840841813817174, "grad_norm": 1.4427030086517334, "learning_rate": 1.918312311408131e-05, "loss": 0.1056, "step": 46450 }, { "epoch": 0.6844523644718046, "grad_norm": 1.3449229001998901, "learning_rate": 1.9181486742114325e-05, "loss": 0.0998, "step": 46475 }, { "epoch": 0.6848205475618916, "grad_norm": 1.1819543838500977, "learning_rate": 1.9179850370147342e-05, "loss": 0.1079, "step": 46500 }, { "epoch": 0.6851887306519786, "grad_norm": 1.7344188690185547, "learning_rate": 1.9178213998180353e-05, "loss": 0.1093, "step": 46525 }, { "epoch": 0.6855569137420656, "grad_norm": 1.2552084922790527, "learning_rate": 1.917657762621337e-05, "loss": 0.1038, "step": 46550 }, { "epoch": 0.6859250968321527, "grad_norm": 1.4446220397949219, "learning_rate": 1.9174941254246386e-05, "loss": 0.1054, "step": 46575 }, { "epoch": 0.6862932799222398, "grad_norm": 1.3900601863861084, "learning_rate": 1.9173304882279404e-05, "loss": 0.1096, "step": 46600 }, { "epoch": 0.6866614630123268, "grad_norm": 1.7714508771896362, "learning_rate": 1.9171668510312418e-05, "loss": 0.1009, "step": 46625 }, { "epoch": 0.6870296461024138, "grad_norm": 1.361495852470398, "learning_rate": 1.9170032138345432e-05, "loss": 0.0987, "step": 46650 }, { "epoch": 0.6873978291925008, "grad_norm": 1.5515962839126587, "learning_rate": 1.916839576637845e-05, "loss": 0.1005, "step": 46675 }, { "epoch": 0.6877660122825879, "grad_norm": 1.361990213394165, "learning_rate": 1.9166759394411465e-05, "loss": 0.1097, "step": 46700 }, { "epoch": 0.688134195372675, "grad_norm": 1.5414332151412964, "learning_rate": 1.916512302244448e-05, "loss": 0.1074, "step": 46725 }, { "epoch": 0.688502378462762, "grad_norm": 1.8689407110214233, "learning_rate": 1.9163486650477494e-05, "loss": 0.0993, "step": 46750 }, { "epoch": 0.688870561552849, "grad_norm": 1.325414776802063, "learning_rate": 1.9161850278510508e-05, "loss": 0.1015, "step": 46775 }, { "epoch": 0.689238744642936, "grad_norm": 1.1635738611221313, "learning_rate": 1.9160213906543526e-05, "loss": 0.1052, "step": 46800 }, { "epoch": 0.6896069277330231, "grad_norm": 1.255697250366211, "learning_rate": 1.915857753457654e-05, "loss": 0.0973, "step": 46825 }, { "epoch": 0.6899751108231101, "grad_norm": 2.148343563079834, "learning_rate": 1.915694116260956e-05, "loss": 0.0998, "step": 46850 }, { "epoch": 0.6903432939131972, "grad_norm": 2.244750738143921, "learning_rate": 1.9155304790642573e-05, "loss": 0.1032, "step": 46875 }, { "epoch": 0.6907114770032842, "grad_norm": 1.555161476135254, "learning_rate": 1.9153668418675587e-05, "loss": 0.1129, "step": 46900 }, { "epoch": 0.6910796600933712, "grad_norm": 1.2285830974578857, "learning_rate": 1.9152032046708605e-05, "loss": 0.102, "step": 46925 }, { "epoch": 0.6914478431834583, "grad_norm": 1.63913893699646, "learning_rate": 1.9150395674741616e-05, "loss": 0.1069, "step": 46950 }, { "epoch": 0.6918160262735453, "grad_norm": 1.7224215269088745, "learning_rate": 1.9148759302774634e-05, "loss": 0.1048, "step": 46975 }, { "epoch": 0.6921842093636323, "grad_norm": 1.382535696029663, "learning_rate": 1.914712293080765e-05, "loss": 0.107, "step": 47000 }, { "epoch": 0.6925523924537194, "grad_norm": 1.5920780897140503, "learning_rate": 1.9145486558840666e-05, "loss": 0.1054, "step": 47025 }, { "epoch": 0.6929205755438064, "grad_norm": 1.670591950416565, "learning_rate": 1.914385018687368e-05, "loss": 0.1017, "step": 47050 }, { "epoch": 0.6932887586338935, "grad_norm": 1.547635793685913, "learning_rate": 1.9142213814906695e-05, "loss": 0.1063, "step": 47075 }, { "epoch": 0.6936569417239805, "grad_norm": 1.6910289525985718, "learning_rate": 1.9140577442939713e-05, "loss": 0.1062, "step": 47100 }, { "epoch": 0.6940251248140675, "grad_norm": 1.394062876701355, "learning_rate": 1.9138941070972728e-05, "loss": 0.1, "step": 47125 }, { "epoch": 0.6943933079041545, "grad_norm": 1.03938889503479, "learning_rate": 1.9137304699005742e-05, "loss": 0.0971, "step": 47150 }, { "epoch": 0.6947614909942417, "grad_norm": 1.3395599126815796, "learning_rate": 1.9135668327038756e-05, "loss": 0.0974, "step": 47175 }, { "epoch": 0.6951296740843287, "grad_norm": 1.4715617895126343, "learning_rate": 1.913403195507177e-05, "loss": 0.1013, "step": 47200 }, { "epoch": 0.6954978571744157, "grad_norm": 1.3387597799301147, "learning_rate": 1.913239558310479e-05, "loss": 0.1071, "step": 47225 }, { "epoch": 0.6958660402645027, "grad_norm": 1.8374826908111572, "learning_rate": 1.9130759211137803e-05, "loss": 0.1064, "step": 47250 }, { "epoch": 0.6962342233545897, "grad_norm": 1.387162446975708, "learning_rate": 1.912912283917082e-05, "loss": 0.1077, "step": 47275 }, { "epoch": 0.6966024064446769, "grad_norm": 1.6131870746612549, "learning_rate": 1.9127486467203835e-05, "loss": 0.0996, "step": 47300 }, { "epoch": 0.6969705895347639, "grad_norm": 1.4666141271591187, "learning_rate": 1.912585009523685e-05, "loss": 0.0977, "step": 47325 }, { "epoch": 0.6973387726248509, "grad_norm": 1.093837857246399, "learning_rate": 1.9124213723269868e-05, "loss": 0.0956, "step": 47350 }, { "epoch": 0.6977069557149379, "grad_norm": 1.4562652111053467, "learning_rate": 1.912257735130288e-05, "loss": 0.1075, "step": 47375 }, { "epoch": 0.6980751388050249, "grad_norm": 1.4729552268981934, "learning_rate": 1.9120940979335897e-05, "loss": 0.0974, "step": 47400 }, { "epoch": 0.698443321895112, "grad_norm": 2.071443557739258, "learning_rate": 1.911930460736891e-05, "loss": 0.0979, "step": 47425 }, { "epoch": 0.6988115049851991, "grad_norm": 1.8818188905715942, "learning_rate": 1.9117668235401926e-05, "loss": 0.1072, "step": 47450 }, { "epoch": 0.6991796880752861, "grad_norm": 1.4410918951034546, "learning_rate": 1.9116031863434943e-05, "loss": 0.099, "step": 47475 }, { "epoch": 0.6995478711653731, "grad_norm": 1.4807765483856201, "learning_rate": 1.9114395491467958e-05, "loss": 0.1082, "step": 47500 }, { "epoch": 0.6999160542554601, "grad_norm": 1.0479247570037842, "learning_rate": 1.9112759119500976e-05, "loss": 0.0945, "step": 47525 }, { "epoch": 0.7002842373455472, "grad_norm": 1.7598178386688232, "learning_rate": 1.911112274753399e-05, "loss": 0.0995, "step": 47550 }, { "epoch": 0.7006524204356342, "grad_norm": 0.9280902147293091, "learning_rate": 1.9109486375567005e-05, "loss": 0.0956, "step": 47575 }, { "epoch": 0.7010206035257213, "grad_norm": 1.8822145462036133, "learning_rate": 1.910785000360002e-05, "loss": 0.0997, "step": 47600 }, { "epoch": 0.7013887866158083, "grad_norm": 1.2990777492523193, "learning_rate": 1.9106213631633034e-05, "loss": 0.0977, "step": 47625 }, { "epoch": 0.7017569697058953, "grad_norm": 1.517279863357544, "learning_rate": 1.910457725966605e-05, "loss": 0.1051, "step": 47650 }, { "epoch": 0.7021251527959824, "grad_norm": 1.7701480388641357, "learning_rate": 1.9102940887699066e-05, "loss": 0.0956, "step": 47675 }, { "epoch": 0.7024933358860694, "grad_norm": 1.5461384057998657, "learning_rate": 1.910130451573208e-05, "loss": 0.1058, "step": 47700 }, { "epoch": 0.7028615189761565, "grad_norm": 2.013125419616699, "learning_rate": 1.9099668143765098e-05, "loss": 0.1029, "step": 47725 }, { "epoch": 0.7032297020662435, "grad_norm": 1.3453510999679565, "learning_rate": 1.9098031771798113e-05, "loss": 0.1037, "step": 47750 }, { "epoch": 0.7035978851563306, "grad_norm": 1.1628808975219727, "learning_rate": 1.9096395399831127e-05, "loss": 0.1009, "step": 47775 }, { "epoch": 0.7039660682464176, "grad_norm": 1.7610334157943726, "learning_rate": 1.909475902786414e-05, "loss": 0.0948, "step": 47800 }, { "epoch": 0.7043342513365046, "grad_norm": 1.581941843032837, "learning_rate": 1.909312265589716e-05, "loss": 0.1027, "step": 47825 }, { "epoch": 0.7047024344265916, "grad_norm": 1.7696298360824585, "learning_rate": 1.9091486283930174e-05, "loss": 0.0988, "step": 47850 }, { "epoch": 0.7050706175166787, "grad_norm": 1.5074560642242432, "learning_rate": 1.9089915366841868e-05, "loss": 0.1077, "step": 47875 }, { "epoch": 0.7054388006067658, "grad_norm": 1.5226465463638306, "learning_rate": 1.9088278994874886e-05, "loss": 0.1016, "step": 47900 }, { "epoch": 0.7058069836968528, "grad_norm": 1.6150729656219482, "learning_rate": 1.90866426229079e-05, "loss": 0.1051, "step": 47925 }, { "epoch": 0.7061751667869398, "grad_norm": 1.286678433418274, "learning_rate": 1.9085006250940915e-05, "loss": 0.0969, "step": 47950 }, { "epoch": 0.7065433498770268, "grad_norm": 1.6489728689193726, "learning_rate": 1.9083369878973932e-05, "loss": 0.1043, "step": 47975 }, { "epoch": 0.7069115329671138, "grad_norm": 1.15176260471344, "learning_rate": 1.9081733507006947e-05, "loss": 0.1039, "step": 48000 }, { "epoch": 0.707279716057201, "grad_norm": 1.4781898260116577, "learning_rate": 1.908009713503996e-05, "loss": 0.1004, "step": 48025 }, { "epoch": 0.707647899147288, "grad_norm": 1.9934347867965698, "learning_rate": 1.9078460763072976e-05, "loss": 0.0992, "step": 48050 }, { "epoch": 0.708016082237375, "grad_norm": 1.642067313194275, "learning_rate": 1.9076824391105994e-05, "loss": 0.1089, "step": 48075 }, { "epoch": 0.708384265327462, "grad_norm": 1.3216331005096436, "learning_rate": 1.9075188019139008e-05, "loss": 0.1035, "step": 48100 }, { "epoch": 0.708752448417549, "grad_norm": 1.0773968696594238, "learning_rate": 1.9073551647172022e-05, "loss": 0.1009, "step": 48125 }, { "epoch": 0.7091206315076362, "grad_norm": 1.0356942415237427, "learning_rate": 1.907191527520504e-05, "loss": 0.1127, "step": 48150 }, { "epoch": 0.7094888145977232, "grad_norm": 1.7541707754135132, "learning_rate": 1.9070278903238055e-05, "loss": 0.102, "step": 48175 }, { "epoch": 0.7098569976878102, "grad_norm": 1.1507172584533691, "learning_rate": 1.906864253127107e-05, "loss": 0.1051, "step": 48200 }, { "epoch": 0.7102251807778972, "grad_norm": 1.210410237312317, "learning_rate": 1.9067006159304084e-05, "loss": 0.0963, "step": 48225 }, { "epoch": 0.7105933638679842, "grad_norm": 1.0585200786590576, "learning_rate": 1.90653697873371e-05, "loss": 0.0939, "step": 48250 }, { "epoch": 0.7109615469580713, "grad_norm": 1.4461408853530884, "learning_rate": 1.9063733415370116e-05, "loss": 0.0969, "step": 48275 }, { "epoch": 0.7113297300481584, "grad_norm": 1.4013006687164307, "learning_rate": 1.906209704340313e-05, "loss": 0.1043, "step": 48300 }, { "epoch": 0.7116979131382454, "grad_norm": 1.6809771060943604, "learning_rate": 1.906046067143615e-05, "loss": 0.1049, "step": 48325 }, { "epoch": 0.7120660962283324, "grad_norm": 1.4544810056686401, "learning_rate": 1.9058824299469163e-05, "loss": 0.0983, "step": 48350 }, { "epoch": 0.7124342793184195, "grad_norm": 1.4573628902435303, "learning_rate": 1.9057187927502177e-05, "loss": 0.1028, "step": 48375 }, { "epoch": 0.7128024624085065, "grad_norm": 1.4733716249465942, "learning_rate": 1.9055551555535195e-05, "loss": 0.0979, "step": 48400 }, { "epoch": 0.7131706454985935, "grad_norm": 1.5174107551574707, "learning_rate": 1.905391518356821e-05, "loss": 0.1083, "step": 48425 }, { "epoch": 0.7135388285886806, "grad_norm": 1.47136652469635, "learning_rate": 1.9052278811601224e-05, "loss": 0.0999, "step": 48450 }, { "epoch": 0.7139070116787676, "grad_norm": 1.5804349184036255, "learning_rate": 1.905064243963424e-05, "loss": 0.0951, "step": 48475 }, { "epoch": 0.7142751947688547, "grad_norm": 1.93483567237854, "learning_rate": 1.9049006067667256e-05, "loss": 0.1065, "step": 48500 }, { "epoch": 0.7146433778589417, "grad_norm": 1.601036787033081, "learning_rate": 1.904736969570027e-05, "loss": 0.1056, "step": 48525 }, { "epoch": 0.7150115609490287, "grad_norm": 0.9350731372833252, "learning_rate": 1.9045733323733285e-05, "loss": 0.1049, "step": 48550 }, { "epoch": 0.7153797440391158, "grad_norm": 1.6524691581726074, "learning_rate": 1.9044096951766303e-05, "loss": 0.0961, "step": 48575 }, { "epoch": 0.7157479271292028, "grad_norm": 1.2378066778182983, "learning_rate": 1.9042460579799318e-05, "loss": 0.0932, "step": 48600 }, { "epoch": 0.7161161102192899, "grad_norm": 1.697403907775879, "learning_rate": 1.9040824207832332e-05, "loss": 0.106, "step": 48625 }, { "epoch": 0.7164842933093769, "grad_norm": 1.5412137508392334, "learning_rate": 1.9039187835865346e-05, "loss": 0.111, "step": 48650 }, { "epoch": 0.7168524763994639, "grad_norm": 1.4458122253417969, "learning_rate": 1.9037551463898364e-05, "loss": 0.101, "step": 48675 }, { "epoch": 0.7172206594895509, "grad_norm": 2.023613691329956, "learning_rate": 1.903591509193138e-05, "loss": 0.1081, "step": 48700 }, { "epoch": 0.717588842579638, "grad_norm": 1.2220475673675537, "learning_rate": 1.9034278719964393e-05, "loss": 0.0996, "step": 48725 }, { "epoch": 0.7179570256697251, "grad_norm": 1.5562247037887573, "learning_rate": 1.903264234799741e-05, "loss": 0.1014, "step": 48750 }, { "epoch": 0.7183252087598121, "grad_norm": 1.511494755744934, "learning_rate": 1.9031005976030425e-05, "loss": 0.1051, "step": 48775 }, { "epoch": 0.7186933918498991, "grad_norm": 1.2906818389892578, "learning_rate": 1.902936960406344e-05, "loss": 0.0984, "step": 48800 }, { "epoch": 0.7190615749399861, "grad_norm": 1.2554727792739868, "learning_rate": 1.9027733232096458e-05, "loss": 0.0995, "step": 48825 }, { "epoch": 0.7194297580300731, "grad_norm": 1.3066587448120117, "learning_rate": 1.902609686012947e-05, "loss": 0.1006, "step": 48850 }, { "epoch": 0.7197979411201603, "grad_norm": 1.3932998180389404, "learning_rate": 1.9024460488162487e-05, "loss": 0.0994, "step": 48875 }, { "epoch": 0.7201661242102473, "grad_norm": 1.5048588514328003, "learning_rate": 1.90228241161955e-05, "loss": 0.0983, "step": 48900 }, { "epoch": 0.7205343073003343, "grad_norm": 1.2415173053741455, "learning_rate": 1.902118774422852e-05, "loss": 0.1065, "step": 48925 }, { "epoch": 0.7209024903904213, "grad_norm": 1.327630877494812, "learning_rate": 1.9019551372261533e-05, "loss": 0.0943, "step": 48950 }, { "epoch": 0.7212706734805084, "grad_norm": 1.4476395845413208, "learning_rate": 1.9017915000294548e-05, "loss": 0.1102, "step": 48975 }, { "epoch": 0.7216388565705955, "grad_norm": 1.1207923889160156, "learning_rate": 1.9016278628327566e-05, "loss": 0.0969, "step": 49000 }, { "epoch": 0.7220070396606825, "grad_norm": 1.5516877174377441, "learning_rate": 1.901464225636058e-05, "loss": 0.0969, "step": 49025 }, { "epoch": 0.7223752227507695, "grad_norm": 1.5809789896011353, "learning_rate": 1.9013005884393595e-05, "loss": 0.1005, "step": 49050 }, { "epoch": 0.7227434058408565, "grad_norm": 1.2460857629776, "learning_rate": 1.901136951242661e-05, "loss": 0.0944, "step": 49075 }, { "epoch": 0.7231115889309436, "grad_norm": 1.2830605506896973, "learning_rate": 1.9009733140459624e-05, "loss": 0.0912, "step": 49100 }, { "epoch": 0.7234797720210306, "grad_norm": 1.442643642425537, "learning_rate": 1.900809676849264e-05, "loss": 0.1051, "step": 49125 }, { "epoch": 0.7238479551111177, "grad_norm": 1.4712581634521484, "learning_rate": 1.9006460396525656e-05, "loss": 0.1013, "step": 49150 }, { "epoch": 0.7242161382012047, "grad_norm": 1.5503424406051636, "learning_rate": 1.9004824024558674e-05, "loss": 0.1065, "step": 49175 }, { "epoch": 0.7245843212912917, "grad_norm": 1.4155282974243164, "learning_rate": 1.9003187652591688e-05, "loss": 0.1058, "step": 49200 }, { "epoch": 0.7249525043813788, "grad_norm": 1.432908535003662, "learning_rate": 1.9001551280624703e-05, "loss": 0.0947, "step": 49225 }, { "epoch": 0.7253206874714658, "grad_norm": 1.153944969177246, "learning_rate": 1.8999914908657717e-05, "loss": 0.1052, "step": 49250 }, { "epoch": 0.7256888705615528, "grad_norm": 1.7101420164108276, "learning_rate": 1.899827853669073e-05, "loss": 0.0985, "step": 49275 }, { "epoch": 0.7260570536516399, "grad_norm": 1.1989765167236328, "learning_rate": 1.899664216472375e-05, "loss": 0.0974, "step": 49300 }, { "epoch": 0.7264252367417269, "grad_norm": 1.0323476791381836, "learning_rate": 1.8995005792756764e-05, "loss": 0.0998, "step": 49325 }, { "epoch": 0.726793419831814, "grad_norm": 1.4167869091033936, "learning_rate": 1.8993369420789782e-05, "loss": 0.0997, "step": 49350 }, { "epoch": 0.727161602921901, "grad_norm": 1.1620646715164185, "learning_rate": 1.8991733048822796e-05, "loss": 0.1069, "step": 49375 }, { "epoch": 0.727529786011988, "grad_norm": 0.7772794961929321, "learning_rate": 1.899009667685581e-05, "loss": 0.0982, "step": 49400 }, { "epoch": 0.727897969102075, "grad_norm": 1.385764241218567, "learning_rate": 1.898846030488883e-05, "loss": 0.0987, "step": 49425 }, { "epoch": 0.7282661521921621, "grad_norm": 1.3250408172607422, "learning_rate": 1.8986823932921843e-05, "loss": 0.0984, "step": 49450 }, { "epoch": 0.7286343352822492, "grad_norm": 1.0105986595153809, "learning_rate": 1.8985187560954857e-05, "loss": 0.0967, "step": 49475 }, { "epoch": 0.7290025183723362, "grad_norm": 1.5108296871185303, "learning_rate": 1.8983551188987872e-05, "loss": 0.1031, "step": 49500 }, { "epoch": 0.7293707014624232, "grad_norm": 1.0454095602035522, "learning_rate": 1.8981914817020886e-05, "loss": 0.0982, "step": 49525 }, { "epoch": 0.7297388845525102, "grad_norm": 0.983040452003479, "learning_rate": 1.8980278445053904e-05, "loss": 0.1011, "step": 49550 }, { "epoch": 0.7301070676425974, "grad_norm": 1.607489824295044, "learning_rate": 1.897864207308692e-05, "loss": 0.0948, "step": 49575 }, { "epoch": 0.7304752507326844, "grad_norm": 1.830379605293274, "learning_rate": 1.8977005701119936e-05, "loss": 0.0978, "step": 49600 }, { "epoch": 0.7308434338227714, "grad_norm": 1.6186549663543701, "learning_rate": 1.897536932915295e-05, "loss": 0.0967, "step": 49625 }, { "epoch": 0.7312116169128584, "grad_norm": 1.5712846517562866, "learning_rate": 1.8973732957185965e-05, "loss": 0.105, "step": 49650 }, { "epoch": 0.7315798000029454, "grad_norm": 1.5042476654052734, "learning_rate": 1.897209658521898e-05, "loss": 0.1053, "step": 49675 }, { "epoch": 0.7319479830930326, "grad_norm": 1.6564544439315796, "learning_rate": 1.8970460213251994e-05, "loss": 0.0985, "step": 49700 }, { "epoch": 0.7323161661831196, "grad_norm": 1.3116837739944458, "learning_rate": 1.8968823841285012e-05, "loss": 0.0964, "step": 49725 }, { "epoch": 0.7326843492732066, "grad_norm": 2.3516788482666016, "learning_rate": 1.8967187469318027e-05, "loss": 0.0924, "step": 49750 }, { "epoch": 0.7330525323632936, "grad_norm": 1.4502614736557007, "learning_rate": 1.896555109735104e-05, "loss": 0.1021, "step": 49775 }, { "epoch": 0.7334207154533806, "grad_norm": 1.4912548065185547, "learning_rate": 1.896391472538406e-05, "loss": 0.1108, "step": 49800 }, { "epoch": 0.7337888985434677, "grad_norm": 1.382511019706726, "learning_rate": 1.8962278353417073e-05, "loss": 0.1022, "step": 49825 }, { "epoch": 0.7341570816335548, "grad_norm": 1.6152180433273315, "learning_rate": 1.896064198145009e-05, "loss": 0.1032, "step": 49850 }, { "epoch": 0.7345252647236418, "grad_norm": 1.4031436443328857, "learning_rate": 1.8959071064361785e-05, "loss": 0.0986, "step": 49875 }, { "epoch": 0.7348934478137288, "grad_norm": 1.2503541707992554, "learning_rate": 1.89574346923948e-05, "loss": 0.0975, "step": 49900 }, { "epoch": 0.7352616309038158, "grad_norm": 1.4993572235107422, "learning_rate": 1.8955798320427814e-05, "loss": 0.0995, "step": 49925 }, { "epoch": 0.7356298139939029, "grad_norm": 1.1862359046936035, "learning_rate": 1.895416194846083e-05, "loss": 0.1063, "step": 49950 }, { "epoch": 0.7359979970839899, "grad_norm": 1.5639508962631226, "learning_rate": 1.8952525576493846e-05, "loss": 0.1043, "step": 49975 }, { "epoch": 0.736366180174077, "grad_norm": 1.435240387916565, "learning_rate": 1.895088920452686e-05, "loss": 0.098, "step": 50000 }, { "epoch": 0.736734363264164, "grad_norm": 1.7050061225891113, "learning_rate": 1.8949252832559875e-05, "loss": 0.0956, "step": 50025 }, { "epoch": 0.737102546354251, "grad_norm": 1.7929081916809082, "learning_rate": 1.8947616460592893e-05, "loss": 0.1079, "step": 50050 }, { "epoch": 0.7374707294443381, "grad_norm": 1.2964180707931519, "learning_rate": 1.8945980088625908e-05, "loss": 0.0921, "step": 50075 }, { "epoch": 0.7378389125344251, "grad_norm": 1.3983927965164185, "learning_rate": 1.8944343716658922e-05, "loss": 0.098, "step": 50100 }, { "epoch": 0.7382070956245121, "grad_norm": 1.472080111503601, "learning_rate": 1.8942707344691936e-05, "loss": 0.0974, "step": 50125 }, { "epoch": 0.7385752787145992, "grad_norm": 1.0241786241531372, "learning_rate": 1.8941070972724954e-05, "loss": 0.099, "step": 50150 }, { "epoch": 0.7389434618046863, "grad_norm": 1.2100846767425537, "learning_rate": 1.893943460075797e-05, "loss": 0.0942, "step": 50175 }, { "epoch": 0.7393116448947733, "grad_norm": 1.2217756509780884, "learning_rate": 1.8937798228790983e-05, "loss": 0.1009, "step": 50200 }, { "epoch": 0.7396798279848603, "grad_norm": 1.4107080698013306, "learning_rate": 1.8936161856824e-05, "loss": 0.0926, "step": 50225 }, { "epoch": 0.7400480110749473, "grad_norm": 1.2571321725845337, "learning_rate": 1.8934525484857015e-05, "loss": 0.0994, "step": 50250 }, { "epoch": 0.7404161941650343, "grad_norm": 1.1018223762512207, "learning_rate": 1.893288911289003e-05, "loss": 0.0966, "step": 50275 }, { "epoch": 0.7407843772551215, "grad_norm": 1.0355523824691772, "learning_rate": 1.8931252740923048e-05, "loss": 0.0983, "step": 50300 }, { "epoch": 0.7411525603452085, "grad_norm": 1.5832747220993042, "learning_rate": 1.8929616368956062e-05, "loss": 0.1021, "step": 50325 }, { "epoch": 0.7415207434352955, "grad_norm": 1.28887939453125, "learning_rate": 1.8927979996989077e-05, "loss": 0.094, "step": 50350 }, { "epoch": 0.7418889265253825, "grad_norm": 1.5176652669906616, "learning_rate": 1.892634362502209e-05, "loss": 0.1066, "step": 50375 }, { "epoch": 0.7422571096154695, "grad_norm": 1.239126205444336, "learning_rate": 1.892470725305511e-05, "loss": 0.1003, "step": 50400 }, { "epoch": 0.7426252927055567, "grad_norm": 2.0967373847961426, "learning_rate": 1.8923070881088123e-05, "loss": 0.1038, "step": 50425 }, { "epoch": 0.7429934757956437, "grad_norm": 1.3348290920257568, "learning_rate": 1.8921434509121138e-05, "loss": 0.0968, "step": 50450 }, { "epoch": 0.7433616588857307, "grad_norm": 1.012868046760559, "learning_rate": 1.8919798137154156e-05, "loss": 0.1006, "step": 50475 }, { "epoch": 0.7437298419758177, "grad_norm": 1.712973713874817, "learning_rate": 1.891816176518717e-05, "loss": 0.1002, "step": 50500 }, { "epoch": 0.7440980250659047, "grad_norm": 1.1453803777694702, "learning_rate": 1.8916525393220185e-05, "loss": 0.0967, "step": 50525 }, { "epoch": 0.7444662081559918, "grad_norm": 1.5949763059616089, "learning_rate": 1.89148890212532e-05, "loss": 0.1106, "step": 50550 }, { "epoch": 0.7448343912460789, "grad_norm": 1.4714535474777222, "learning_rate": 1.8913252649286217e-05, "loss": 0.0939, "step": 50575 }, { "epoch": 0.7452025743361659, "grad_norm": 1.741516351699829, "learning_rate": 1.891161627731923e-05, "loss": 0.1047, "step": 50600 }, { "epoch": 0.7455707574262529, "grad_norm": 1.5519564151763916, "learning_rate": 1.8909979905352246e-05, "loss": 0.1025, "step": 50625 }, { "epoch": 0.7459389405163399, "grad_norm": 1.5566942691802979, "learning_rate": 1.8908343533385264e-05, "loss": 0.098, "step": 50650 }, { "epoch": 0.746307123606427, "grad_norm": 1.7024180889129639, "learning_rate": 1.8906707161418278e-05, "loss": 0.0984, "step": 50675 }, { "epoch": 0.746675306696514, "grad_norm": 1.2596818208694458, "learning_rate": 1.8905070789451293e-05, "loss": 0.101, "step": 50700 }, { "epoch": 0.7470434897866011, "grad_norm": 1.231046438217163, "learning_rate": 1.8903434417484307e-05, "loss": 0.0949, "step": 50725 }, { "epoch": 0.7474116728766881, "grad_norm": 1.4248147010803223, "learning_rate": 1.8901798045517325e-05, "loss": 0.0969, "step": 50750 }, { "epoch": 0.7477798559667752, "grad_norm": 1.8414579629898071, "learning_rate": 1.890016167355034e-05, "loss": 0.094, "step": 50775 }, { "epoch": 0.7481480390568622, "grad_norm": 1.1229593753814697, "learning_rate": 1.8898525301583354e-05, "loss": 0.1004, "step": 50800 }, { "epoch": 0.7485162221469492, "grad_norm": 1.872498631477356, "learning_rate": 1.8896888929616372e-05, "loss": 0.0978, "step": 50825 }, { "epoch": 0.7488844052370363, "grad_norm": 1.0315674543380737, "learning_rate": 1.8895252557649386e-05, "loss": 0.1044, "step": 50850 }, { "epoch": 0.7492525883271233, "grad_norm": 1.80376398563385, "learning_rate": 1.88936161856824e-05, "loss": 0.1019, "step": 50875 }, { "epoch": 0.7496207714172104, "grad_norm": 1.7327715158462524, "learning_rate": 1.889197981371542e-05, "loss": 0.0947, "step": 50900 }, { "epoch": 0.7499889545072974, "grad_norm": 1.1131285429000854, "learning_rate": 1.8890343441748433e-05, "loss": 0.0939, "step": 50925 }, { "epoch": 0.7503571375973844, "grad_norm": 1.7352478504180908, "learning_rate": 1.8888707069781447e-05, "loss": 0.0947, "step": 50950 }, { "epoch": 0.7507253206874714, "grad_norm": 1.7419734001159668, "learning_rate": 1.8887070697814462e-05, "loss": 0.0983, "step": 50975 }, { "epoch": 0.7510935037775585, "grad_norm": 1.6313399076461792, "learning_rate": 1.888543432584748e-05, "loss": 0.0972, "step": 51000 }, { "epoch": 0.7514616868676456, "grad_norm": 1.653120756149292, "learning_rate": 1.8883797953880494e-05, "loss": 0.1021, "step": 51025 }, { "epoch": 0.7518298699577326, "grad_norm": 1.4025392532348633, "learning_rate": 1.888216158191351e-05, "loss": 0.0995, "step": 51050 }, { "epoch": 0.7521980530478196, "grad_norm": 1.034682273864746, "learning_rate": 1.8880525209946526e-05, "loss": 0.0936, "step": 51075 }, { "epoch": 0.7525662361379066, "grad_norm": 1.1712440252304077, "learning_rate": 1.887888883797954e-05, "loss": 0.0963, "step": 51100 }, { "epoch": 0.7529344192279936, "grad_norm": 1.9316719770431519, "learning_rate": 1.8877252466012555e-05, "loss": 0.1021, "step": 51125 }, { "epoch": 0.7533026023180808, "grad_norm": 1.1776498556137085, "learning_rate": 1.887561609404557e-05, "loss": 0.0985, "step": 51150 }, { "epoch": 0.7536707854081678, "grad_norm": 1.3127164840698242, "learning_rate": 1.8873979722078584e-05, "loss": 0.0965, "step": 51175 }, { "epoch": 0.7540389684982548, "grad_norm": 1.6149221658706665, "learning_rate": 1.8872343350111602e-05, "loss": 0.0954, "step": 51200 }, { "epoch": 0.7544071515883418, "grad_norm": 1.319986343383789, "learning_rate": 1.8870706978144617e-05, "loss": 0.1009, "step": 51225 }, { "epoch": 0.7547753346784289, "grad_norm": 1.0375257730484009, "learning_rate": 1.8869070606177634e-05, "loss": 0.0988, "step": 51250 }, { "epoch": 0.755143517768516, "grad_norm": 1.3273632526397705, "learning_rate": 1.886743423421065e-05, "loss": 0.0969, "step": 51275 }, { "epoch": 0.755511700858603, "grad_norm": 1.7834882736206055, "learning_rate": 1.8865797862243663e-05, "loss": 0.1045, "step": 51300 }, { "epoch": 0.75587988394869, "grad_norm": 1.6718878746032715, "learning_rate": 1.886416149027668e-05, "loss": 0.0987, "step": 51325 }, { "epoch": 0.756248067038777, "grad_norm": 1.5869431495666504, "learning_rate": 1.8862525118309696e-05, "loss": 0.102, "step": 51350 }, { "epoch": 0.7566162501288641, "grad_norm": 1.3101059198379517, "learning_rate": 1.886088874634271e-05, "loss": 0.0975, "step": 51375 }, { "epoch": 0.7569844332189511, "grad_norm": 1.4751836061477661, "learning_rate": 1.8859252374375725e-05, "loss": 0.103, "step": 51400 }, { "epoch": 0.7573526163090382, "grad_norm": 1.408093810081482, "learning_rate": 1.885761600240874e-05, "loss": 0.0998, "step": 51425 }, { "epoch": 0.7577207993991252, "grad_norm": 1.8048471212387085, "learning_rate": 1.8855979630441757e-05, "loss": 0.0959, "step": 51450 }, { "epoch": 0.7580889824892122, "grad_norm": 1.5421810150146484, "learning_rate": 1.885434325847477e-05, "loss": 0.096, "step": 51475 }, { "epoch": 0.7584571655792993, "grad_norm": 1.2161544561386108, "learning_rate": 1.885270688650779e-05, "loss": 0.1043, "step": 51500 }, { "epoch": 0.7588253486693863, "grad_norm": 0.9103922247886658, "learning_rate": 1.8851070514540804e-05, "loss": 0.0863, "step": 51525 }, { "epoch": 0.7591935317594734, "grad_norm": 1.5881121158599854, "learning_rate": 1.8849434142573818e-05, "loss": 0.0995, "step": 51550 }, { "epoch": 0.7595617148495604, "grad_norm": 1.5382658243179321, "learning_rate": 1.8847797770606833e-05, "loss": 0.0954, "step": 51575 }, { "epoch": 0.7599298979396474, "grad_norm": 1.3792065382003784, "learning_rate": 1.8846161398639847e-05, "loss": 0.0971, "step": 51600 }, { "epoch": 0.7602980810297345, "grad_norm": 1.2182016372680664, "learning_rate": 1.8844525026672865e-05, "loss": 0.1004, "step": 51625 }, { "epoch": 0.7606662641198215, "grad_norm": 1.3744044303894043, "learning_rate": 1.884288865470588e-05, "loss": 0.1003, "step": 51650 }, { "epoch": 0.7610344472099085, "grad_norm": 1.3731317520141602, "learning_rate": 1.8841252282738897e-05, "loss": 0.101, "step": 51675 }, { "epoch": 0.7614026302999956, "grad_norm": 1.3473219871520996, "learning_rate": 1.883961591077191e-05, "loss": 0.1077, "step": 51700 }, { "epoch": 0.7617708133900826, "grad_norm": 1.207746148109436, "learning_rate": 1.8837979538804926e-05, "loss": 0.0994, "step": 51725 }, { "epoch": 0.7621389964801697, "grad_norm": 1.661965250968933, "learning_rate": 1.8836343166837944e-05, "loss": 0.1004, "step": 51750 }, { "epoch": 0.7625071795702567, "grad_norm": 1.4879807233810425, "learning_rate": 1.883470679487096e-05, "loss": 0.0915, "step": 51775 }, { "epoch": 0.7628753626603437, "grad_norm": 1.2649250030517578, "learning_rate": 1.8833070422903973e-05, "loss": 0.1006, "step": 51800 }, { "epoch": 0.7632435457504307, "grad_norm": 1.497623085975647, "learning_rate": 1.8831434050936987e-05, "loss": 0.097, "step": 51825 }, { "epoch": 0.7636117288405179, "grad_norm": 1.1468896865844727, "learning_rate": 1.8829797678970002e-05, "loss": 0.0939, "step": 51850 }, { "epoch": 0.7639799119306049, "grad_norm": 2.0533607006073, "learning_rate": 1.882816130700302e-05, "loss": 0.1044, "step": 51875 }, { "epoch": 0.7643480950206919, "grad_norm": 1.141491413116455, "learning_rate": 1.8826590389914713e-05, "loss": 0.1028, "step": 51900 }, { "epoch": 0.7647162781107789, "grad_norm": 1.504295825958252, "learning_rate": 1.882495401794773e-05, "loss": 0.0931, "step": 51925 }, { "epoch": 0.7650844612008659, "grad_norm": 1.668541431427002, "learning_rate": 1.8823317645980746e-05, "loss": 0.0998, "step": 51950 }, { "epoch": 0.765452644290953, "grad_norm": 1.099595308303833, "learning_rate": 1.882168127401376e-05, "loss": 0.0928, "step": 51975 }, { "epoch": 0.7658208273810401, "grad_norm": 2.686854839324951, "learning_rate": 1.8820044902046775e-05, "loss": 0.0979, "step": 52000 }, { "epoch": 0.7661890104711271, "grad_norm": 1.238408088684082, "learning_rate": 1.881840853007979e-05, "loss": 0.0923, "step": 52025 }, { "epoch": 0.7665571935612141, "grad_norm": 1.9959614276885986, "learning_rate": 1.8816772158112807e-05, "loss": 0.0916, "step": 52050 }, { "epoch": 0.7669253766513011, "grad_norm": 1.8930375576019287, "learning_rate": 1.881513578614582e-05, "loss": 0.0878, "step": 52075 }, { "epoch": 0.7672935597413882, "grad_norm": 1.0580706596374512, "learning_rate": 1.8813499414178836e-05, "loss": 0.0916, "step": 52100 }, { "epoch": 0.7676617428314753, "grad_norm": 1.445089340209961, "learning_rate": 1.8811863042211854e-05, "loss": 0.1049, "step": 52125 }, { "epoch": 0.7680299259215623, "grad_norm": 1.469886064529419, "learning_rate": 1.8810226670244868e-05, "loss": 0.0931, "step": 52150 }, { "epoch": 0.7683981090116493, "grad_norm": 1.663535714149475, "learning_rate": 1.8808590298277886e-05, "loss": 0.094, "step": 52175 }, { "epoch": 0.7687662921017363, "grad_norm": 0.9408849477767944, "learning_rate": 1.88069539263109e-05, "loss": 0.0922, "step": 52200 }, { "epoch": 0.7691344751918234, "grad_norm": 1.3737974166870117, "learning_rate": 1.8805317554343915e-05, "loss": 0.0944, "step": 52225 }, { "epoch": 0.7695026582819104, "grad_norm": 1.6405061483383179, "learning_rate": 1.880368118237693e-05, "loss": 0.0995, "step": 52250 }, { "epoch": 0.7698708413719975, "grad_norm": 1.3962156772613525, "learning_rate": 1.8802044810409944e-05, "loss": 0.0988, "step": 52275 }, { "epoch": 0.7702390244620845, "grad_norm": 2.0040688514709473, "learning_rate": 1.8800408438442962e-05, "loss": 0.1005, "step": 52300 }, { "epoch": 0.7706072075521715, "grad_norm": 1.5786962509155273, "learning_rate": 1.8798772066475976e-05, "loss": 0.1015, "step": 52325 }, { "epoch": 0.7709753906422586, "grad_norm": 1.340768575668335, "learning_rate": 1.879713569450899e-05, "loss": 0.1015, "step": 52350 }, { "epoch": 0.7713435737323456, "grad_norm": 1.3153142929077148, "learning_rate": 1.879549932254201e-05, "loss": 0.1, "step": 52375 }, { "epoch": 0.7717117568224326, "grad_norm": 1.3656054735183716, "learning_rate": 1.8793862950575023e-05, "loss": 0.093, "step": 52400 }, { "epoch": 0.7720799399125197, "grad_norm": 1.4795976877212524, "learning_rate": 1.8792226578608037e-05, "loss": 0.093, "step": 52425 }, { "epoch": 0.7724481230026068, "grad_norm": 1.3683342933654785, "learning_rate": 1.8790590206641052e-05, "loss": 0.0931, "step": 52450 }, { "epoch": 0.7728163060926938, "grad_norm": 1.2923591136932373, "learning_rate": 1.878895383467407e-05, "loss": 0.0998, "step": 52475 }, { "epoch": 0.7731844891827808, "grad_norm": 1.2481595277786255, "learning_rate": 1.8787317462707084e-05, "loss": 0.1047, "step": 52500 }, { "epoch": 0.7735526722728678, "grad_norm": 1.6275888681411743, "learning_rate": 1.87856810907401e-05, "loss": 0.095, "step": 52525 }, { "epoch": 0.7739208553629549, "grad_norm": 1.199249267578125, "learning_rate": 1.8784044718773116e-05, "loss": 0.104, "step": 52550 }, { "epoch": 0.774289038453042, "grad_norm": 1.2225192785263062, "learning_rate": 1.878240834680613e-05, "loss": 0.0986, "step": 52575 }, { "epoch": 0.774657221543129, "grad_norm": 1.8041224479675293, "learning_rate": 1.8780771974839145e-05, "loss": 0.0943, "step": 52600 }, { "epoch": 0.775025404633216, "grad_norm": 1.5905193090438843, "learning_rate": 1.877913560287216e-05, "loss": 0.0926, "step": 52625 }, { "epoch": 0.775393587723303, "grad_norm": 1.4913384914398193, "learning_rate": 1.8777499230905178e-05, "loss": 0.1008, "step": 52650 }, { "epoch": 0.77576177081339, "grad_norm": 1.650696039199829, "learning_rate": 1.8775862858938192e-05, "loss": 0.1046, "step": 52675 }, { "epoch": 0.7761299539034772, "grad_norm": 1.3778458833694458, "learning_rate": 1.8774226486971207e-05, "loss": 0.1011, "step": 52700 }, { "epoch": 0.7764981369935642, "grad_norm": 1.8180224895477295, "learning_rate": 1.8772590115004224e-05, "loss": 0.097, "step": 52725 }, { "epoch": 0.7768663200836512, "grad_norm": 1.3458949327468872, "learning_rate": 1.877095374303724e-05, "loss": 0.0948, "step": 52750 }, { "epoch": 0.7772345031737382, "grad_norm": 1.6592265367507935, "learning_rate": 1.8769317371070253e-05, "loss": 0.1001, "step": 52775 }, { "epoch": 0.7776026862638252, "grad_norm": 1.6483943462371826, "learning_rate": 1.876768099910327e-05, "loss": 0.1036, "step": 52800 }, { "epoch": 0.7779708693539124, "grad_norm": 1.3661013841629028, "learning_rate": 1.8766044627136286e-05, "loss": 0.0912, "step": 52825 }, { "epoch": 0.7783390524439994, "grad_norm": 1.1678917407989502, "learning_rate": 1.87644082551693e-05, "loss": 0.0945, "step": 52850 }, { "epoch": 0.7787072355340864, "grad_norm": 1.4729899168014526, "learning_rate": 1.8762771883202315e-05, "loss": 0.107, "step": 52875 }, { "epoch": 0.7790754186241734, "grad_norm": 1.327132225036621, "learning_rate": 1.8761135511235332e-05, "loss": 0.0947, "step": 52900 }, { "epoch": 0.7794436017142604, "grad_norm": 1.6538245677947998, "learning_rate": 1.8759499139268347e-05, "loss": 0.0962, "step": 52925 }, { "epoch": 0.7798117848043475, "grad_norm": 1.7244343757629395, "learning_rate": 1.875786276730136e-05, "loss": 0.0945, "step": 52950 }, { "epoch": 0.7801799678944346, "grad_norm": 1.4939213991165161, "learning_rate": 1.875622639533438e-05, "loss": 0.0957, "step": 52975 }, { "epoch": 0.7805481509845216, "grad_norm": 1.5427114963531494, "learning_rate": 1.8754590023367394e-05, "loss": 0.1036, "step": 53000 }, { "epoch": 0.7809163340746086, "grad_norm": 1.3049370050430298, "learning_rate": 1.8752953651400408e-05, "loss": 0.1012, "step": 53025 }, { "epoch": 0.7812845171646957, "grad_norm": 1.4951084852218628, "learning_rate": 1.8751317279433423e-05, "loss": 0.1063, "step": 53050 }, { "epoch": 0.7816527002547827, "grad_norm": 1.6251379251480103, "learning_rate": 1.874968090746644e-05, "loss": 0.0988, "step": 53075 }, { "epoch": 0.7820208833448697, "grad_norm": 1.550650954246521, "learning_rate": 1.8748044535499455e-05, "loss": 0.0948, "step": 53100 }, { "epoch": 0.7823890664349568, "grad_norm": 1.4532803297042847, "learning_rate": 1.874640816353247e-05, "loss": 0.0898, "step": 53125 }, { "epoch": 0.7827572495250438, "grad_norm": 1.425616979598999, "learning_rate": 1.8744771791565487e-05, "loss": 0.0985, "step": 53150 }, { "epoch": 0.7831254326151309, "grad_norm": 1.7752586603164673, "learning_rate": 1.87431354195985e-05, "loss": 0.1004, "step": 53175 }, { "epoch": 0.7834936157052179, "grad_norm": 1.4681975841522217, "learning_rate": 1.8741499047631516e-05, "loss": 0.0989, "step": 53200 }, { "epoch": 0.7838617987953049, "grad_norm": 1.60545814037323, "learning_rate": 1.8739862675664534e-05, "loss": 0.0974, "step": 53225 }, { "epoch": 0.784229981885392, "grad_norm": 1.7402600049972534, "learning_rate": 1.873822630369755e-05, "loss": 0.0988, "step": 53250 }, { "epoch": 0.784598164975479, "grad_norm": 1.5226898193359375, "learning_rate": 1.8736589931730563e-05, "loss": 0.0927, "step": 53275 }, { "epoch": 0.7849663480655661, "grad_norm": 1.093587040901184, "learning_rate": 1.8734953559763577e-05, "loss": 0.0971, "step": 53300 }, { "epoch": 0.7853345311556531, "grad_norm": 1.4167509078979492, "learning_rate": 1.8733317187796595e-05, "loss": 0.0947, "step": 53325 }, { "epoch": 0.7857027142457401, "grad_norm": 1.8050156831741333, "learning_rate": 1.873168081582961e-05, "loss": 0.0932, "step": 53350 }, { "epoch": 0.7860708973358271, "grad_norm": 1.195334792137146, "learning_rate": 1.8730044443862624e-05, "loss": 0.0969, "step": 53375 }, { "epoch": 0.7864390804259141, "grad_norm": 1.2866966724395752, "learning_rate": 1.8728408071895642e-05, "loss": 0.0972, "step": 53400 }, { "epoch": 0.7868072635160013, "grad_norm": 1.5800254344940186, "learning_rate": 1.8726771699928656e-05, "loss": 0.0934, "step": 53425 }, { "epoch": 0.7871754466060883, "grad_norm": 0.9870246052742004, "learning_rate": 1.872513532796167e-05, "loss": 0.0941, "step": 53450 }, { "epoch": 0.7875436296961753, "grad_norm": 1.4179213047027588, "learning_rate": 1.8723498955994685e-05, "loss": 0.0998, "step": 53475 }, { "epoch": 0.7879118127862623, "grad_norm": 1.460280179977417, "learning_rate": 1.87218625840277e-05, "loss": 0.0978, "step": 53500 }, { "epoch": 0.7882799958763493, "grad_norm": 1.501106858253479, "learning_rate": 1.8720226212060718e-05, "loss": 0.0951, "step": 53525 }, { "epoch": 0.7886481789664365, "grad_norm": 1.4157145023345947, "learning_rate": 1.8718589840093732e-05, "loss": 0.0982, "step": 53550 }, { "epoch": 0.7890163620565235, "grad_norm": 1.8089770078659058, "learning_rate": 1.871695346812675e-05, "loss": 0.0971, "step": 53575 }, { "epoch": 0.7893845451466105, "grad_norm": 1.740575909614563, "learning_rate": 1.8715317096159764e-05, "loss": 0.0985, "step": 53600 }, { "epoch": 0.7897527282366975, "grad_norm": 1.326170802116394, "learning_rate": 1.871368072419278e-05, "loss": 0.1007, "step": 53625 }, { "epoch": 0.7901209113267846, "grad_norm": 1.310464859008789, "learning_rate": 1.8712044352225797e-05, "loss": 0.0967, "step": 53650 }, { "epoch": 0.7904890944168717, "grad_norm": 1.653826355934143, "learning_rate": 1.871040798025881e-05, "loss": 0.0923, "step": 53675 }, { "epoch": 0.7908572775069587, "grad_norm": 1.535834789276123, "learning_rate": 1.8708771608291826e-05, "loss": 0.0947, "step": 53700 }, { "epoch": 0.7912254605970457, "grad_norm": 1.4991453886032104, "learning_rate": 1.870713523632484e-05, "loss": 0.0947, "step": 53725 }, { "epoch": 0.7915936436871327, "grad_norm": 1.3092700242996216, "learning_rate": 1.8705498864357854e-05, "loss": 0.1005, "step": 53750 }, { "epoch": 0.7919618267772198, "grad_norm": 1.345397710800171, "learning_rate": 1.8703862492390872e-05, "loss": 0.0856, "step": 53775 }, { "epoch": 0.7923300098673068, "grad_norm": 1.520815372467041, "learning_rate": 1.8702226120423887e-05, "loss": 0.0926, "step": 53800 }, { "epoch": 0.7926981929573939, "grad_norm": 1.2822909355163574, "learning_rate": 1.8700589748456905e-05, "loss": 0.0926, "step": 53825 }, { "epoch": 0.7930663760474809, "grad_norm": 1.1882009506225586, "learning_rate": 1.869895337648992e-05, "loss": 0.0886, "step": 53850 }, { "epoch": 0.7934345591375679, "grad_norm": 1.1720936298370361, "learning_rate": 1.8697317004522934e-05, "loss": 0.0972, "step": 53875 }, { "epoch": 0.793802742227655, "grad_norm": 1.4800114631652832, "learning_rate": 1.8695680632555948e-05, "loss": 0.1012, "step": 53900 }, { "epoch": 0.794170925317742, "grad_norm": 0.954601526260376, "learning_rate": 1.8694044260588962e-05, "loss": 0.0892, "step": 53925 }, { "epoch": 0.794539108407829, "grad_norm": 1.4864304065704346, "learning_rate": 1.869240788862198e-05, "loss": 0.1083, "step": 53950 }, { "epoch": 0.7949072914979161, "grad_norm": 2.026531219482422, "learning_rate": 1.8690836971533674e-05, "loss": 0.0989, "step": 53975 }, { "epoch": 0.7952754745880031, "grad_norm": 1.781833291053772, "learning_rate": 1.868920059956669e-05, "loss": 0.0949, "step": 54000 }, { "epoch": 0.7956436576780902, "grad_norm": 1.2690705060958862, "learning_rate": 1.8687564227599706e-05, "loss": 0.0969, "step": 54025 }, { "epoch": 0.7960118407681772, "grad_norm": 1.2814421653747559, "learning_rate": 1.868592785563272e-05, "loss": 0.096, "step": 54050 }, { "epoch": 0.7963800238582642, "grad_norm": 1.3262174129486084, "learning_rate": 1.868429148366574e-05, "loss": 0.0981, "step": 54075 }, { "epoch": 0.7967482069483512, "grad_norm": 1.3594454526901245, "learning_rate": 1.868265511169875e-05, "loss": 0.0977, "step": 54100 }, { "epoch": 0.7971163900384383, "grad_norm": 1.4144097566604614, "learning_rate": 1.8681018739731768e-05, "loss": 0.0927, "step": 54125 }, { "epoch": 0.7974845731285254, "grad_norm": 1.2568429708480835, "learning_rate": 1.8679382367764782e-05, "loss": 0.0937, "step": 54150 }, { "epoch": 0.7978527562186124, "grad_norm": 1.5641965866088867, "learning_rate": 1.8677745995797797e-05, "loss": 0.0999, "step": 54175 }, { "epoch": 0.7982209393086994, "grad_norm": 1.2955148220062256, "learning_rate": 1.8676109623830814e-05, "loss": 0.0846, "step": 54200 }, { "epoch": 0.7985891223987864, "grad_norm": 1.4151428937911987, "learning_rate": 1.867447325186383e-05, "loss": 0.089, "step": 54225 }, { "epoch": 0.7989573054888736, "grad_norm": 1.3893907070159912, "learning_rate": 1.8672836879896847e-05, "loss": 0.0923, "step": 54250 }, { "epoch": 0.7993254885789606, "grad_norm": 1.4878407716751099, "learning_rate": 1.867120050792986e-05, "loss": 0.0998, "step": 54275 }, { "epoch": 0.7996936716690476, "grad_norm": 1.5924643278121948, "learning_rate": 1.8669564135962876e-05, "loss": 0.0942, "step": 54300 }, { "epoch": 0.8000618547591346, "grad_norm": 1.259968876838684, "learning_rate": 1.866792776399589e-05, "loss": 0.0956, "step": 54325 }, { "epoch": 0.8004300378492216, "grad_norm": 1.595425009727478, "learning_rate": 1.8666291392028905e-05, "loss": 0.1024, "step": 54350 }, { "epoch": 0.8007982209393087, "grad_norm": 1.4488376379013062, "learning_rate": 1.8664655020061922e-05, "loss": 0.0957, "step": 54375 }, { "epoch": 0.8011664040293958, "grad_norm": 1.3736494779586792, "learning_rate": 1.8663018648094937e-05, "loss": 0.0925, "step": 54400 }, { "epoch": 0.8015345871194828, "grad_norm": 1.542526364326477, "learning_rate": 1.866138227612795e-05, "loss": 0.0972, "step": 54425 }, { "epoch": 0.8019027702095698, "grad_norm": 1.651017427444458, "learning_rate": 1.865974590416097e-05, "loss": 0.0982, "step": 54450 }, { "epoch": 0.8022709532996568, "grad_norm": 1.4605393409729004, "learning_rate": 1.8658109532193984e-05, "loss": 0.0995, "step": 54475 }, { "epoch": 0.8026391363897439, "grad_norm": 1.0872023105621338, "learning_rate": 1.8656473160227e-05, "loss": 0.0966, "step": 54500 }, { "epoch": 0.803007319479831, "grad_norm": 1.2862284183502197, "learning_rate": 1.8654836788260013e-05, "loss": 0.0917, "step": 54525 }, { "epoch": 0.803375502569918, "grad_norm": 1.5432548522949219, "learning_rate": 1.865320041629303e-05, "loss": 0.0954, "step": 54550 }, { "epoch": 0.803743685660005, "grad_norm": 1.3444406986236572, "learning_rate": 1.8651564044326045e-05, "loss": 0.0967, "step": 54575 }, { "epoch": 0.804111868750092, "grad_norm": 1.7558733224868774, "learning_rate": 1.864992767235906e-05, "loss": 0.1004, "step": 54600 }, { "epoch": 0.8044800518401791, "grad_norm": 1.575652837753296, "learning_rate": 1.8648291300392077e-05, "loss": 0.0996, "step": 54625 }, { "epoch": 0.8048482349302661, "grad_norm": 1.2382612228393555, "learning_rate": 1.864665492842509e-05, "loss": 0.0857, "step": 54650 }, { "epoch": 0.8052164180203532, "grad_norm": 1.3315802812576294, "learning_rate": 1.8645018556458106e-05, "loss": 0.0888, "step": 54675 }, { "epoch": 0.8055846011104402, "grad_norm": 2.0507583618164062, "learning_rate": 1.8643382184491124e-05, "loss": 0.0951, "step": 54700 }, { "epoch": 0.8059527842005272, "grad_norm": 1.2660714387893677, "learning_rate": 1.864174581252414e-05, "loss": 0.102, "step": 54725 }, { "epoch": 0.8063209672906143, "grad_norm": 1.1463576555252075, "learning_rate": 1.8640109440557153e-05, "loss": 0.0911, "step": 54750 }, { "epoch": 0.8066891503807013, "grad_norm": 1.4842114448547363, "learning_rate": 1.8638473068590167e-05, "loss": 0.0954, "step": 54775 }, { "epoch": 0.8070573334707883, "grad_norm": 1.8125190734863281, "learning_rate": 1.8636836696623185e-05, "loss": 0.092, "step": 54800 }, { "epoch": 0.8074255165608754, "grad_norm": 1.4588818550109863, "learning_rate": 1.86352003246562e-05, "loss": 0.0943, "step": 54825 }, { "epoch": 0.8077936996509625, "grad_norm": 1.6068251132965088, "learning_rate": 1.8633563952689214e-05, "loss": 0.0974, "step": 54850 }, { "epoch": 0.8081618827410495, "grad_norm": 1.197568416595459, "learning_rate": 1.8631927580722232e-05, "loss": 0.0875, "step": 54875 }, { "epoch": 0.8085300658311365, "grad_norm": 1.2989412546157837, "learning_rate": 1.8630291208755246e-05, "loss": 0.1026, "step": 54900 }, { "epoch": 0.8088982489212235, "grad_norm": 1.7455549240112305, "learning_rate": 1.862865483678826e-05, "loss": 0.0931, "step": 54925 }, { "epoch": 0.8092664320113105, "grad_norm": 1.8075650930404663, "learning_rate": 1.8627018464821275e-05, "loss": 0.0916, "step": 54950 }, { "epoch": 0.8096346151013977, "grad_norm": 1.4007229804992676, "learning_rate": 1.8625382092854293e-05, "loss": 0.1013, "step": 54975 }, { "epoch": 0.8100027981914847, "grad_norm": 1.1319169998168945, "learning_rate": 1.8623745720887308e-05, "loss": 0.0948, "step": 55000 }, { "epoch": 0.8103709812815717, "grad_norm": 1.3432068824768066, "learning_rate": 1.8622109348920322e-05, "loss": 0.0953, "step": 55025 }, { "epoch": 0.8107391643716587, "grad_norm": 1.4980003833770752, "learning_rate": 1.862047297695334e-05, "loss": 0.0942, "step": 55050 }, { "epoch": 0.8111073474617457, "grad_norm": 1.552869200706482, "learning_rate": 1.8618836604986354e-05, "loss": 0.096, "step": 55075 }, { "epoch": 0.8114755305518329, "grad_norm": 1.2944960594177246, "learning_rate": 1.861720023301937e-05, "loss": 0.0908, "step": 55100 }, { "epoch": 0.8118437136419199, "grad_norm": 1.6127983331680298, "learning_rate": 1.8615563861052387e-05, "loss": 0.0931, "step": 55125 }, { "epoch": 0.8122118967320069, "grad_norm": 2.118488073348999, "learning_rate": 1.86139274890854e-05, "loss": 0.0996, "step": 55150 }, { "epoch": 0.8125800798220939, "grad_norm": 1.4998732805252075, "learning_rate": 1.8612291117118416e-05, "loss": 0.0971, "step": 55175 }, { "epoch": 0.8129482629121809, "grad_norm": 1.4116191864013672, "learning_rate": 1.861065474515143e-05, "loss": 0.0931, "step": 55200 }, { "epoch": 0.813316446002268, "grad_norm": 0.9913114309310913, "learning_rate": 1.8609018373184448e-05, "loss": 0.0902, "step": 55225 }, { "epoch": 0.8136846290923551, "grad_norm": 1.3107503652572632, "learning_rate": 1.8607382001217462e-05, "loss": 0.0885, "step": 55250 }, { "epoch": 0.8140528121824421, "grad_norm": 1.0871694087982178, "learning_rate": 1.8605745629250477e-05, "loss": 0.0936, "step": 55275 }, { "epoch": 0.8144209952725291, "grad_norm": 1.364189863204956, "learning_rate": 1.8604109257283495e-05, "loss": 0.0962, "step": 55300 }, { "epoch": 0.8147891783626161, "grad_norm": 1.3079403638839722, "learning_rate": 1.860247288531651e-05, "loss": 0.0914, "step": 55325 }, { "epoch": 0.8151573614527032, "grad_norm": 1.3661223649978638, "learning_rate": 1.8600836513349524e-05, "loss": 0.0981, "step": 55350 }, { "epoch": 0.8155255445427902, "grad_norm": 1.3914251327514648, "learning_rate": 1.8599200141382538e-05, "loss": 0.0936, "step": 55375 }, { "epoch": 0.8158937276328773, "grad_norm": 1.5198974609375, "learning_rate": 1.8597563769415556e-05, "loss": 0.0973, "step": 55400 }, { "epoch": 0.8162619107229643, "grad_norm": 1.5449283123016357, "learning_rate": 1.859592739744857e-05, "loss": 0.0994, "step": 55425 }, { "epoch": 0.8166300938130514, "grad_norm": 1.245825171470642, "learning_rate": 1.8594291025481585e-05, "loss": 0.0895, "step": 55450 }, { "epoch": 0.8169982769031384, "grad_norm": 1.4883716106414795, "learning_rate": 1.8592654653514603e-05, "loss": 0.09, "step": 55475 }, { "epoch": 0.8173664599932254, "grad_norm": 1.524454116821289, "learning_rate": 1.8591018281547617e-05, "loss": 0.1025, "step": 55500 }, { "epoch": 0.8177346430833125, "grad_norm": 1.436755657196045, "learning_rate": 1.858938190958063e-05, "loss": 0.0985, "step": 55525 }, { "epoch": 0.8181028261733995, "grad_norm": 1.3007491827011108, "learning_rate": 1.858774553761365e-05, "loss": 0.1037, "step": 55550 }, { "epoch": 0.8184710092634866, "grad_norm": 1.233238935470581, "learning_rate": 1.858610916564666e-05, "loss": 0.0873, "step": 55575 }, { "epoch": 0.8188391923535736, "grad_norm": 1.7281906604766846, "learning_rate": 1.8584472793679678e-05, "loss": 0.0932, "step": 55600 }, { "epoch": 0.8192073754436606, "grad_norm": 1.6181998252868652, "learning_rate": 1.8582836421712693e-05, "loss": 0.0944, "step": 55625 }, { "epoch": 0.8195755585337476, "grad_norm": 1.7249391078948975, "learning_rate": 1.858120004974571e-05, "loss": 0.0988, "step": 55650 }, { "epoch": 0.8199437416238347, "grad_norm": 1.0093046426773071, "learning_rate": 1.8579563677778725e-05, "loss": 0.0903, "step": 55675 }, { "epoch": 0.8203119247139218, "grad_norm": 1.7590138912200928, "learning_rate": 1.857792730581174e-05, "loss": 0.0921, "step": 55700 }, { "epoch": 0.8206801078040088, "grad_norm": 1.2769867181777954, "learning_rate": 1.8576290933844757e-05, "loss": 0.0949, "step": 55725 }, { "epoch": 0.8210482908940958, "grad_norm": 1.095051884651184, "learning_rate": 1.8574654561877772e-05, "loss": 0.0954, "step": 55750 }, { "epoch": 0.8214164739841828, "grad_norm": 0.9720871448516846, "learning_rate": 1.8573018189910786e-05, "loss": 0.0897, "step": 55775 }, { "epoch": 0.8217846570742698, "grad_norm": 1.9454920291900635, "learning_rate": 1.85713818179438e-05, "loss": 0.0949, "step": 55800 }, { "epoch": 0.822152840164357, "grad_norm": 1.1506351232528687, "learning_rate": 1.8569745445976815e-05, "loss": 0.0969, "step": 55825 }, { "epoch": 0.822521023254444, "grad_norm": 1.3336644172668457, "learning_rate": 1.8568109074009833e-05, "loss": 0.0965, "step": 55850 }, { "epoch": 0.822889206344531, "grad_norm": 1.7930415868759155, "learning_rate": 1.8566472702042847e-05, "loss": 0.0942, "step": 55875 }, { "epoch": 0.823257389434618, "grad_norm": 1.3584586381912231, "learning_rate": 1.8564836330075865e-05, "loss": 0.0974, "step": 55900 }, { "epoch": 0.8236255725247051, "grad_norm": 1.6696362495422363, "learning_rate": 1.856319995810888e-05, "loss": 0.0976, "step": 55925 }, { "epoch": 0.8239937556147922, "grad_norm": 1.704949140548706, "learning_rate": 1.8561563586141894e-05, "loss": 0.0821, "step": 55950 }, { "epoch": 0.8243619387048792, "grad_norm": 1.5611543655395508, "learning_rate": 1.8559927214174912e-05, "loss": 0.0874, "step": 55975 }, { "epoch": 0.8247301217949662, "grad_norm": 1.1352851390838623, "learning_rate": 1.8558290842207923e-05, "loss": 0.096, "step": 56000 }, { "epoch": 0.8250983048850532, "grad_norm": 1.6469817161560059, "learning_rate": 1.855665447024094e-05, "loss": 0.0915, "step": 56025 }, { "epoch": 0.8254664879751403, "grad_norm": 1.9793018102645874, "learning_rate": 1.8555018098273955e-05, "loss": 0.0953, "step": 56050 }, { "epoch": 0.8258346710652273, "grad_norm": 1.0863748788833618, "learning_rate": 1.855344718118565e-05, "loss": 0.0988, "step": 56075 }, { "epoch": 0.8262028541553144, "grad_norm": 1.2504758834838867, "learning_rate": 1.8551810809218667e-05, "loss": 0.0904, "step": 56100 }, { "epoch": 0.8265710372454014, "grad_norm": 1.7556089162826538, "learning_rate": 1.855017443725168e-05, "loss": 0.1011, "step": 56125 }, { "epoch": 0.8269392203354884, "grad_norm": 1.510802984237671, "learning_rate": 1.85485380652847e-05, "loss": 0.0947, "step": 56150 }, { "epoch": 0.8273074034255755, "grad_norm": 1.0437312126159668, "learning_rate": 1.8546901693317714e-05, "loss": 0.0912, "step": 56175 }, { "epoch": 0.8276755865156625, "grad_norm": 1.0683832168579102, "learning_rate": 1.854526532135073e-05, "loss": 0.091, "step": 56200 }, { "epoch": 0.8280437696057495, "grad_norm": 1.1077921390533447, "learning_rate": 1.8543628949383743e-05, "loss": 0.0922, "step": 56225 }, { "epoch": 0.8284119526958366, "grad_norm": 1.2730482816696167, "learning_rate": 1.8541992577416757e-05, "loss": 0.0969, "step": 56250 }, { "epoch": 0.8287801357859236, "grad_norm": 1.4467557668685913, "learning_rate": 1.8540356205449775e-05, "loss": 0.0995, "step": 56275 }, { "epoch": 0.8291483188760107, "grad_norm": 1.4756429195404053, "learning_rate": 1.853871983348279e-05, "loss": 0.093, "step": 56300 }, { "epoch": 0.8295165019660977, "grad_norm": 1.5162990093231201, "learning_rate": 1.8537083461515804e-05, "loss": 0.0921, "step": 56325 }, { "epoch": 0.8298846850561847, "grad_norm": 1.324678897857666, "learning_rate": 1.8535447089548822e-05, "loss": 0.095, "step": 56350 }, { "epoch": 0.8302528681462717, "grad_norm": 1.3935545682907104, "learning_rate": 1.8533810717581836e-05, "loss": 0.0941, "step": 56375 }, { "epoch": 0.8306210512363588, "grad_norm": 1.6868776082992554, "learning_rate": 1.8532174345614854e-05, "loss": 0.0943, "step": 56400 }, { "epoch": 0.8309892343264459, "grad_norm": 1.6698260307312012, "learning_rate": 1.8530537973647865e-05, "loss": 0.0816, "step": 56425 }, { "epoch": 0.8313574174165329, "grad_norm": 1.8940362930297852, "learning_rate": 1.8528901601680883e-05, "loss": 0.093, "step": 56450 }, { "epoch": 0.8317256005066199, "grad_norm": 1.2623918056488037, "learning_rate": 1.8527265229713898e-05, "loss": 0.0872, "step": 56475 }, { "epoch": 0.8320937835967069, "grad_norm": 1.6204856634140015, "learning_rate": 1.8525628857746912e-05, "loss": 0.0943, "step": 56500 }, { "epoch": 0.8324619666867941, "grad_norm": 1.3996667861938477, "learning_rate": 1.852399248577993e-05, "loss": 0.0973, "step": 56525 }, { "epoch": 0.8328301497768811, "grad_norm": 1.9733930826187134, "learning_rate": 1.8522356113812944e-05, "loss": 0.0949, "step": 56550 }, { "epoch": 0.8331983328669681, "grad_norm": 1.4230526685714722, "learning_rate": 1.8520719741845962e-05, "loss": 0.1038, "step": 56575 }, { "epoch": 0.8335665159570551, "grad_norm": 1.498453974723816, "learning_rate": 1.8519083369878977e-05, "loss": 0.1003, "step": 56600 }, { "epoch": 0.8339346990471421, "grad_norm": 1.9248186349868774, "learning_rate": 1.851744699791199e-05, "loss": 0.0961, "step": 56625 }, { "epoch": 0.8343028821372293, "grad_norm": 1.9036245346069336, "learning_rate": 1.8515810625945006e-05, "loss": 0.1036, "step": 56650 }, { "epoch": 0.8346710652273163, "grad_norm": 1.4368208646774292, "learning_rate": 1.851417425397802e-05, "loss": 0.0963, "step": 56675 }, { "epoch": 0.8350392483174033, "grad_norm": 1.6156413555145264, "learning_rate": 1.8512537882011038e-05, "loss": 0.1007, "step": 56700 }, { "epoch": 0.8354074314074903, "grad_norm": 1.900478482246399, "learning_rate": 1.8510901510044052e-05, "loss": 0.0959, "step": 56725 }, { "epoch": 0.8357756144975773, "grad_norm": 1.5122090578079224, "learning_rate": 1.8509265138077067e-05, "loss": 0.0962, "step": 56750 }, { "epoch": 0.8361437975876644, "grad_norm": 1.2271615266799927, "learning_rate": 1.8507628766110085e-05, "loss": 0.0954, "step": 56775 }, { "epoch": 0.8365119806777515, "grad_norm": 1.4659217596054077, "learning_rate": 1.85059923941431e-05, "loss": 0.0922, "step": 56800 }, { "epoch": 0.8368801637678385, "grad_norm": 1.7371073961257935, "learning_rate": 1.8504356022176117e-05, "loss": 0.0893, "step": 56825 }, { "epoch": 0.8372483468579255, "grad_norm": 1.746718168258667, "learning_rate": 1.8502719650209128e-05, "loss": 0.0905, "step": 56850 }, { "epoch": 0.8376165299480125, "grad_norm": 1.482497215270996, "learning_rate": 1.8501083278242146e-05, "loss": 0.0869, "step": 56875 }, { "epoch": 0.8379847130380996, "grad_norm": 1.482007622718811, "learning_rate": 1.849944690627516e-05, "loss": 0.1014, "step": 56900 }, { "epoch": 0.8383528961281866, "grad_norm": 1.6660585403442383, "learning_rate": 1.8497810534308175e-05, "loss": 0.0959, "step": 56925 }, { "epoch": 0.8387210792182737, "grad_norm": 2.038933753967285, "learning_rate": 1.8496174162341193e-05, "loss": 0.0969, "step": 56950 }, { "epoch": 0.8390892623083607, "grad_norm": 1.5888947248458862, "learning_rate": 1.8494537790374207e-05, "loss": 0.0857, "step": 56975 }, { "epoch": 0.8394574453984477, "grad_norm": 1.071915864944458, "learning_rate": 1.849290141840722e-05, "loss": 0.1016, "step": 57000 }, { "epoch": 0.8398256284885348, "grad_norm": 1.7160131931304932, "learning_rate": 1.849126504644024e-05, "loss": 0.0933, "step": 57025 }, { "epoch": 0.8401938115786218, "grad_norm": 1.2102928161621094, "learning_rate": 1.8489628674473254e-05, "loss": 0.0905, "step": 57050 }, { "epoch": 0.8405619946687088, "grad_norm": 1.4412623643875122, "learning_rate": 1.8487992302506268e-05, "loss": 0.0948, "step": 57075 }, { "epoch": 0.8409301777587959, "grad_norm": 1.5016887187957764, "learning_rate": 1.8486355930539283e-05, "loss": 0.0866, "step": 57100 }, { "epoch": 0.841298360848883, "grad_norm": 1.4665857553482056, "learning_rate": 1.84847195585723e-05, "loss": 0.091, "step": 57125 }, { "epoch": 0.84166654393897, "grad_norm": 1.3634446859359741, "learning_rate": 1.8483083186605315e-05, "loss": 0.093, "step": 57150 }, { "epoch": 0.842034727029057, "grad_norm": 1.4443259239196777, "learning_rate": 1.848144681463833e-05, "loss": 0.092, "step": 57175 }, { "epoch": 0.842402910119144, "grad_norm": 1.4307743310928345, "learning_rate": 1.8479810442671347e-05, "loss": 0.0945, "step": 57200 }, { "epoch": 0.842771093209231, "grad_norm": 1.4008501768112183, "learning_rate": 1.8478174070704362e-05, "loss": 0.093, "step": 57225 }, { "epoch": 0.8431392762993182, "grad_norm": 1.025970697402954, "learning_rate": 1.8476537698737376e-05, "loss": 0.0928, "step": 57250 }, { "epoch": 0.8435074593894052, "grad_norm": 1.709593653678894, "learning_rate": 1.847490132677039e-05, "loss": 0.0987, "step": 57275 }, { "epoch": 0.8438756424794922, "grad_norm": 1.183369755744934, "learning_rate": 1.847326495480341e-05, "loss": 0.0893, "step": 57300 }, { "epoch": 0.8442438255695792, "grad_norm": 1.7387748956680298, "learning_rate": 1.8471628582836423e-05, "loss": 0.0922, "step": 57325 }, { "epoch": 0.8446120086596662, "grad_norm": 1.3709940910339355, "learning_rate": 1.8469992210869437e-05, "loss": 0.0957, "step": 57350 }, { "epoch": 0.8449801917497534, "grad_norm": 1.73492431640625, "learning_rate": 1.8468355838902455e-05, "loss": 0.0928, "step": 57375 }, { "epoch": 0.8453483748398404, "grad_norm": 1.300275206565857, "learning_rate": 1.846671946693547e-05, "loss": 0.095, "step": 57400 }, { "epoch": 0.8457165579299274, "grad_norm": 1.6966084241867065, "learning_rate": 1.8465083094968484e-05, "loss": 0.0977, "step": 57425 }, { "epoch": 0.8460847410200144, "grad_norm": 1.2545547485351562, "learning_rate": 1.8463446723001502e-05, "loss": 0.1018, "step": 57450 }, { "epoch": 0.8464529241101014, "grad_norm": 1.1827770471572876, "learning_rate": 1.8461810351034513e-05, "loss": 0.086, "step": 57475 }, { "epoch": 0.8468211072001885, "grad_norm": 1.3192706108093262, "learning_rate": 1.846017397906753e-05, "loss": 0.095, "step": 57500 }, { "epoch": 0.8471892902902756, "grad_norm": 1.3012096881866455, "learning_rate": 1.8458537607100545e-05, "loss": 0.0864, "step": 57525 }, { "epoch": 0.8475574733803626, "grad_norm": 1.1796921491622925, "learning_rate": 1.8456901235133563e-05, "loss": 0.0837, "step": 57550 }, { "epoch": 0.8479256564704496, "grad_norm": 1.5892575979232788, "learning_rate": 1.8455264863166578e-05, "loss": 0.0837, "step": 57575 }, { "epoch": 0.8482938395605366, "grad_norm": 1.2497235536575317, "learning_rate": 1.8453628491199592e-05, "loss": 0.0981, "step": 57600 }, { "epoch": 0.8486620226506237, "grad_norm": 1.4007341861724854, "learning_rate": 1.845199211923261e-05, "loss": 0.0823, "step": 57625 }, { "epoch": 0.8490302057407108, "grad_norm": 1.7249845266342163, "learning_rate": 1.8450355747265625e-05, "loss": 0.0919, "step": 57650 }, { "epoch": 0.8493983888307978, "grad_norm": 2.2453880310058594, "learning_rate": 1.844871937529864e-05, "loss": 0.0919, "step": 57675 }, { "epoch": 0.8497665719208848, "grad_norm": 1.5137193202972412, "learning_rate": 1.8447083003331653e-05, "loss": 0.0943, "step": 57700 }, { "epoch": 0.8501347550109719, "grad_norm": 1.403775691986084, "learning_rate": 1.844544663136467e-05, "loss": 0.0984, "step": 57725 }, { "epoch": 0.8505029381010589, "grad_norm": 2.0289063453674316, "learning_rate": 1.8443810259397686e-05, "loss": 0.0935, "step": 57750 }, { "epoch": 0.8508711211911459, "grad_norm": 0.902027428150177, "learning_rate": 1.84421738874307e-05, "loss": 0.089, "step": 57775 }, { "epoch": 0.851239304281233, "grad_norm": 1.2147347927093506, "learning_rate": 1.8440537515463718e-05, "loss": 0.0808, "step": 57800 }, { "epoch": 0.85160748737132, "grad_norm": 1.3671728372573853, "learning_rate": 1.8438901143496733e-05, "loss": 0.0875, "step": 57825 }, { "epoch": 0.8519756704614071, "grad_norm": 1.4228578805923462, "learning_rate": 1.8437264771529747e-05, "loss": 0.0954, "step": 57850 }, { "epoch": 0.8523438535514941, "grad_norm": 1.6377575397491455, "learning_rate": 1.8435628399562765e-05, "loss": 0.1018, "step": 57875 }, { "epoch": 0.8527120366415811, "grad_norm": 1.4888041019439697, "learning_rate": 1.8433992027595776e-05, "loss": 0.0933, "step": 57900 }, { "epoch": 0.8530802197316681, "grad_norm": 1.489107370376587, "learning_rate": 1.8432355655628794e-05, "loss": 0.0994, "step": 57925 }, { "epoch": 0.8534484028217552, "grad_norm": 1.6297394037246704, "learning_rate": 1.8430719283661808e-05, "loss": 0.0839, "step": 57950 }, { "epoch": 0.8538165859118423, "grad_norm": 1.50656259059906, "learning_rate": 1.8429082911694826e-05, "loss": 0.096, "step": 57975 }, { "epoch": 0.8541847690019293, "grad_norm": 1.3311855792999268, "learning_rate": 1.842744653972784e-05, "loss": 0.0905, "step": 58000 }, { "epoch": 0.8545529520920163, "grad_norm": 1.7467899322509766, "learning_rate": 1.8425810167760855e-05, "loss": 0.0911, "step": 58025 }, { "epoch": 0.8549211351821033, "grad_norm": 1.619917869567871, "learning_rate": 1.8424173795793873e-05, "loss": 0.101, "step": 58050 }, { "epoch": 0.8552893182721903, "grad_norm": 1.5799951553344727, "learning_rate": 1.8422537423826887e-05, "loss": 0.0836, "step": 58075 }, { "epoch": 0.8556575013622775, "grad_norm": 1.628247857093811, "learning_rate": 1.8420901051859902e-05, "loss": 0.0912, "step": 58100 }, { "epoch": 0.8560256844523645, "grad_norm": 1.4958336353302002, "learning_rate": 1.8419264679892916e-05, "loss": 0.0998, "step": 58125 }, { "epoch": 0.8563938675424515, "grad_norm": 1.626678466796875, "learning_rate": 1.841762830792593e-05, "loss": 0.0905, "step": 58150 }, { "epoch": 0.8567620506325385, "grad_norm": 1.1951794624328613, "learning_rate": 1.841599193595895e-05, "loss": 0.0943, "step": 58175 }, { "epoch": 0.8571302337226255, "grad_norm": 1.804368019104004, "learning_rate": 1.8414355563991963e-05, "loss": 0.0946, "step": 58200 }, { "epoch": 0.8574984168127127, "grad_norm": 1.451155662536621, "learning_rate": 1.841271919202498e-05, "loss": 0.0981, "step": 58225 }, { "epoch": 0.8578665999027997, "grad_norm": 1.9024156332015991, "learning_rate": 1.8411082820057995e-05, "loss": 0.0992, "step": 58250 }, { "epoch": 0.8582347829928867, "grad_norm": 1.3425402641296387, "learning_rate": 1.840944644809101e-05, "loss": 0.1, "step": 58275 }, { "epoch": 0.8586029660829737, "grad_norm": 1.6113873720169067, "learning_rate": 1.8407875531002707e-05, "loss": 0.0881, "step": 58300 }, { "epoch": 0.8589711491730608, "grad_norm": 1.4160832166671753, "learning_rate": 1.8406239159035718e-05, "loss": 0.1002, "step": 58325 }, { "epoch": 0.8593393322631478, "grad_norm": 1.4228044748306274, "learning_rate": 1.8404602787068736e-05, "loss": 0.0969, "step": 58350 }, { "epoch": 0.8597075153532349, "grad_norm": 1.7306244373321533, "learning_rate": 1.840296641510175e-05, "loss": 0.0903, "step": 58375 }, { "epoch": 0.8600756984433219, "grad_norm": 1.4546258449554443, "learning_rate": 1.8401330043134765e-05, "loss": 0.0914, "step": 58400 }, { "epoch": 0.8604438815334089, "grad_norm": 1.4770536422729492, "learning_rate": 1.8399693671167783e-05, "loss": 0.0957, "step": 58425 }, { "epoch": 0.860812064623496, "grad_norm": 1.6159794330596924, "learning_rate": 1.8398057299200797e-05, "loss": 0.0967, "step": 58450 }, { "epoch": 0.861180247713583, "grad_norm": 1.7649896144866943, "learning_rate": 1.8396420927233815e-05, "loss": 0.0976, "step": 58475 }, { "epoch": 0.86154843080367, "grad_norm": 1.5954879522323608, "learning_rate": 1.839478455526683e-05, "loss": 0.0905, "step": 58500 }, { "epoch": 0.8619166138937571, "grad_norm": 1.1200867891311646, "learning_rate": 1.8393148183299844e-05, "loss": 0.0925, "step": 58525 }, { "epoch": 0.8622847969838441, "grad_norm": 1.5213146209716797, "learning_rate": 1.8391511811332858e-05, "loss": 0.0998, "step": 58550 }, { "epoch": 0.8626529800739312, "grad_norm": 1.3924262523651123, "learning_rate": 1.8389875439365873e-05, "loss": 0.0902, "step": 58575 }, { "epoch": 0.8630211631640182, "grad_norm": 1.9037601947784424, "learning_rate": 1.838823906739889e-05, "loss": 0.09, "step": 58600 }, { "epoch": 0.8633893462541052, "grad_norm": 1.5457444190979004, "learning_rate": 1.8386602695431905e-05, "loss": 0.0881, "step": 58625 }, { "epoch": 0.8637575293441923, "grad_norm": 1.3769618272781372, "learning_rate": 1.838496632346492e-05, "loss": 0.0928, "step": 58650 }, { "epoch": 0.8641257124342793, "grad_norm": 1.835276484489441, "learning_rate": 1.8383329951497937e-05, "loss": 0.097, "step": 58675 }, { "epoch": 0.8644938955243664, "grad_norm": 1.8566334247589111, "learning_rate": 1.8381693579530952e-05, "loss": 0.0921, "step": 58700 }, { "epoch": 0.8648620786144534, "grad_norm": 1.4504879713058472, "learning_rate": 1.838005720756397e-05, "loss": 0.0866, "step": 58725 }, { "epoch": 0.8652302617045404, "grad_norm": 1.6636848449707031, "learning_rate": 1.837842083559698e-05, "loss": 0.0933, "step": 58750 }, { "epoch": 0.8655984447946274, "grad_norm": 1.262019395828247, "learning_rate": 1.837678446363e-05, "loss": 0.0924, "step": 58775 }, { "epoch": 0.8659666278847145, "grad_norm": 1.5972861051559448, "learning_rate": 1.8375148091663013e-05, "loss": 0.0924, "step": 58800 }, { "epoch": 0.8663348109748016, "grad_norm": 1.4554160833358765, "learning_rate": 1.8373511719696027e-05, "loss": 0.0894, "step": 58825 }, { "epoch": 0.8667029940648886, "grad_norm": 1.5403928756713867, "learning_rate": 1.8371875347729045e-05, "loss": 0.0868, "step": 58850 }, { "epoch": 0.8670711771549756, "grad_norm": 1.3819833993911743, "learning_rate": 1.837023897576206e-05, "loss": 0.0962, "step": 58875 }, { "epoch": 0.8674393602450626, "grad_norm": 1.4847444295883179, "learning_rate": 1.8368602603795078e-05, "loss": 0.0857, "step": 58900 }, { "epoch": 0.8678075433351498, "grad_norm": 1.8123759031295776, "learning_rate": 1.8366966231828092e-05, "loss": 0.0971, "step": 58925 }, { "epoch": 0.8681757264252368, "grad_norm": 1.2447463274002075, "learning_rate": 1.8365329859861107e-05, "loss": 0.0942, "step": 58950 }, { "epoch": 0.8685439095153238, "grad_norm": 1.2621221542358398, "learning_rate": 1.836369348789412e-05, "loss": 0.0934, "step": 58975 }, { "epoch": 0.8689120926054108, "grad_norm": 1.3399676084518433, "learning_rate": 1.8362057115927135e-05, "loss": 0.0898, "step": 59000 }, { "epoch": 0.8692802756954978, "grad_norm": 2.2415242195129395, "learning_rate": 1.8360420743960153e-05, "loss": 0.0867, "step": 59025 }, { "epoch": 0.8696484587855849, "grad_norm": 1.316906452178955, "learning_rate": 1.8358784371993168e-05, "loss": 0.0853, "step": 59050 }, { "epoch": 0.870016641875672, "grad_norm": 1.25469172000885, "learning_rate": 1.8357148000026182e-05, "loss": 0.0913, "step": 59075 }, { "epoch": 0.870384824965759, "grad_norm": 1.3025234937667847, "learning_rate": 1.83555116280592e-05, "loss": 0.0873, "step": 59100 }, { "epoch": 0.870753008055846, "grad_norm": 1.4825505018234253, "learning_rate": 1.8353875256092215e-05, "loss": 0.0951, "step": 59125 }, { "epoch": 0.871121191145933, "grad_norm": 1.3739885091781616, "learning_rate": 1.8352238884125232e-05, "loss": 0.0889, "step": 59150 }, { "epoch": 0.8714893742360201, "grad_norm": 1.4379063844680786, "learning_rate": 1.8350602512158243e-05, "loss": 0.0921, "step": 59175 }, { "epoch": 0.8718575573261071, "grad_norm": 1.3322168588638306, "learning_rate": 1.834896614019126e-05, "loss": 0.0977, "step": 59200 }, { "epoch": 0.8722257404161942, "grad_norm": 1.9719409942626953, "learning_rate": 1.8347329768224276e-05, "loss": 0.0874, "step": 59225 }, { "epoch": 0.8725939235062812, "grad_norm": 1.30929434299469, "learning_rate": 1.834569339625729e-05, "loss": 0.0981, "step": 59250 }, { "epoch": 0.8729621065963682, "grad_norm": 1.5915038585662842, "learning_rate": 1.8344057024290308e-05, "loss": 0.0909, "step": 59275 }, { "epoch": 0.8733302896864553, "grad_norm": 1.4955673217773438, "learning_rate": 1.8342420652323323e-05, "loss": 0.092, "step": 59300 }, { "epoch": 0.8736984727765423, "grad_norm": 1.8077608346939087, "learning_rate": 1.8340784280356337e-05, "loss": 0.0886, "step": 59325 }, { "epoch": 0.8740666558666293, "grad_norm": 1.2306609153747559, "learning_rate": 1.8339147908389355e-05, "loss": 0.0881, "step": 59350 }, { "epoch": 0.8744348389567164, "grad_norm": 1.1256755590438843, "learning_rate": 1.833751153642237e-05, "loss": 0.096, "step": 59375 }, { "epoch": 0.8748030220468034, "grad_norm": 1.2353475093841553, "learning_rate": 1.8335875164455384e-05, "loss": 0.0873, "step": 59400 }, { "epoch": 0.8751712051368905, "grad_norm": 1.5929750204086304, "learning_rate": 1.8334238792488398e-05, "loss": 0.0943, "step": 59425 }, { "epoch": 0.8755393882269775, "grad_norm": 1.3684848546981812, "learning_rate": 1.8332602420521416e-05, "loss": 0.0869, "step": 59450 }, { "epoch": 0.8759075713170645, "grad_norm": 1.2853327989578247, "learning_rate": 1.833096604855443e-05, "loss": 0.0947, "step": 59475 }, { "epoch": 0.8762757544071516, "grad_norm": 1.2716856002807617, "learning_rate": 1.8329329676587445e-05, "loss": 0.0946, "step": 59500 }, { "epoch": 0.8766439374972387, "grad_norm": 1.4013023376464844, "learning_rate": 1.8327693304620463e-05, "loss": 0.086, "step": 59525 }, { "epoch": 0.8770121205873257, "grad_norm": 1.4766559600830078, "learning_rate": 1.8326056932653477e-05, "loss": 0.1019, "step": 59550 }, { "epoch": 0.8773803036774127, "grad_norm": 1.2658036947250366, "learning_rate": 1.8324420560686492e-05, "loss": 0.0889, "step": 59575 }, { "epoch": 0.8777484867674997, "grad_norm": 1.474961757659912, "learning_rate": 1.8322784188719506e-05, "loss": 0.088, "step": 59600 }, { "epoch": 0.8781166698575867, "grad_norm": 1.3331751823425293, "learning_rate": 1.8321147816752524e-05, "loss": 0.0889, "step": 59625 }, { "epoch": 0.8784848529476739, "grad_norm": 1.0465377569198608, "learning_rate": 1.831951144478554e-05, "loss": 0.0819, "step": 59650 }, { "epoch": 0.8788530360377609, "grad_norm": 1.4545894861221313, "learning_rate": 1.8317875072818553e-05, "loss": 0.09, "step": 59675 }, { "epoch": 0.8792212191278479, "grad_norm": 1.7496753931045532, "learning_rate": 1.831623870085157e-05, "loss": 0.0936, "step": 59700 }, { "epoch": 0.8795894022179349, "grad_norm": 1.6359854936599731, "learning_rate": 1.8314602328884585e-05, "loss": 0.0936, "step": 59725 }, { "epoch": 0.8799575853080219, "grad_norm": 1.7177306413650513, "learning_rate": 1.83129659569176e-05, "loss": 0.0924, "step": 59750 }, { "epoch": 0.880325768398109, "grad_norm": 0.9038068652153015, "learning_rate": 1.8311329584950618e-05, "loss": 0.0893, "step": 59775 }, { "epoch": 0.8806939514881961, "grad_norm": 1.7128791809082031, "learning_rate": 1.830969321298363e-05, "loss": 0.0939, "step": 59800 }, { "epoch": 0.8810621345782831, "grad_norm": 1.707631230354309, "learning_rate": 1.8308056841016646e-05, "loss": 0.0925, "step": 59825 }, { "epoch": 0.8814303176683701, "grad_norm": 1.3236687183380127, "learning_rate": 1.830642046904966e-05, "loss": 0.0977, "step": 59850 }, { "epoch": 0.8817985007584571, "grad_norm": 1.399573564529419, "learning_rate": 1.830478409708268e-05, "loss": 0.0956, "step": 59875 }, { "epoch": 0.8821666838485442, "grad_norm": 1.2191760540008545, "learning_rate": 1.8303147725115693e-05, "loss": 0.0807, "step": 59900 }, { "epoch": 0.8825348669386313, "grad_norm": 1.6837016344070435, "learning_rate": 1.8301511353148708e-05, "loss": 0.0893, "step": 59925 }, { "epoch": 0.8829030500287183, "grad_norm": 1.6863495111465454, "learning_rate": 1.8299874981181726e-05, "loss": 0.0905, "step": 59950 }, { "epoch": 0.8832712331188053, "grad_norm": 1.422714114189148, "learning_rate": 1.829823860921474e-05, "loss": 0.0909, "step": 59975 }, { "epoch": 0.8836394162088923, "grad_norm": 1.6514418125152588, "learning_rate": 1.8296602237247754e-05, "loss": 0.0926, "step": 60000 }, { "epoch": 0.8840075992989794, "grad_norm": 1.4973467588424683, "learning_rate": 1.829496586528077e-05, "loss": 0.0851, "step": 60025 }, { "epoch": 0.8843757823890664, "grad_norm": 1.7477022409439087, "learning_rate": 1.8293329493313787e-05, "loss": 0.0945, "step": 60050 }, { "epoch": 0.8847439654791535, "grad_norm": 0.9304346442222595, "learning_rate": 1.82916931213468e-05, "loss": 0.0966, "step": 60075 }, { "epoch": 0.8851121485692405, "grad_norm": 1.712140679359436, "learning_rate": 1.8290056749379816e-05, "loss": 0.0943, "step": 60100 }, { "epoch": 0.8854803316593276, "grad_norm": 1.0455185174942017, "learning_rate": 1.8288420377412833e-05, "loss": 0.0914, "step": 60125 }, { "epoch": 0.8858485147494146, "grad_norm": 1.478262186050415, "learning_rate": 1.8286784005445848e-05, "loss": 0.0829, "step": 60150 }, { "epoch": 0.8862166978395016, "grad_norm": 1.6686688661575317, "learning_rate": 1.8285147633478862e-05, "loss": 0.0839, "step": 60175 }, { "epoch": 0.8865848809295886, "grad_norm": 1.6570348739624023, "learning_rate": 1.828351126151188e-05, "loss": 0.0978, "step": 60200 }, { "epoch": 0.8869530640196757, "grad_norm": 1.4632148742675781, "learning_rate": 1.828187488954489e-05, "loss": 0.0935, "step": 60225 }, { "epoch": 0.8873212471097628, "grad_norm": 1.9386918544769287, "learning_rate": 1.828023851757791e-05, "loss": 0.0916, "step": 60250 }, { "epoch": 0.8876894301998498, "grad_norm": 1.4522777795791626, "learning_rate": 1.8278602145610924e-05, "loss": 0.0889, "step": 60275 }, { "epoch": 0.8880576132899368, "grad_norm": 1.8300904035568237, "learning_rate": 1.827696577364394e-05, "loss": 0.0979, "step": 60300 }, { "epoch": 0.8884257963800238, "grad_norm": 1.9495863914489746, "learning_rate": 1.8275329401676956e-05, "loss": 0.1011, "step": 60325 }, { "epoch": 0.8887939794701109, "grad_norm": 1.876254916191101, "learning_rate": 1.827369302970997e-05, "loss": 0.0896, "step": 60350 }, { "epoch": 0.889162162560198, "grad_norm": 1.2851182222366333, "learning_rate": 1.8272056657742988e-05, "loss": 0.0931, "step": 60375 }, { "epoch": 0.889530345650285, "grad_norm": 1.205654263496399, "learning_rate": 1.8270420285776003e-05, "loss": 0.0948, "step": 60400 }, { "epoch": 0.889898528740372, "grad_norm": 1.613236665725708, "learning_rate": 1.8268783913809017e-05, "loss": 0.0928, "step": 60425 }, { "epoch": 0.890266711830459, "grad_norm": 1.8041343688964844, "learning_rate": 1.826714754184203e-05, "loss": 0.0892, "step": 60450 }, { "epoch": 0.890634894920546, "grad_norm": 0.9721527695655823, "learning_rate": 1.8265511169875046e-05, "loss": 0.0912, "step": 60475 }, { "epoch": 0.8910030780106332, "grad_norm": 1.0499447584152222, "learning_rate": 1.8263940252786743e-05, "loss": 0.0893, "step": 60500 }, { "epoch": 0.8913712611007202, "grad_norm": 1.4288532733917236, "learning_rate": 1.8262303880819758e-05, "loss": 0.0914, "step": 60525 }, { "epoch": 0.8917394441908072, "grad_norm": 1.3860613107681274, "learning_rate": 1.8260667508852776e-05, "loss": 0.0884, "step": 60550 }, { "epoch": 0.8921076272808942, "grad_norm": 1.6695598363876343, "learning_rate": 1.825903113688579e-05, "loss": 0.0911, "step": 60575 }, { "epoch": 0.8924758103709812, "grad_norm": 1.0595664978027344, "learning_rate": 1.8257394764918805e-05, "loss": 0.0926, "step": 60600 }, { "epoch": 0.8928439934610684, "grad_norm": 1.530721664428711, "learning_rate": 1.8255758392951822e-05, "loss": 0.0904, "step": 60625 }, { "epoch": 0.8932121765511554, "grad_norm": 1.4969135522842407, "learning_rate": 1.8254122020984833e-05, "loss": 0.0937, "step": 60650 }, { "epoch": 0.8935803596412424, "grad_norm": 1.4802945852279663, "learning_rate": 1.825248564901785e-05, "loss": 0.0884, "step": 60675 }, { "epoch": 0.8939485427313294, "grad_norm": 1.5618170499801636, "learning_rate": 1.8250849277050866e-05, "loss": 0.092, "step": 60700 }, { "epoch": 0.8943167258214165, "grad_norm": 1.3365405797958374, "learning_rate": 1.824921290508388e-05, "loss": 0.0916, "step": 60725 }, { "epoch": 0.8946849089115035, "grad_norm": 1.8510189056396484, "learning_rate": 1.8247576533116898e-05, "loss": 0.0898, "step": 60750 }, { "epoch": 0.8950530920015906, "grad_norm": 1.214875340461731, "learning_rate": 1.8245940161149913e-05, "loss": 0.0954, "step": 60775 }, { "epoch": 0.8954212750916776, "grad_norm": 1.3900574445724487, "learning_rate": 1.824430378918293e-05, "loss": 0.0936, "step": 60800 }, { "epoch": 0.8957894581817646, "grad_norm": 1.2594184875488281, "learning_rate": 1.8242667417215945e-05, "loss": 0.0927, "step": 60825 }, { "epoch": 0.8961576412718517, "grad_norm": 1.2305347919464111, "learning_rate": 1.824103104524896e-05, "loss": 0.0864, "step": 60850 }, { "epoch": 0.8965258243619387, "grad_norm": 1.9893605709075928, "learning_rate": 1.8239394673281974e-05, "loss": 0.0894, "step": 60875 }, { "epoch": 0.8968940074520257, "grad_norm": 1.546601414680481, "learning_rate": 1.8237758301314988e-05, "loss": 0.0947, "step": 60900 }, { "epoch": 0.8972621905421128, "grad_norm": 1.658313512802124, "learning_rate": 1.8236121929348006e-05, "loss": 0.0953, "step": 60925 }, { "epoch": 0.8976303736321998, "grad_norm": 1.5878702402114868, "learning_rate": 1.823448555738102e-05, "loss": 0.0864, "step": 60950 }, { "epoch": 0.8979985567222869, "grad_norm": 1.3052133321762085, "learning_rate": 1.8232849185414035e-05, "loss": 0.0827, "step": 60975 }, { "epoch": 0.8983667398123739, "grad_norm": 1.3506282567977905, "learning_rate": 1.8231212813447053e-05, "loss": 0.0958, "step": 61000 }, { "epoch": 0.8987349229024609, "grad_norm": 1.0250307321548462, "learning_rate": 1.8229576441480067e-05, "loss": 0.0844, "step": 61025 }, { "epoch": 0.899103105992548, "grad_norm": 1.356543779373169, "learning_rate": 1.8227940069513085e-05, "loss": 0.0944, "step": 61050 }, { "epoch": 0.899471289082635, "grad_norm": 1.3725234270095825, "learning_rate": 1.8226303697546096e-05, "loss": 0.0887, "step": 61075 }, { "epoch": 0.8998394721727221, "grad_norm": 1.2289153337478638, "learning_rate": 1.8224667325579114e-05, "loss": 0.0874, "step": 61100 }, { "epoch": 0.9002076552628091, "grad_norm": 1.7129337787628174, "learning_rate": 1.822303095361213e-05, "loss": 0.0996, "step": 61125 }, { "epoch": 0.9005758383528961, "grad_norm": 1.4440714120864868, "learning_rate": 1.8221394581645143e-05, "loss": 0.0911, "step": 61150 }, { "epoch": 0.9009440214429831, "grad_norm": 1.5119160413742065, "learning_rate": 1.821975820967816e-05, "loss": 0.0898, "step": 61175 }, { "epoch": 0.9013122045330703, "grad_norm": 1.3910434246063232, "learning_rate": 1.8218121837711175e-05, "loss": 0.0949, "step": 61200 }, { "epoch": 0.9016803876231573, "grad_norm": 1.4344723224639893, "learning_rate": 1.8216485465744193e-05, "loss": 0.0929, "step": 61225 }, { "epoch": 0.9020485707132443, "grad_norm": 1.3344783782958984, "learning_rate": 1.8214849093777208e-05, "loss": 0.0879, "step": 61250 }, { "epoch": 0.9024167538033313, "grad_norm": 2.065082311630249, "learning_rate": 1.8213212721810222e-05, "loss": 0.0874, "step": 61275 }, { "epoch": 0.9027849368934183, "grad_norm": 1.8886113166809082, "learning_rate": 1.8211576349843236e-05, "loss": 0.09, "step": 61300 }, { "epoch": 0.9031531199835054, "grad_norm": 1.7450238466262817, "learning_rate": 1.820993997787625e-05, "loss": 0.0908, "step": 61325 }, { "epoch": 0.9035213030735925, "grad_norm": 1.3400936126708984, "learning_rate": 1.820830360590927e-05, "loss": 0.0966, "step": 61350 }, { "epoch": 0.9038894861636795, "grad_norm": 1.3660224676132202, "learning_rate": 1.8206667233942283e-05, "loss": 0.0878, "step": 61375 }, { "epoch": 0.9042576692537665, "grad_norm": 1.0200551748275757, "learning_rate": 1.8205030861975298e-05, "loss": 0.0861, "step": 61400 }, { "epoch": 0.9046258523438535, "grad_norm": 1.4215823411941528, "learning_rate": 1.8203394490008316e-05, "loss": 0.0835, "step": 61425 }, { "epoch": 0.9049940354339406, "grad_norm": 1.767581820487976, "learning_rate": 1.820175811804133e-05, "loss": 0.0911, "step": 61450 }, { "epoch": 0.9053622185240277, "grad_norm": 1.8444174528121948, "learning_rate": 1.8200121746074348e-05, "loss": 0.0993, "step": 61475 }, { "epoch": 0.9057304016141147, "grad_norm": 1.2559212446212769, "learning_rate": 1.819848537410736e-05, "loss": 0.0895, "step": 61500 }, { "epoch": 0.9060985847042017, "grad_norm": 1.2072019577026367, "learning_rate": 1.8196849002140377e-05, "loss": 0.0901, "step": 61525 }, { "epoch": 0.9064667677942887, "grad_norm": 1.5533256530761719, "learning_rate": 1.819521263017339e-05, "loss": 0.0838, "step": 61550 }, { "epoch": 0.9068349508843758, "grad_norm": 1.1651979684829712, "learning_rate": 1.8193576258206406e-05, "loss": 0.0952, "step": 61575 }, { "epoch": 0.9072031339744628, "grad_norm": 1.433811068534851, "learning_rate": 1.8191939886239423e-05, "loss": 0.086, "step": 61600 }, { "epoch": 0.9075713170645499, "grad_norm": 1.8967841863632202, "learning_rate": 1.8190303514272438e-05, "loss": 0.0869, "step": 61625 }, { "epoch": 0.9079395001546369, "grad_norm": 1.5940126180648804, "learning_rate": 1.8188667142305452e-05, "loss": 0.0841, "step": 61650 }, { "epoch": 0.9083076832447239, "grad_norm": 1.2679402828216553, "learning_rate": 1.818703077033847e-05, "loss": 0.0844, "step": 61675 }, { "epoch": 0.908675866334811, "grad_norm": 1.1893339157104492, "learning_rate": 1.8185394398371485e-05, "loss": 0.0977, "step": 61700 }, { "epoch": 0.909044049424898, "grad_norm": 1.3891805410385132, "learning_rate": 1.81837580264045e-05, "loss": 0.0919, "step": 61725 }, { "epoch": 0.909412232514985, "grad_norm": 1.474007487297058, "learning_rate": 1.8182121654437514e-05, "loss": 0.0919, "step": 61750 }, { "epoch": 0.9097804156050721, "grad_norm": 1.4935157299041748, "learning_rate": 1.818048528247053e-05, "loss": 0.0823, "step": 61775 }, { "epoch": 0.9101485986951592, "grad_norm": 1.199958324432373, "learning_rate": 1.8178848910503546e-05, "loss": 0.0868, "step": 61800 }, { "epoch": 0.9105167817852462, "grad_norm": 1.1490696668624878, "learning_rate": 1.817721253853656e-05, "loss": 0.0945, "step": 61825 }, { "epoch": 0.9108849648753332, "grad_norm": 1.5686537027359009, "learning_rate": 1.8175576166569578e-05, "loss": 0.0968, "step": 61850 }, { "epoch": 0.9112531479654202, "grad_norm": 1.8495757579803467, "learning_rate": 1.8173939794602593e-05, "loss": 0.0936, "step": 61875 }, { "epoch": 0.9116213310555072, "grad_norm": 1.4681916236877441, "learning_rate": 1.8172303422635607e-05, "loss": 0.0918, "step": 61900 }, { "epoch": 0.9119895141455944, "grad_norm": 1.4203157424926758, "learning_rate": 1.817066705066862e-05, "loss": 0.09, "step": 61925 }, { "epoch": 0.9123576972356814, "grad_norm": 1.233629584312439, "learning_rate": 1.816903067870164e-05, "loss": 0.0806, "step": 61950 }, { "epoch": 0.9127258803257684, "grad_norm": 1.7074884176254272, "learning_rate": 1.8167394306734654e-05, "loss": 0.0925, "step": 61975 }, { "epoch": 0.9130940634158554, "grad_norm": 1.5755255222320557, "learning_rate": 1.816575793476767e-05, "loss": 0.0877, "step": 62000 }, { "epoch": 0.9134622465059424, "grad_norm": 1.721422553062439, "learning_rate": 1.8164121562800686e-05, "loss": 0.0889, "step": 62025 }, { "epoch": 0.9138304295960296, "grad_norm": 1.6015130281448364, "learning_rate": 1.81624851908337e-05, "loss": 0.091, "step": 62050 }, { "epoch": 0.9141986126861166, "grad_norm": 1.6153591871261597, "learning_rate": 1.8160848818866715e-05, "loss": 0.0935, "step": 62075 }, { "epoch": 0.9145667957762036, "grad_norm": 1.5466082096099854, "learning_rate": 1.8159212446899733e-05, "loss": 0.0951, "step": 62100 }, { "epoch": 0.9149349788662906, "grad_norm": 1.756897211074829, "learning_rate": 1.8157576074932744e-05, "loss": 0.0825, "step": 62125 }, { "epoch": 0.9153031619563776, "grad_norm": 1.7807000875473022, "learning_rate": 1.8155939702965762e-05, "loss": 0.1, "step": 62150 }, { "epoch": 0.9156713450464647, "grad_norm": 1.0893385410308838, "learning_rate": 1.8154303330998776e-05, "loss": 0.0848, "step": 62175 }, { "epoch": 0.9160395281365518, "grad_norm": 1.796811819076538, "learning_rate": 1.8152666959031794e-05, "loss": 0.0851, "step": 62200 }, { "epoch": 0.9164077112266388, "grad_norm": 1.8552054166793823, "learning_rate": 1.815103058706481e-05, "loss": 0.0944, "step": 62225 }, { "epoch": 0.9167758943167258, "grad_norm": 1.2951158285140991, "learning_rate": 1.8149394215097823e-05, "loss": 0.0847, "step": 62250 }, { "epoch": 0.9171440774068128, "grad_norm": 1.18398916721344, "learning_rate": 1.814775784313084e-05, "loss": 0.0954, "step": 62275 }, { "epoch": 0.9175122604968999, "grad_norm": 1.342115879058838, "learning_rate": 1.8146121471163855e-05, "loss": 0.0852, "step": 62300 }, { "epoch": 0.917880443586987, "grad_norm": 1.480775237083435, "learning_rate": 1.814448509919687e-05, "loss": 0.0837, "step": 62325 }, { "epoch": 0.918248626677074, "grad_norm": 1.4389697313308716, "learning_rate": 1.8142848727229884e-05, "loss": 0.0834, "step": 62350 }, { "epoch": 0.918616809767161, "grad_norm": 1.9070498943328857, "learning_rate": 1.8141212355262902e-05, "loss": 0.0931, "step": 62375 }, { "epoch": 0.9189849928572481, "grad_norm": 1.3322042226791382, "learning_rate": 1.8139575983295917e-05, "loss": 0.0942, "step": 62400 }, { "epoch": 0.9193531759473351, "grad_norm": 1.0659507513046265, "learning_rate": 1.813793961132893e-05, "loss": 0.0828, "step": 62425 }, { "epoch": 0.9197213590374221, "grad_norm": 1.1807823181152344, "learning_rate": 1.813630323936195e-05, "loss": 0.0868, "step": 62450 }, { "epoch": 0.9200895421275092, "grad_norm": 1.2817989587783813, "learning_rate": 1.8134666867394963e-05, "loss": 0.088, "step": 62475 }, { "epoch": 0.9204577252175962, "grad_norm": 1.4101269245147705, "learning_rate": 1.8133030495427978e-05, "loss": 0.0847, "step": 62500 }, { "epoch": 0.9208259083076833, "grad_norm": 1.565391182899475, "learning_rate": 1.8131394123460996e-05, "loss": 0.082, "step": 62525 }, { "epoch": 0.9211940913977703, "grad_norm": 1.62588369846344, "learning_rate": 1.8129757751494007e-05, "loss": 0.0911, "step": 62550 }, { "epoch": 0.9215622744878573, "grad_norm": 1.4808884859085083, "learning_rate": 1.8128121379527025e-05, "loss": 0.0896, "step": 62575 }, { "epoch": 0.9219304575779443, "grad_norm": 1.2906694412231445, "learning_rate": 1.812648500756004e-05, "loss": 0.0846, "step": 62600 }, { "epoch": 0.9222986406680314, "grad_norm": 1.2689462900161743, "learning_rate": 1.8124848635593057e-05, "loss": 0.0816, "step": 62625 }, { "epoch": 0.9226668237581185, "grad_norm": 1.4023360013961792, "learning_rate": 1.812321226362607e-05, "loss": 0.0858, "step": 62650 }, { "epoch": 0.9230350068482055, "grad_norm": 1.5279533863067627, "learning_rate": 1.8121575891659086e-05, "loss": 0.0918, "step": 62675 }, { "epoch": 0.9234031899382925, "grad_norm": 1.5480386018753052, "learning_rate": 1.8119939519692104e-05, "loss": 0.0972, "step": 62700 }, { "epoch": 0.9237713730283795, "grad_norm": 1.862874984741211, "learning_rate": 1.8118368602603798e-05, "loss": 0.088, "step": 62725 }, { "epoch": 0.9241395561184665, "grad_norm": 1.6180002689361572, "learning_rate": 1.8116732230636812e-05, "loss": 0.0937, "step": 62750 }, { "epoch": 0.9245077392085537, "grad_norm": 1.143884539604187, "learning_rate": 1.8115095858669826e-05, "loss": 0.0916, "step": 62775 }, { "epoch": 0.9248759222986407, "grad_norm": 1.1028547286987305, "learning_rate": 1.811345948670284e-05, "loss": 0.0871, "step": 62800 }, { "epoch": 0.9252441053887277, "grad_norm": 1.703330636024475, "learning_rate": 1.811182311473586e-05, "loss": 0.0888, "step": 62825 }, { "epoch": 0.9256122884788147, "grad_norm": 1.5463730096817017, "learning_rate": 1.8110186742768873e-05, "loss": 0.0924, "step": 62850 }, { "epoch": 0.9259804715689017, "grad_norm": 1.4528677463531494, "learning_rate": 1.810855037080189e-05, "loss": 0.0929, "step": 62875 }, { "epoch": 0.9263486546589889, "grad_norm": 1.3995440006256104, "learning_rate": 1.8106913998834906e-05, "loss": 0.0921, "step": 62900 }, { "epoch": 0.9267168377490759, "grad_norm": 1.1789085865020752, "learning_rate": 1.810527762686792e-05, "loss": 0.0845, "step": 62925 }, { "epoch": 0.9270850208391629, "grad_norm": 1.8213672637939453, "learning_rate": 1.8103641254900938e-05, "loss": 0.0909, "step": 62950 }, { "epoch": 0.9274532039292499, "grad_norm": 1.1578150987625122, "learning_rate": 1.810200488293395e-05, "loss": 0.0821, "step": 62975 }, { "epoch": 0.927821387019337, "grad_norm": 0.9961247444152832, "learning_rate": 1.8100368510966967e-05, "loss": 0.0891, "step": 63000 }, { "epoch": 0.928189570109424, "grad_norm": 1.5190428495407104, "learning_rate": 1.809873213899998e-05, "loss": 0.0993, "step": 63025 }, { "epoch": 0.9285577531995111, "grad_norm": 1.2491146326065063, "learning_rate": 1.8097095767032996e-05, "loss": 0.0835, "step": 63050 }, { "epoch": 0.9289259362895981, "grad_norm": 1.6360255479812622, "learning_rate": 1.8095459395066014e-05, "loss": 0.09, "step": 63075 }, { "epoch": 0.9292941193796851, "grad_norm": 1.5824953317642212, "learning_rate": 1.8093823023099028e-05, "loss": 0.0965, "step": 63100 }, { "epoch": 0.9296623024697722, "grad_norm": 1.808732509613037, "learning_rate": 1.8092186651132046e-05, "loss": 0.0821, "step": 63125 }, { "epoch": 0.9300304855598592, "grad_norm": 1.3798719644546509, "learning_rate": 1.809055027916506e-05, "loss": 0.0911, "step": 63150 }, { "epoch": 0.9303986686499462, "grad_norm": 1.413038730621338, "learning_rate": 1.8088913907198075e-05, "loss": 0.0941, "step": 63175 }, { "epoch": 0.9307668517400333, "grad_norm": 1.307576060295105, "learning_rate": 1.808727753523109e-05, "loss": 0.0899, "step": 63200 }, { "epoch": 0.9311350348301203, "grad_norm": 1.3909647464752197, "learning_rate": 1.8085641163264104e-05, "loss": 0.0864, "step": 63225 }, { "epoch": 0.9315032179202074, "grad_norm": 1.2756690979003906, "learning_rate": 1.808400479129712e-05, "loss": 0.0881, "step": 63250 }, { "epoch": 0.9318714010102944, "grad_norm": 1.1651579141616821, "learning_rate": 1.8082368419330136e-05, "loss": 0.0852, "step": 63275 }, { "epoch": 0.9322395841003814, "grad_norm": 1.597281813621521, "learning_rate": 1.808073204736315e-05, "loss": 0.0862, "step": 63300 }, { "epoch": 0.9326077671904685, "grad_norm": 1.5948517322540283, "learning_rate": 1.8079095675396168e-05, "loss": 0.0864, "step": 63325 }, { "epoch": 0.9329759502805555, "grad_norm": 1.3347128629684448, "learning_rate": 1.8077459303429183e-05, "loss": 0.0899, "step": 63350 }, { "epoch": 0.9333441333706426, "grad_norm": 1.7122323513031006, "learning_rate": 1.8075822931462197e-05, "loss": 0.0878, "step": 63375 }, { "epoch": 0.9337123164607296, "grad_norm": 0.9531201720237732, "learning_rate": 1.807418655949521e-05, "loss": 0.0792, "step": 63400 }, { "epoch": 0.9340804995508166, "grad_norm": 1.0709425210952759, "learning_rate": 1.807255018752823e-05, "loss": 0.0822, "step": 63425 }, { "epoch": 0.9344486826409036, "grad_norm": 1.5660427808761597, "learning_rate": 1.8070913815561244e-05, "loss": 0.0841, "step": 63450 }, { "epoch": 0.9348168657309907, "grad_norm": 1.3853843212127686, "learning_rate": 1.806927744359426e-05, "loss": 0.0807, "step": 63475 }, { "epoch": 0.9351850488210778, "grad_norm": 1.7013237476348877, "learning_rate": 1.8067641071627276e-05, "loss": 0.0869, "step": 63500 }, { "epoch": 0.9355532319111648, "grad_norm": 1.189096450805664, "learning_rate": 1.806600469966029e-05, "loss": 0.0831, "step": 63525 }, { "epoch": 0.9359214150012518, "grad_norm": 1.7477755546569824, "learning_rate": 1.806436832769331e-05, "loss": 0.0886, "step": 63550 }, { "epoch": 0.9362895980913388, "grad_norm": 1.5616755485534668, "learning_rate": 1.8062731955726323e-05, "loss": 0.0969, "step": 63575 }, { "epoch": 0.936657781181426, "grad_norm": 1.5929596424102783, "learning_rate": 1.8061095583759337e-05, "loss": 0.0848, "step": 63600 }, { "epoch": 0.937025964271513, "grad_norm": 1.4864095449447632, "learning_rate": 1.8059459211792352e-05, "loss": 0.0793, "step": 63625 }, { "epoch": 0.9373941473616, "grad_norm": 1.4461345672607422, "learning_rate": 1.8057822839825366e-05, "loss": 0.0941, "step": 63650 }, { "epoch": 0.937762330451687, "grad_norm": 1.7591949701309204, "learning_rate": 1.8056186467858384e-05, "loss": 0.0882, "step": 63675 }, { "epoch": 0.938130513541774, "grad_norm": 1.4366741180419922, "learning_rate": 1.80545500958914e-05, "loss": 0.0858, "step": 63700 }, { "epoch": 0.9384986966318611, "grad_norm": 1.7040621042251587, "learning_rate": 1.8052913723924413e-05, "loss": 0.0942, "step": 63725 }, { "epoch": 0.9388668797219482, "grad_norm": 1.5122658014297485, "learning_rate": 1.805127735195743e-05, "loss": 0.0897, "step": 63750 }, { "epoch": 0.9392350628120352, "grad_norm": 1.209884524345398, "learning_rate": 1.8049640979990445e-05, "loss": 0.0847, "step": 63775 }, { "epoch": 0.9396032459021222, "grad_norm": 1.0629122257232666, "learning_rate": 1.804800460802346e-05, "loss": 0.0868, "step": 63800 }, { "epoch": 0.9399714289922092, "grad_norm": 1.631815791130066, "learning_rate": 1.8046368236056474e-05, "loss": 0.0932, "step": 63825 }, { "epoch": 0.9403396120822963, "grad_norm": 1.504408597946167, "learning_rate": 1.8044731864089492e-05, "loss": 0.0873, "step": 63850 }, { "epoch": 0.9407077951723833, "grad_norm": 1.6363877058029175, "learning_rate": 1.8043095492122507e-05, "loss": 0.0885, "step": 63875 }, { "epoch": 0.9410759782624704, "grad_norm": 1.272243618965149, "learning_rate": 1.804145912015552e-05, "loss": 0.0945, "step": 63900 }, { "epoch": 0.9414441613525574, "grad_norm": 1.7437857389450073, "learning_rate": 1.803982274818854e-05, "loss": 0.0916, "step": 63925 }, { "epoch": 0.9418123444426444, "grad_norm": 1.6619237661361694, "learning_rate": 1.8038186376221553e-05, "loss": 0.093, "step": 63950 }, { "epoch": 0.9421805275327315, "grad_norm": 1.2458175420761108, "learning_rate": 1.8036550004254568e-05, "loss": 0.0816, "step": 63975 }, { "epoch": 0.9425487106228185, "grad_norm": 1.0379489660263062, "learning_rate": 1.8034913632287586e-05, "loss": 0.0911, "step": 64000 }, { "epoch": 0.9429168937129055, "grad_norm": 1.926479458808899, "learning_rate": 1.80332772603206e-05, "loss": 0.0919, "step": 64025 }, { "epoch": 0.9432850768029926, "grad_norm": 1.343337893486023, "learning_rate": 1.8031640888353615e-05, "loss": 0.0873, "step": 64050 }, { "epoch": 0.9436532598930796, "grad_norm": 1.640823245048523, "learning_rate": 1.803000451638663e-05, "loss": 0.0845, "step": 64075 }, { "epoch": 0.9440214429831667, "grad_norm": 0.8465896844863892, "learning_rate": 1.8028368144419647e-05, "loss": 0.0947, "step": 64100 }, { "epoch": 0.9443896260732537, "grad_norm": 1.8323887586593628, "learning_rate": 1.802673177245266e-05, "loss": 0.0913, "step": 64125 }, { "epoch": 0.9447578091633407, "grad_norm": 1.5387301445007324, "learning_rate": 1.8025095400485676e-05, "loss": 0.0935, "step": 64150 }, { "epoch": 0.9451259922534277, "grad_norm": 1.2456954717636108, "learning_rate": 1.8023459028518694e-05, "loss": 0.089, "step": 64175 }, { "epoch": 0.9454941753435149, "grad_norm": 1.3453234434127808, "learning_rate": 1.8021822656551708e-05, "loss": 0.0907, "step": 64200 }, { "epoch": 0.9458623584336019, "grad_norm": 1.6878015995025635, "learning_rate": 1.8020186284584723e-05, "loss": 0.093, "step": 64225 }, { "epoch": 0.9462305415236889, "grad_norm": 1.2218587398529053, "learning_rate": 1.8018549912617737e-05, "loss": 0.0941, "step": 64250 }, { "epoch": 0.9465987246137759, "grad_norm": 1.3658645153045654, "learning_rate": 1.8016913540650755e-05, "loss": 0.084, "step": 64275 }, { "epoch": 0.9469669077038629, "grad_norm": 1.2594447135925293, "learning_rate": 1.801527716868377e-05, "loss": 0.0889, "step": 64300 }, { "epoch": 0.9473350907939501, "grad_norm": 1.2579340934753418, "learning_rate": 1.8013640796716784e-05, "loss": 0.0823, "step": 64325 }, { "epoch": 0.9477032738840371, "grad_norm": 1.435950517654419, "learning_rate": 1.80120044247498e-05, "loss": 0.0898, "step": 64350 }, { "epoch": 0.9480714569741241, "grad_norm": 1.629874587059021, "learning_rate": 1.8010368052782816e-05, "loss": 0.0882, "step": 64375 }, { "epoch": 0.9484396400642111, "grad_norm": 1.4672939777374268, "learning_rate": 1.800873168081583e-05, "loss": 0.0854, "step": 64400 }, { "epoch": 0.9488078231542981, "grad_norm": 1.4122424125671387, "learning_rate": 1.800709530884885e-05, "loss": 0.09, "step": 64425 }, { "epoch": 0.9491760062443853, "grad_norm": 1.3469072580337524, "learning_rate": 1.800545893688186e-05, "loss": 0.0876, "step": 64450 }, { "epoch": 0.9495441893344723, "grad_norm": 1.363897442817688, "learning_rate": 1.8003822564914877e-05, "loss": 0.0828, "step": 64475 }, { "epoch": 0.9499123724245593, "grad_norm": 1.1692781448364258, "learning_rate": 1.8002186192947892e-05, "loss": 0.0847, "step": 64500 }, { "epoch": 0.9502805555146463, "grad_norm": 1.439585566520691, "learning_rate": 1.800054982098091e-05, "loss": 0.0803, "step": 64525 }, { "epoch": 0.9506487386047333, "grad_norm": 1.2381300926208496, "learning_rate": 1.7998913449013924e-05, "loss": 0.0916, "step": 64550 }, { "epoch": 0.9510169216948204, "grad_norm": 1.4574675559997559, "learning_rate": 1.799727707704694e-05, "loss": 0.0945, "step": 64575 }, { "epoch": 0.9513851047849075, "grad_norm": 1.155951738357544, "learning_rate": 1.7995640705079956e-05, "loss": 0.0871, "step": 64600 }, { "epoch": 0.9517532878749945, "grad_norm": 1.1017684936523438, "learning_rate": 1.799400433311297e-05, "loss": 0.0848, "step": 64625 }, { "epoch": 0.9521214709650815, "grad_norm": 1.5029999017715454, "learning_rate": 1.7992367961145985e-05, "loss": 0.0889, "step": 64650 }, { "epoch": 0.9524896540551685, "grad_norm": 1.666143536567688, "learning_rate": 1.7990731589179e-05, "loss": 0.0928, "step": 64675 }, { "epoch": 0.9528578371452556, "grad_norm": 1.2904329299926758, "learning_rate": 1.7989095217212014e-05, "loss": 0.0915, "step": 64700 }, { "epoch": 0.9532260202353426, "grad_norm": 1.4966198205947876, "learning_rate": 1.7987458845245032e-05, "loss": 0.0785, "step": 64725 }, { "epoch": 0.9535942033254297, "grad_norm": 1.3005931377410889, "learning_rate": 1.7985822473278047e-05, "loss": 0.0902, "step": 64750 }, { "epoch": 0.9539623864155167, "grad_norm": 1.4355655908584595, "learning_rate": 1.7984186101311064e-05, "loss": 0.089, "step": 64775 }, { "epoch": 0.9543305695056038, "grad_norm": 1.433249831199646, "learning_rate": 1.798254972934408e-05, "loss": 0.0867, "step": 64800 }, { "epoch": 0.9546987525956908, "grad_norm": 1.392223596572876, "learning_rate": 1.7980913357377093e-05, "loss": 0.0848, "step": 64825 }, { "epoch": 0.9550669356857778, "grad_norm": 1.0814814567565918, "learning_rate": 1.7979276985410108e-05, "loss": 0.0905, "step": 64850 }, { "epoch": 0.9554351187758648, "grad_norm": 1.419834852218628, "learning_rate": 1.7977640613443122e-05, "loss": 0.0867, "step": 64875 }, { "epoch": 0.9558033018659519, "grad_norm": 1.4989286661148071, "learning_rate": 1.797600424147614e-05, "loss": 0.091, "step": 64900 }, { "epoch": 0.956171484956039, "grad_norm": 1.759748101234436, "learning_rate": 1.7974367869509155e-05, "loss": 0.0906, "step": 64925 }, { "epoch": 0.956539668046126, "grad_norm": 1.4901723861694336, "learning_rate": 1.7972731497542172e-05, "loss": 0.0883, "step": 64950 }, { "epoch": 0.956907851136213, "grad_norm": 1.6070477962493896, "learning_rate": 1.7971095125575187e-05, "loss": 0.0933, "step": 64975 }, { "epoch": 0.9572760342263, "grad_norm": 1.7674330472946167, "learning_rate": 1.79694587536082e-05, "loss": 0.0873, "step": 65000 }, { "epoch": 0.957644217316387, "grad_norm": 1.4405423402786255, "learning_rate": 1.796782238164122e-05, "loss": 0.0852, "step": 65025 }, { "epoch": 0.9580124004064742, "grad_norm": 1.4775818586349487, "learning_rate": 1.7966186009674234e-05, "loss": 0.087, "step": 65050 }, { "epoch": 0.9583805834965612, "grad_norm": 1.1930218935012817, "learning_rate": 1.7964549637707248e-05, "loss": 0.0888, "step": 65075 }, { "epoch": 0.9587487665866482, "grad_norm": 1.4698896408081055, "learning_rate": 1.7962913265740262e-05, "loss": 0.0837, "step": 65100 }, { "epoch": 0.9591169496767352, "grad_norm": 1.6088793277740479, "learning_rate": 1.7961276893773277e-05, "loss": 0.0866, "step": 65125 }, { "epoch": 0.9594851327668222, "grad_norm": 1.456915259361267, "learning_rate": 1.7959640521806295e-05, "loss": 0.0943, "step": 65150 }, { "epoch": 0.9598533158569094, "grad_norm": 1.4380178451538086, "learning_rate": 1.795800414983931e-05, "loss": 0.0908, "step": 65175 }, { "epoch": 0.9602214989469964, "grad_norm": 1.2547969818115234, "learning_rate": 1.7956367777872327e-05, "loss": 0.0879, "step": 65200 }, { "epoch": 0.9605896820370834, "grad_norm": 1.3286045789718628, "learning_rate": 1.795473140590534e-05, "loss": 0.0828, "step": 65225 }, { "epoch": 0.9609578651271704, "grad_norm": 1.7631367444992065, "learning_rate": 1.7953095033938356e-05, "loss": 0.0906, "step": 65250 }, { "epoch": 0.9613260482172574, "grad_norm": 1.6213173866271973, "learning_rate": 1.795145866197137e-05, "loss": 0.096, "step": 65275 }, { "epoch": 0.9616942313073445, "grad_norm": 1.1831706762313843, "learning_rate": 1.7949887744883064e-05, "loss": 0.0909, "step": 65300 }, { "epoch": 0.9620624143974316, "grad_norm": 1.2344462871551514, "learning_rate": 1.7948251372916082e-05, "loss": 0.0874, "step": 65325 }, { "epoch": 0.9624305974875186, "grad_norm": 1.1486679315567017, "learning_rate": 1.7946615000949097e-05, "loss": 0.0888, "step": 65350 }, { "epoch": 0.9627987805776056, "grad_norm": 1.5928605794906616, "learning_rate": 1.794497862898211e-05, "loss": 0.0945, "step": 65375 }, { "epoch": 0.9631669636676927, "grad_norm": 1.2084425687789917, "learning_rate": 1.794334225701513e-05, "loss": 0.0838, "step": 65400 }, { "epoch": 0.9635351467577797, "grad_norm": 1.0286258459091187, "learning_rate": 1.7941705885048143e-05, "loss": 0.0815, "step": 65425 }, { "epoch": 0.9639033298478668, "grad_norm": 1.072543740272522, "learning_rate": 1.794006951308116e-05, "loss": 0.0802, "step": 65450 }, { "epoch": 0.9642715129379538, "grad_norm": 1.6432961225509644, "learning_rate": 1.7938433141114176e-05, "loss": 0.0818, "step": 65475 }, { "epoch": 0.9646396960280408, "grad_norm": 1.8096356391906738, "learning_rate": 1.793679676914719e-05, "loss": 0.0794, "step": 65500 }, { "epoch": 0.9650078791181279, "grad_norm": 1.352291226387024, "learning_rate": 1.7935160397180205e-05, "loss": 0.0908, "step": 65525 }, { "epoch": 0.9653760622082149, "grad_norm": 1.9157545566558838, "learning_rate": 1.793352402521322e-05, "loss": 0.0926, "step": 65550 }, { "epoch": 0.9657442452983019, "grad_norm": 1.394393801689148, "learning_rate": 1.7931887653246237e-05, "loss": 0.0832, "step": 65575 }, { "epoch": 0.966112428388389, "grad_norm": 1.070513129234314, "learning_rate": 1.793025128127925e-05, "loss": 0.0883, "step": 65600 }, { "epoch": 0.966480611478476, "grad_norm": 1.1968494653701782, "learning_rate": 1.7928614909312266e-05, "loss": 0.0914, "step": 65625 }, { "epoch": 0.9668487945685631, "grad_norm": 1.3151240348815918, "learning_rate": 1.7926978537345284e-05, "loss": 0.0809, "step": 65650 }, { "epoch": 0.9672169776586501, "grad_norm": 2.005269765853882, "learning_rate": 1.7925342165378298e-05, "loss": 0.08, "step": 65675 }, { "epoch": 0.9675851607487371, "grad_norm": 1.4291810989379883, "learning_rate": 1.7923705793411313e-05, "loss": 0.0909, "step": 65700 }, { "epoch": 0.9679533438388241, "grad_norm": 1.40546715259552, "learning_rate": 1.7922069421444327e-05, "loss": 0.0835, "step": 65725 }, { "epoch": 0.9683215269289112, "grad_norm": 1.1090855598449707, "learning_rate": 1.7920433049477345e-05, "loss": 0.0898, "step": 65750 }, { "epoch": 0.9686897100189983, "grad_norm": 1.6328848600387573, "learning_rate": 1.791879667751036e-05, "loss": 0.0844, "step": 65775 }, { "epoch": 0.9690578931090853, "grad_norm": 1.6949405670166016, "learning_rate": 1.7917160305543374e-05, "loss": 0.0921, "step": 65800 }, { "epoch": 0.9694260761991723, "grad_norm": 1.4263778924942017, "learning_rate": 1.791552393357639e-05, "loss": 0.091, "step": 65825 }, { "epoch": 0.9697942592892593, "grad_norm": 1.6204248666763306, "learning_rate": 1.7913887561609406e-05, "loss": 0.0946, "step": 65850 }, { "epoch": 0.9701624423793465, "grad_norm": 1.2282878160476685, "learning_rate": 1.791225118964242e-05, "loss": 0.0915, "step": 65875 }, { "epoch": 0.9705306254694335, "grad_norm": 1.5596424341201782, "learning_rate": 1.791061481767544e-05, "loss": 0.0858, "step": 65900 }, { "epoch": 0.9708988085595205, "grad_norm": 1.3790189027786255, "learning_rate": 1.7908978445708453e-05, "loss": 0.0818, "step": 65925 }, { "epoch": 0.9712669916496075, "grad_norm": 1.2927910089492798, "learning_rate": 1.7907342073741467e-05, "loss": 0.0882, "step": 65950 }, { "epoch": 0.9716351747396945, "grad_norm": 1.5115795135498047, "learning_rate": 1.7905705701774482e-05, "loss": 0.0839, "step": 65975 }, { "epoch": 0.9720033578297816, "grad_norm": 1.2746201753616333, "learning_rate": 1.79040693298075e-05, "loss": 0.0875, "step": 66000 }, { "epoch": 0.9723715409198687, "grad_norm": 1.374213695526123, "learning_rate": 1.7902432957840514e-05, "loss": 0.084, "step": 66025 }, { "epoch": 0.9727397240099557, "grad_norm": 1.3423460721969604, "learning_rate": 1.790079658587353e-05, "loss": 0.0831, "step": 66050 }, { "epoch": 0.9731079071000427, "grad_norm": 1.528391718864441, "learning_rate": 1.7899160213906546e-05, "loss": 0.093, "step": 66075 }, { "epoch": 0.9734760901901297, "grad_norm": 1.7538795471191406, "learning_rate": 1.789752384193956e-05, "loss": 0.0958, "step": 66100 }, { "epoch": 0.9738442732802168, "grad_norm": 1.7096960544586182, "learning_rate": 1.7895887469972575e-05, "loss": 0.0906, "step": 66125 }, { "epoch": 0.9742124563703038, "grad_norm": 0.8853684663772583, "learning_rate": 1.789425109800559e-05, "loss": 0.0814, "step": 66150 }, { "epoch": 0.9745806394603909, "grad_norm": 1.4343311786651611, "learning_rate": 1.7892614726038608e-05, "loss": 0.0886, "step": 66175 }, { "epoch": 0.9749488225504779, "grad_norm": 1.0935750007629395, "learning_rate": 1.7890978354071622e-05, "loss": 0.0778, "step": 66200 }, { "epoch": 0.9753170056405649, "grad_norm": 1.5821186304092407, "learning_rate": 1.7889341982104637e-05, "loss": 0.087, "step": 66225 }, { "epoch": 0.975685188730652, "grad_norm": 1.5097957849502563, "learning_rate": 1.7887705610137654e-05, "loss": 0.0859, "step": 66250 }, { "epoch": 0.976053371820739, "grad_norm": 1.6766763925552368, "learning_rate": 1.788606923817067e-05, "loss": 0.0918, "step": 66275 }, { "epoch": 0.976421554910826, "grad_norm": 1.2421185970306396, "learning_rate": 1.7884432866203683e-05, "loss": 0.088, "step": 66300 }, { "epoch": 0.9767897380009131, "grad_norm": 1.2647916078567505, "learning_rate": 1.78827964942367e-05, "loss": 0.0853, "step": 66325 }, { "epoch": 0.9771579210910001, "grad_norm": 1.9773876667022705, "learning_rate": 1.7881160122269716e-05, "loss": 0.0875, "step": 66350 }, { "epoch": 0.9775261041810872, "grad_norm": 1.2135019302368164, "learning_rate": 1.787952375030273e-05, "loss": 0.0867, "step": 66375 }, { "epoch": 0.9778942872711742, "grad_norm": 1.0465253591537476, "learning_rate": 1.7877887378335745e-05, "loss": 0.0843, "step": 66400 }, { "epoch": 0.9782624703612612, "grad_norm": 1.4281352758407593, "learning_rate": 1.7876251006368762e-05, "loss": 0.0912, "step": 66425 }, { "epoch": 0.9786306534513483, "grad_norm": 1.832780361175537, "learning_rate": 1.7874614634401777e-05, "loss": 0.0888, "step": 66450 }, { "epoch": 0.9789988365414354, "grad_norm": 1.7653343677520752, "learning_rate": 1.787297826243479e-05, "loss": 0.0858, "step": 66475 }, { "epoch": 0.9793670196315224, "grad_norm": 0.9594721794128418, "learning_rate": 1.787134189046781e-05, "loss": 0.0868, "step": 66500 }, { "epoch": 0.9797352027216094, "grad_norm": 1.0360686779022217, "learning_rate": 1.7869705518500824e-05, "loss": 0.0889, "step": 66525 }, { "epoch": 0.9801033858116964, "grad_norm": 1.1977566480636597, "learning_rate": 1.7868069146533838e-05, "loss": 0.0826, "step": 66550 }, { "epoch": 0.9804715689017834, "grad_norm": 1.925881266593933, "learning_rate": 1.7866432774566852e-05, "loss": 0.0841, "step": 66575 }, { "epoch": 0.9808397519918706, "grad_norm": 1.0635563135147095, "learning_rate": 1.786479640259987e-05, "loss": 0.0815, "step": 66600 }, { "epoch": 0.9812079350819576, "grad_norm": 1.1311936378479004, "learning_rate": 1.7863160030632885e-05, "loss": 0.089, "step": 66625 }, { "epoch": 0.9815761181720446, "grad_norm": 1.32826828956604, "learning_rate": 1.78615236586659e-05, "loss": 0.0815, "step": 66650 }, { "epoch": 0.9819443012621316, "grad_norm": 1.7389458417892456, "learning_rate": 1.7859887286698917e-05, "loss": 0.0937, "step": 66675 }, { "epoch": 0.9823124843522186, "grad_norm": 1.0361177921295166, "learning_rate": 1.785825091473193e-05, "loss": 0.0806, "step": 66700 }, { "epoch": 0.9826806674423058, "grad_norm": 2.10971736907959, "learning_rate": 1.7856614542764946e-05, "loss": 0.0875, "step": 66725 }, { "epoch": 0.9830488505323928, "grad_norm": 1.6492832899093628, "learning_rate": 1.785497817079796e-05, "loss": 0.0873, "step": 66750 }, { "epoch": 0.9834170336224798, "grad_norm": 1.559989333152771, "learning_rate": 1.7853341798830975e-05, "loss": 0.0802, "step": 66775 }, { "epoch": 0.9837852167125668, "grad_norm": 1.7486131191253662, "learning_rate": 1.7851705426863993e-05, "loss": 0.0851, "step": 66800 }, { "epoch": 0.9841533998026538, "grad_norm": 1.6348661184310913, "learning_rate": 1.7850069054897007e-05, "loss": 0.0916, "step": 66825 }, { "epoch": 0.9845215828927409, "grad_norm": 1.8323534727096558, "learning_rate": 1.7848432682930025e-05, "loss": 0.083, "step": 66850 }, { "epoch": 0.984889765982828, "grad_norm": 1.3669302463531494, "learning_rate": 1.784679631096304e-05, "loss": 0.0842, "step": 66875 }, { "epoch": 0.985257949072915, "grad_norm": 0.9307335019111633, "learning_rate": 1.7845159938996054e-05, "loss": 0.086, "step": 66900 }, { "epoch": 0.985626132163002, "grad_norm": 1.380886435508728, "learning_rate": 1.7843523567029072e-05, "loss": 0.0897, "step": 66925 }, { "epoch": 0.985994315253089, "grad_norm": 1.7508845329284668, "learning_rate": 1.7841887195062086e-05, "loss": 0.0929, "step": 66950 }, { "epoch": 0.9863624983431761, "grad_norm": 1.377454161643982, "learning_rate": 1.78402508230951e-05, "loss": 0.0928, "step": 66975 }, { "epoch": 0.9867306814332631, "grad_norm": 1.450796127319336, "learning_rate": 1.7838614451128115e-05, "loss": 0.0774, "step": 67000 }, { "epoch": 0.9870988645233502, "grad_norm": 1.3448824882507324, "learning_rate": 1.783697807916113e-05, "loss": 0.0833, "step": 67025 }, { "epoch": 0.9874670476134372, "grad_norm": 1.3844190835952759, "learning_rate": 1.7835341707194148e-05, "loss": 0.0846, "step": 67050 }, { "epoch": 0.9878352307035243, "grad_norm": 1.8527183532714844, "learning_rate": 1.7833705335227162e-05, "loss": 0.0908, "step": 67075 }, { "epoch": 0.9882034137936113, "grad_norm": 1.5654221773147583, "learning_rate": 1.783206896326018e-05, "loss": 0.0989, "step": 67100 }, { "epoch": 0.9885715968836983, "grad_norm": 1.2591010332107544, "learning_rate": 1.7830432591293194e-05, "loss": 0.0745, "step": 67125 }, { "epoch": 0.9889397799737853, "grad_norm": 1.2417330741882324, "learning_rate": 1.782879621932621e-05, "loss": 0.0917, "step": 67150 }, { "epoch": 0.9893079630638724, "grad_norm": 1.194953441619873, "learning_rate": 1.7827159847359223e-05, "loss": 0.087, "step": 67175 }, { "epoch": 0.9896761461539595, "grad_norm": 1.2350224256515503, "learning_rate": 1.7825523475392238e-05, "loss": 0.0908, "step": 67200 }, { "epoch": 0.9900443292440465, "grad_norm": 1.223141074180603, "learning_rate": 1.7823887103425255e-05, "loss": 0.0784, "step": 67225 }, { "epoch": 0.9904125123341335, "grad_norm": 1.784192442893982, "learning_rate": 1.782225073145827e-05, "loss": 0.0864, "step": 67250 }, { "epoch": 0.9907806954242205, "grad_norm": 1.5929571390151978, "learning_rate": 1.7820614359491288e-05, "loss": 0.0861, "step": 67275 }, { "epoch": 0.9911488785143076, "grad_norm": 1.3482542037963867, "learning_rate": 1.7818977987524302e-05, "loss": 0.0824, "step": 67300 }, { "epoch": 0.9915170616043947, "grad_norm": 1.104810118675232, "learning_rate": 1.7817341615557317e-05, "loss": 0.0865, "step": 67325 }, { "epoch": 0.9918852446944817, "grad_norm": 1.4847999811172485, "learning_rate": 1.7815705243590335e-05, "loss": 0.0799, "step": 67350 }, { "epoch": 0.9922534277845687, "grad_norm": 1.5398287773132324, "learning_rate": 1.781406887162335e-05, "loss": 0.0859, "step": 67375 }, { "epoch": 0.9926216108746557, "grad_norm": 1.618699312210083, "learning_rate": 1.7812432499656363e-05, "loss": 0.0857, "step": 67400 }, { "epoch": 0.9929897939647427, "grad_norm": 1.1916437149047852, "learning_rate": 1.7810796127689378e-05, "loss": 0.0919, "step": 67425 }, { "epoch": 0.9933579770548299, "grad_norm": 1.6624246835708618, "learning_rate": 1.7809159755722392e-05, "loss": 0.0919, "step": 67450 }, { "epoch": 0.9937261601449169, "grad_norm": 1.8165644407272339, "learning_rate": 1.780752338375541e-05, "loss": 0.0928, "step": 67475 }, { "epoch": 0.9940943432350039, "grad_norm": 1.5858564376831055, "learning_rate": 1.7805887011788425e-05, "loss": 0.0841, "step": 67500 }, { "epoch": 0.9944625263250909, "grad_norm": 2.407370090484619, "learning_rate": 1.7804316094700122e-05, "loss": 0.085, "step": 67525 }, { "epoch": 0.9948307094151779, "grad_norm": 1.5399054288864136, "learning_rate": 1.7802679722733136e-05, "loss": 0.0854, "step": 67550 }, { "epoch": 0.995198892505265, "grad_norm": 1.4991439580917358, "learning_rate": 1.780104335076615e-05, "loss": 0.0901, "step": 67575 }, { "epoch": 0.9955670755953521, "grad_norm": 1.1912407875061035, "learning_rate": 1.7799406978799165e-05, "loss": 0.0825, "step": 67600 }, { "epoch": 0.9959352586854391, "grad_norm": 1.625110387802124, "learning_rate": 1.779777060683218e-05, "loss": 0.0841, "step": 67625 }, { "epoch": 0.9963034417755261, "grad_norm": 1.3171745538711548, "learning_rate": 1.7796134234865198e-05, "loss": 0.0845, "step": 67650 }, { "epoch": 0.9966716248656132, "grad_norm": 1.5322037935256958, "learning_rate": 1.7794497862898212e-05, "loss": 0.0898, "step": 67675 }, { "epoch": 0.9970398079557002, "grad_norm": 1.6718095541000366, "learning_rate": 1.7792861490931227e-05, "loss": 0.0889, "step": 67700 }, { "epoch": 0.9974079910457873, "grad_norm": 1.542233943939209, "learning_rate": 1.7791225118964244e-05, "loss": 0.0843, "step": 67725 }, { "epoch": 0.9977761741358743, "grad_norm": 0.9952422976493835, "learning_rate": 1.778958874699726e-05, "loss": 0.0787, "step": 67750 }, { "epoch": 0.9981443572259613, "grad_norm": 1.5124484300613403, "learning_rate": 1.7787952375030277e-05, "loss": 0.0894, "step": 67775 }, { "epoch": 0.9985125403160484, "grad_norm": 1.3759278059005737, "learning_rate": 1.778631600306329e-05, "loss": 0.0848, "step": 67800 }, { "epoch": 0.9988807234061354, "grad_norm": 1.011905550956726, "learning_rate": 1.7784679631096306e-05, "loss": 0.0884, "step": 67825 }, { "epoch": 0.9992489064962224, "grad_norm": 1.3664641380310059, "learning_rate": 1.778304325912932e-05, "loss": 0.0914, "step": 67850 }, { "epoch": 0.9996170895863095, "grad_norm": 1.09990656375885, "learning_rate": 1.7781406887162335e-05, "loss": 0.0857, "step": 67875 }, { "epoch": 0.9999852726763965, "grad_norm": 1.1419481039047241, "learning_rate": 1.7779770515195352e-05, "loss": 0.0888, "step": 67900 }, { "epoch": 1.0, "eval_loss": 0.0816650465130806, "eval_runtime": 118.5794, "eval_samples_per_second": 2991.651, "eval_steps_per_second": 5.844, "step": 67901 }, { "epoch": 1.0003534557664835, "grad_norm": 1.6040254831314087, "learning_rate": 1.7778134143228367e-05, "loss": 0.0793, "step": 67925 }, { "epoch": 1.0007216388565705, "grad_norm": 1.2348227500915527, "learning_rate": 1.777649777126138e-05, "loss": 0.0798, "step": 67950 }, { "epoch": 1.0010898219466577, "grad_norm": 1.301466703414917, "learning_rate": 1.77748613992944e-05, "loss": 0.0873, "step": 67975 }, { "epoch": 1.0014580050367448, "grad_norm": 1.350974202156067, "learning_rate": 1.7773225027327414e-05, "loss": 0.0802, "step": 68000 }, { "epoch": 1.0018261881268318, "grad_norm": 1.2824699878692627, "learning_rate": 1.7771588655360428e-05, "loss": 0.0835, "step": 68025 }, { "epoch": 1.0021943712169188, "grad_norm": 1.5551531314849854, "learning_rate": 1.7769952283393442e-05, "loss": 0.0801, "step": 68050 }, { "epoch": 1.0025625543070058, "grad_norm": 1.713180422782898, "learning_rate": 1.776831591142646e-05, "loss": 0.0776, "step": 68075 }, { "epoch": 1.0029307373970928, "grad_norm": 1.6707239151000977, "learning_rate": 1.7766679539459475e-05, "loss": 0.0783, "step": 68100 }, { "epoch": 1.0032989204871798, "grad_norm": 1.5839450359344482, "learning_rate": 1.776504316749249e-05, "loss": 0.0825, "step": 68125 }, { "epoch": 1.0036671035772669, "grad_norm": 1.555139422416687, "learning_rate": 1.7763406795525507e-05, "loss": 0.0818, "step": 68150 }, { "epoch": 1.0040352866673539, "grad_norm": 1.2419105768203735, "learning_rate": 1.776177042355852e-05, "loss": 0.088, "step": 68175 }, { "epoch": 1.0044034697574409, "grad_norm": 1.6057548522949219, "learning_rate": 1.7760134051591536e-05, "loss": 0.08, "step": 68200 }, { "epoch": 1.0047716528475281, "grad_norm": 1.7384859323501587, "learning_rate": 1.775849767962455e-05, "loss": 0.09, "step": 68225 }, { "epoch": 1.0051398359376151, "grad_norm": 1.2435907125473022, "learning_rate": 1.775686130765757e-05, "loss": 0.0886, "step": 68250 }, { "epoch": 1.0055080190277021, "grad_norm": 1.2714653015136719, "learning_rate": 1.7755224935690583e-05, "loss": 0.0875, "step": 68275 }, { "epoch": 1.0058762021177892, "grad_norm": 1.2328994274139404, "learning_rate": 1.7753588563723597e-05, "loss": 0.0823, "step": 68300 }, { "epoch": 1.0062443852078762, "grad_norm": 1.724279522895813, "learning_rate": 1.7751952191756615e-05, "loss": 0.0854, "step": 68325 }, { "epoch": 1.0066125682979632, "grad_norm": 1.153212547302246, "learning_rate": 1.775031581978963e-05, "loss": 0.0863, "step": 68350 }, { "epoch": 1.0069807513880502, "grad_norm": 2.0599777698516846, "learning_rate": 1.7748679447822644e-05, "loss": 0.0938, "step": 68375 }, { "epoch": 1.0073489344781372, "grad_norm": 1.7947320938110352, "learning_rate": 1.7747043075855662e-05, "loss": 0.0927, "step": 68400 }, { "epoch": 1.0077171175682242, "grad_norm": 1.0751878023147583, "learning_rate": 1.7745406703888676e-05, "loss": 0.0726, "step": 68425 }, { "epoch": 1.0080853006583113, "grad_norm": 1.325618863105774, "learning_rate": 1.774377033192169e-05, "loss": 0.0879, "step": 68450 }, { "epoch": 1.0084534837483985, "grad_norm": 1.3592146635055542, "learning_rate": 1.7742133959954705e-05, "loss": 0.0828, "step": 68475 }, { "epoch": 1.0088216668384855, "grad_norm": 1.5241844654083252, "learning_rate": 1.7740497587987723e-05, "loss": 0.0823, "step": 68500 }, { "epoch": 1.0091898499285725, "grad_norm": 1.4818546772003174, "learning_rate": 1.7738861216020738e-05, "loss": 0.0853, "step": 68525 }, { "epoch": 1.0095580330186595, "grad_norm": 1.7624226808547974, "learning_rate": 1.7737224844053752e-05, "loss": 0.0855, "step": 68550 }, { "epoch": 1.0099262161087466, "grad_norm": 1.298776388168335, "learning_rate": 1.773558847208677e-05, "loss": 0.0828, "step": 68575 }, { "epoch": 1.0102943991988336, "grad_norm": 1.4680594205856323, "learning_rate": 1.7733952100119784e-05, "loss": 0.0848, "step": 68600 }, { "epoch": 1.0106625822889206, "grad_norm": 1.138354778289795, "learning_rate": 1.77323157281528e-05, "loss": 0.0773, "step": 68625 }, { "epoch": 1.0110307653790076, "grad_norm": 1.0077418088912964, "learning_rate": 1.7730679356185813e-05, "loss": 0.0752, "step": 68650 }, { "epoch": 1.0113989484690946, "grad_norm": 1.3636548519134521, "learning_rate": 1.772904298421883e-05, "loss": 0.0873, "step": 68675 }, { "epoch": 1.0117671315591819, "grad_norm": 1.248867154121399, "learning_rate": 1.7727406612251845e-05, "loss": 0.0876, "step": 68700 }, { "epoch": 1.0121353146492689, "grad_norm": 1.4871373176574707, "learning_rate": 1.772577024028486e-05, "loss": 0.0833, "step": 68725 }, { "epoch": 1.0125034977393559, "grad_norm": 1.800140142440796, "learning_rate": 1.7724133868317878e-05, "loss": 0.0859, "step": 68750 }, { "epoch": 1.012871680829443, "grad_norm": 0.9257559180259705, "learning_rate": 1.7722497496350892e-05, "loss": 0.08, "step": 68775 }, { "epoch": 1.01323986391953, "grad_norm": 1.123623013496399, "learning_rate": 1.7720861124383907e-05, "loss": 0.0806, "step": 68800 }, { "epoch": 1.013608047009617, "grad_norm": 1.6497175693511963, "learning_rate": 1.7719224752416925e-05, "loss": 0.0835, "step": 68825 }, { "epoch": 1.013976230099704, "grad_norm": 1.2564496994018555, "learning_rate": 1.771758838044994e-05, "loss": 0.081, "step": 68850 }, { "epoch": 1.014344413189791, "grad_norm": 1.5004905462265015, "learning_rate": 1.7715952008482953e-05, "loss": 0.0909, "step": 68875 }, { "epoch": 1.014712596279878, "grad_norm": 1.2909660339355469, "learning_rate": 1.7714315636515968e-05, "loss": 0.0894, "step": 68900 }, { "epoch": 1.015080779369965, "grad_norm": 1.0051658153533936, "learning_rate": 1.7712679264548986e-05, "loss": 0.0817, "step": 68925 }, { "epoch": 1.0154489624600522, "grad_norm": 1.2211277484893799, "learning_rate": 1.7711042892582e-05, "loss": 0.0824, "step": 68950 }, { "epoch": 1.0158171455501392, "grad_norm": 1.0578739643096924, "learning_rate": 1.7709406520615015e-05, "loss": 0.0795, "step": 68975 }, { "epoch": 1.0161853286402263, "grad_norm": 1.3825381994247437, "learning_rate": 1.7707770148648033e-05, "loss": 0.0873, "step": 69000 }, { "epoch": 1.0165535117303133, "grad_norm": 1.2576650381088257, "learning_rate": 1.7706133776681047e-05, "loss": 0.0814, "step": 69025 }, { "epoch": 1.0169216948204003, "grad_norm": 1.559657335281372, "learning_rate": 1.770449740471406e-05, "loss": 0.0871, "step": 69050 }, { "epoch": 1.0172898779104873, "grad_norm": 1.4857251644134521, "learning_rate": 1.7702861032747076e-05, "loss": 0.0843, "step": 69075 }, { "epoch": 1.0176580610005743, "grad_norm": 1.4493496417999268, "learning_rate": 1.770122466078009e-05, "loss": 0.0783, "step": 69100 }, { "epoch": 1.0180262440906613, "grad_norm": 1.2584834098815918, "learning_rate": 1.7699588288813108e-05, "loss": 0.0825, "step": 69125 }, { "epoch": 1.0183944271807484, "grad_norm": 1.5866835117340088, "learning_rate": 1.7697951916846123e-05, "loss": 0.0836, "step": 69150 }, { "epoch": 1.0187626102708356, "grad_norm": 1.3111428022384644, "learning_rate": 1.769631554487914e-05, "loss": 0.0884, "step": 69175 }, { "epoch": 1.0191307933609226, "grad_norm": 1.3243120908737183, "learning_rate": 1.7694679172912155e-05, "loss": 0.0822, "step": 69200 }, { "epoch": 1.0194989764510096, "grad_norm": 1.3820977210998535, "learning_rate": 1.769304280094517e-05, "loss": 0.0844, "step": 69225 }, { "epoch": 1.0198671595410966, "grad_norm": 1.5885424613952637, "learning_rate": 1.7691406428978187e-05, "loss": 0.083, "step": 69250 }, { "epoch": 1.0202353426311837, "grad_norm": 1.6160551309585571, "learning_rate": 1.7689770057011202e-05, "loss": 0.0848, "step": 69275 }, { "epoch": 1.0206035257212707, "grad_norm": 1.4503554105758667, "learning_rate": 1.7688133685044216e-05, "loss": 0.0839, "step": 69300 }, { "epoch": 1.0209717088113577, "grad_norm": 1.1684181690216064, "learning_rate": 1.768649731307723e-05, "loss": 0.087, "step": 69325 }, { "epoch": 1.0213398919014447, "grad_norm": 1.3307864665985107, "learning_rate": 1.7684860941110245e-05, "loss": 0.0859, "step": 69350 }, { "epoch": 1.0217080749915317, "grad_norm": 1.3071322441101074, "learning_rate": 1.7683224569143263e-05, "loss": 0.0763, "step": 69375 }, { "epoch": 1.0220762580816187, "grad_norm": 1.0993708372116089, "learning_rate": 1.7681588197176277e-05, "loss": 0.0864, "step": 69400 }, { "epoch": 1.022444441171706, "grad_norm": 1.5368435382843018, "learning_rate": 1.7679951825209295e-05, "loss": 0.0864, "step": 69425 }, { "epoch": 1.022812624261793, "grad_norm": 1.384029507637024, "learning_rate": 1.767831545324231e-05, "loss": 0.089, "step": 69450 }, { "epoch": 1.02318080735188, "grad_norm": 1.5836247205734253, "learning_rate": 1.7676679081275324e-05, "loss": 0.0772, "step": 69475 }, { "epoch": 1.023548990441967, "grad_norm": 1.1067153215408325, "learning_rate": 1.767504270930834e-05, "loss": 0.085, "step": 69500 }, { "epoch": 1.023917173532054, "grad_norm": 1.1940453052520752, "learning_rate": 1.7673406337341353e-05, "loss": 0.0732, "step": 69525 }, { "epoch": 1.024285356622141, "grad_norm": 1.5667295455932617, "learning_rate": 1.767176996537437e-05, "loss": 0.0862, "step": 69550 }, { "epoch": 1.024653539712228, "grad_norm": 1.4569140672683716, "learning_rate": 1.7670133593407385e-05, "loss": 0.0738, "step": 69575 }, { "epoch": 1.025021722802315, "grad_norm": 1.644558072090149, "learning_rate": 1.7668497221440403e-05, "loss": 0.0834, "step": 69600 }, { "epoch": 1.025389905892402, "grad_norm": 1.1172680854797363, "learning_rate": 1.7666860849473418e-05, "loss": 0.0767, "step": 69625 }, { "epoch": 1.025758088982489, "grad_norm": 1.9643030166625977, "learning_rate": 1.7665224477506432e-05, "loss": 0.0872, "step": 69650 }, { "epoch": 1.0261262720725763, "grad_norm": 1.4155508279800415, "learning_rate": 1.766358810553945e-05, "loss": 0.0851, "step": 69675 }, { "epoch": 1.0264944551626634, "grad_norm": 1.1935136318206787, "learning_rate": 1.766195173357246e-05, "loss": 0.0912, "step": 69700 }, { "epoch": 1.0268626382527504, "grad_norm": 1.1054178476333618, "learning_rate": 1.766031536160548e-05, "loss": 0.0901, "step": 69725 }, { "epoch": 1.0272308213428374, "grad_norm": 1.4529372453689575, "learning_rate": 1.7658678989638493e-05, "loss": 0.0866, "step": 69750 }, { "epoch": 1.0275990044329244, "grad_norm": 1.525184988975525, "learning_rate": 1.7657042617671508e-05, "loss": 0.0797, "step": 69775 }, { "epoch": 1.0279671875230114, "grad_norm": 1.60329008102417, "learning_rate": 1.7655406245704526e-05, "loss": 0.0835, "step": 69800 }, { "epoch": 1.0283353706130984, "grad_norm": 1.1393709182739258, "learning_rate": 1.765376987373754e-05, "loss": 0.0791, "step": 69825 }, { "epoch": 1.0287035537031854, "grad_norm": 1.6377661228179932, "learning_rate": 1.7652133501770558e-05, "loss": 0.0926, "step": 69850 }, { "epoch": 1.0290717367932725, "grad_norm": 1.4320484399795532, "learning_rate": 1.7650497129803572e-05, "loss": 0.0875, "step": 69875 }, { "epoch": 1.0294399198833597, "grad_norm": 1.3752083778381348, "learning_rate": 1.7648860757836587e-05, "loss": 0.0879, "step": 69900 }, { "epoch": 1.0298081029734467, "grad_norm": 1.7365537881851196, "learning_rate": 1.76472243858696e-05, "loss": 0.0893, "step": 69925 }, { "epoch": 1.0301762860635337, "grad_norm": 1.254590392112732, "learning_rate": 1.7645588013902616e-05, "loss": 0.0884, "step": 69950 }, { "epoch": 1.0305444691536207, "grad_norm": 1.0306187868118286, "learning_rate": 1.7643951641935634e-05, "loss": 0.0848, "step": 69975 }, { "epoch": 1.0309126522437078, "grad_norm": 1.7979035377502441, "learning_rate": 1.7642315269968648e-05, "loss": 0.0847, "step": 70000 }, { "epoch": 1.0312808353337948, "grad_norm": 1.0098897218704224, "learning_rate": 1.7640678898001663e-05, "loss": 0.0787, "step": 70025 }, { "epoch": 1.0316490184238818, "grad_norm": 1.1822419166564941, "learning_rate": 1.763904252603468e-05, "loss": 0.0811, "step": 70050 }, { "epoch": 1.0320172015139688, "grad_norm": 1.4940446615219116, "learning_rate": 1.7637406154067695e-05, "loss": 0.0788, "step": 70075 }, { "epoch": 1.0323853846040558, "grad_norm": 1.5025551319122314, "learning_rate": 1.7635769782100713e-05, "loss": 0.0937, "step": 70100 }, { "epoch": 1.0327535676941428, "grad_norm": 1.3301732540130615, "learning_rate": 1.7634133410133724e-05, "loss": 0.0884, "step": 70125 }, { "epoch": 1.03312175078423, "grad_norm": 1.4251512289047241, "learning_rate": 1.763249703816674e-05, "loss": 0.0762, "step": 70150 }, { "epoch": 1.033489933874317, "grad_norm": 1.9587931632995605, "learning_rate": 1.7630926121078435e-05, "loss": 0.0818, "step": 70175 }, { "epoch": 1.033858116964404, "grad_norm": 1.5725241899490356, "learning_rate": 1.762928974911145e-05, "loss": 0.0823, "step": 70200 }, { "epoch": 1.0342263000544911, "grad_norm": 1.236335277557373, "learning_rate": 1.7627653377144468e-05, "loss": 0.0823, "step": 70225 }, { "epoch": 1.0345944831445781, "grad_norm": 1.218507170677185, "learning_rate": 1.7626017005177482e-05, "loss": 0.0901, "step": 70250 }, { "epoch": 1.0349626662346652, "grad_norm": 1.6564546823501587, "learning_rate": 1.7624380633210497e-05, "loss": 0.0835, "step": 70275 }, { "epoch": 1.0353308493247522, "grad_norm": 1.8725467920303345, "learning_rate": 1.7622744261243515e-05, "loss": 0.0946, "step": 70300 }, { "epoch": 1.0356990324148392, "grad_norm": 1.1960952281951904, "learning_rate": 1.762110788927653e-05, "loss": 0.0816, "step": 70325 }, { "epoch": 1.0360672155049262, "grad_norm": 2.004316568374634, "learning_rate": 1.7619471517309543e-05, "loss": 0.0818, "step": 70350 }, { "epoch": 1.0364353985950134, "grad_norm": 1.5343608856201172, "learning_rate": 1.7617835145342558e-05, "loss": 0.0787, "step": 70375 }, { "epoch": 1.0368035816851005, "grad_norm": 1.4142705202102661, "learning_rate": 1.7616198773375576e-05, "loss": 0.0727, "step": 70400 }, { "epoch": 1.0371717647751875, "grad_norm": 1.3772735595703125, "learning_rate": 1.761456240140859e-05, "loss": 0.0798, "step": 70425 }, { "epoch": 1.0375399478652745, "grad_norm": 1.4325580596923828, "learning_rate": 1.7612926029441605e-05, "loss": 0.0852, "step": 70450 }, { "epoch": 1.0379081309553615, "grad_norm": 2.0144031047821045, "learning_rate": 1.7611289657474623e-05, "loss": 0.0864, "step": 70475 }, { "epoch": 1.0382763140454485, "grad_norm": 1.111639142036438, "learning_rate": 1.7609653285507637e-05, "loss": 0.0891, "step": 70500 }, { "epoch": 1.0386444971355355, "grad_norm": 1.2016462087631226, "learning_rate": 1.760801691354065e-05, "loss": 0.0804, "step": 70525 }, { "epoch": 1.0390126802256225, "grad_norm": 1.6934276819229126, "learning_rate": 1.7606380541573666e-05, "loss": 0.0845, "step": 70550 }, { "epoch": 1.0393808633157096, "grad_norm": 1.9183427095413208, "learning_rate": 1.7604744169606684e-05, "loss": 0.0774, "step": 70575 }, { "epoch": 1.0397490464057966, "grad_norm": 1.5269534587860107, "learning_rate": 1.7603107797639698e-05, "loss": 0.0774, "step": 70600 }, { "epoch": 1.0401172294958838, "grad_norm": 1.3354527950286865, "learning_rate": 1.7601471425672713e-05, "loss": 0.0883, "step": 70625 }, { "epoch": 1.0404854125859708, "grad_norm": 1.8077341318130493, "learning_rate": 1.759983505370573e-05, "loss": 0.0903, "step": 70650 }, { "epoch": 1.0408535956760578, "grad_norm": 1.3553496599197388, "learning_rate": 1.7598198681738745e-05, "loss": 0.0838, "step": 70675 }, { "epoch": 1.0412217787661449, "grad_norm": 1.5021089315414429, "learning_rate": 1.759656230977176e-05, "loss": 0.0766, "step": 70700 }, { "epoch": 1.0415899618562319, "grad_norm": 1.4155638217926025, "learning_rate": 1.7594925937804777e-05, "loss": 0.0817, "step": 70725 }, { "epoch": 1.0419581449463189, "grad_norm": 1.2288072109222412, "learning_rate": 1.7593289565837792e-05, "loss": 0.0798, "step": 70750 }, { "epoch": 1.042326328036406, "grad_norm": 0.9162379503250122, "learning_rate": 1.7591653193870806e-05, "loss": 0.0872, "step": 70775 }, { "epoch": 1.042694511126493, "grad_norm": 1.4179303646087646, "learning_rate": 1.759001682190382e-05, "loss": 0.0843, "step": 70800 }, { "epoch": 1.04306269421658, "grad_norm": 1.0597056150436401, "learning_rate": 1.758838044993684e-05, "loss": 0.0728, "step": 70825 }, { "epoch": 1.0434308773066672, "grad_norm": 1.3661811351776123, "learning_rate": 1.7586744077969853e-05, "loss": 0.0889, "step": 70850 }, { "epoch": 1.0437990603967542, "grad_norm": 1.0863116979599, "learning_rate": 1.7585107706002867e-05, "loss": 0.0838, "step": 70875 }, { "epoch": 1.0441672434868412, "grad_norm": 1.4826573133468628, "learning_rate": 1.7583471334035885e-05, "loss": 0.0828, "step": 70900 }, { "epoch": 1.0445354265769282, "grad_norm": 1.5765396356582642, "learning_rate": 1.75818349620689e-05, "loss": 0.0752, "step": 70925 }, { "epoch": 1.0449036096670152, "grad_norm": 1.3515713214874268, "learning_rate": 1.7580198590101914e-05, "loss": 0.0849, "step": 70950 }, { "epoch": 1.0452717927571022, "grad_norm": 1.5682932138442993, "learning_rate": 1.757856221813493e-05, "loss": 0.0897, "step": 70975 }, { "epoch": 1.0456399758471893, "grad_norm": 1.2009916305541992, "learning_rate": 1.7576925846167946e-05, "loss": 0.0829, "step": 71000 }, { "epoch": 1.0460081589372763, "grad_norm": 1.3471386432647705, "learning_rate": 1.757528947420096e-05, "loss": 0.0868, "step": 71025 }, { "epoch": 1.0463763420273633, "grad_norm": 1.3757349252700806, "learning_rate": 1.7573653102233975e-05, "loss": 0.0831, "step": 71050 }, { "epoch": 1.0467445251174503, "grad_norm": 1.463050365447998, "learning_rate": 1.7572016730266993e-05, "loss": 0.0843, "step": 71075 }, { "epoch": 1.0471127082075375, "grad_norm": 1.5152462720870972, "learning_rate": 1.7570380358300008e-05, "loss": 0.0804, "step": 71100 }, { "epoch": 1.0474808912976246, "grad_norm": 1.9172595739364624, "learning_rate": 1.7568743986333022e-05, "loss": 0.0832, "step": 71125 }, { "epoch": 1.0478490743877116, "grad_norm": 1.480907678604126, "learning_rate": 1.756710761436604e-05, "loss": 0.0866, "step": 71150 }, { "epoch": 1.0482172574777986, "grad_norm": 1.302281379699707, "learning_rate": 1.756547124239905e-05, "loss": 0.0869, "step": 71175 }, { "epoch": 1.0485854405678856, "grad_norm": 1.1208454370498657, "learning_rate": 1.756383487043207e-05, "loss": 0.082, "step": 71200 }, { "epoch": 1.0489536236579726, "grad_norm": 1.2618451118469238, "learning_rate": 1.7562198498465083e-05, "loss": 0.0792, "step": 71225 }, { "epoch": 1.0493218067480596, "grad_norm": 1.8567328453063965, "learning_rate": 1.75605621264981e-05, "loss": 0.0776, "step": 71250 }, { "epoch": 1.0496899898381467, "grad_norm": 1.074665904045105, "learning_rate": 1.7558925754531116e-05, "loss": 0.0841, "step": 71275 }, { "epoch": 1.0500581729282337, "grad_norm": 1.6876140832901, "learning_rate": 1.755728938256413e-05, "loss": 0.0893, "step": 71300 }, { "epoch": 1.0504263560183207, "grad_norm": 1.3799030780792236, "learning_rate": 1.7555653010597148e-05, "loss": 0.0805, "step": 71325 }, { "epoch": 1.050794539108408, "grad_norm": 1.2006375789642334, "learning_rate": 1.7554016638630162e-05, "loss": 0.0786, "step": 71350 }, { "epoch": 1.051162722198495, "grad_norm": 1.3169827461242676, "learning_rate": 1.7552380266663177e-05, "loss": 0.0804, "step": 71375 }, { "epoch": 1.051530905288582, "grad_norm": 1.6053396463394165, "learning_rate": 1.755074389469619e-05, "loss": 0.0789, "step": 71400 }, { "epoch": 1.051899088378669, "grad_norm": 1.2583928108215332, "learning_rate": 1.7549107522729206e-05, "loss": 0.0794, "step": 71425 }, { "epoch": 1.052267271468756, "grad_norm": 0.9673905968666077, "learning_rate": 1.7547471150762224e-05, "loss": 0.0809, "step": 71450 }, { "epoch": 1.052635454558843, "grad_norm": 1.3871254920959473, "learning_rate": 1.7545834778795238e-05, "loss": 0.0757, "step": 71475 }, { "epoch": 1.05300363764893, "grad_norm": 1.766830325126648, "learning_rate": 1.7544198406828256e-05, "loss": 0.0769, "step": 71500 }, { "epoch": 1.053371820739017, "grad_norm": 1.335160732269287, "learning_rate": 1.754256203486127e-05, "loss": 0.0789, "step": 71525 }, { "epoch": 1.053740003829104, "grad_norm": 1.3315373659133911, "learning_rate": 1.7540925662894285e-05, "loss": 0.0766, "step": 71550 }, { "epoch": 1.0541081869191913, "grad_norm": 1.4730496406555176, "learning_rate": 1.7539289290927303e-05, "loss": 0.0855, "step": 71575 }, { "epoch": 1.0544763700092783, "grad_norm": 2.104713201522827, "learning_rate": 1.7537652918960314e-05, "loss": 0.0805, "step": 71600 }, { "epoch": 1.0548445530993653, "grad_norm": 1.2803919315338135, "learning_rate": 1.753601654699333e-05, "loss": 0.0746, "step": 71625 }, { "epoch": 1.0552127361894523, "grad_norm": 1.606890082359314, "learning_rate": 1.7534380175026346e-05, "loss": 0.0864, "step": 71650 }, { "epoch": 1.0555809192795393, "grad_norm": 1.3688819408416748, "learning_rate": 1.753274380305936e-05, "loss": 0.0784, "step": 71675 }, { "epoch": 1.0559491023696264, "grad_norm": 1.2413685321807861, "learning_rate": 1.753110743109238e-05, "loss": 0.0812, "step": 71700 }, { "epoch": 1.0563172854597134, "grad_norm": 1.475354552268982, "learning_rate": 1.7529471059125393e-05, "loss": 0.0817, "step": 71725 }, { "epoch": 1.0566854685498004, "grad_norm": 1.1470940113067627, "learning_rate": 1.752783468715841e-05, "loss": 0.0857, "step": 71750 }, { "epoch": 1.0570536516398874, "grad_norm": 1.1003742218017578, "learning_rate": 1.7526198315191425e-05, "loss": 0.0847, "step": 71775 }, { "epoch": 1.0574218347299744, "grad_norm": 1.3824670314788818, "learning_rate": 1.752456194322444e-05, "loss": 0.0797, "step": 71800 }, { "epoch": 1.0577900178200617, "grad_norm": 1.7201858758926392, "learning_rate": 1.7522925571257454e-05, "loss": 0.0772, "step": 71825 }, { "epoch": 1.0581582009101487, "grad_norm": 1.4946352243423462, "learning_rate": 1.752128919929047e-05, "loss": 0.0832, "step": 71850 }, { "epoch": 1.0585263840002357, "grad_norm": 1.4284608364105225, "learning_rate": 1.7519652827323486e-05, "loss": 0.0807, "step": 71875 }, { "epoch": 1.0588945670903227, "grad_norm": 1.5651804208755493, "learning_rate": 1.75180164553565e-05, "loss": 0.0783, "step": 71900 }, { "epoch": 1.0592627501804097, "grad_norm": 1.2496366500854492, "learning_rate": 1.751638008338952e-05, "loss": 0.0824, "step": 71925 }, { "epoch": 1.0596309332704967, "grad_norm": 1.242274284362793, "learning_rate": 1.7514743711422533e-05, "loss": 0.0781, "step": 71950 }, { "epoch": 1.0599991163605837, "grad_norm": 1.537037968635559, "learning_rate": 1.7513107339455548e-05, "loss": 0.0735, "step": 71975 }, { "epoch": 1.0603672994506708, "grad_norm": 1.0832239389419556, "learning_rate": 1.7511470967488565e-05, "loss": 0.0779, "step": 72000 }, { "epoch": 1.0607354825407578, "grad_norm": 1.5025373697280884, "learning_rate": 1.7509834595521576e-05, "loss": 0.0829, "step": 72025 }, { "epoch": 1.0611036656308448, "grad_norm": 1.574141025543213, "learning_rate": 1.7508198223554594e-05, "loss": 0.0818, "step": 72050 }, { "epoch": 1.061471848720932, "grad_norm": 1.5251744985580444, "learning_rate": 1.750656185158761e-05, "loss": 0.0754, "step": 72075 }, { "epoch": 1.061840031811019, "grad_norm": 1.8047915697097778, "learning_rate": 1.7504925479620623e-05, "loss": 0.0796, "step": 72100 }, { "epoch": 1.062208214901106, "grad_norm": 1.5582528114318848, "learning_rate": 1.750328910765364e-05, "loss": 0.0867, "step": 72125 }, { "epoch": 1.062576397991193, "grad_norm": 1.557234287261963, "learning_rate": 1.7501652735686656e-05, "loss": 0.081, "step": 72150 }, { "epoch": 1.06294458108128, "grad_norm": 2.2205381393432617, "learning_rate": 1.7500016363719673e-05, "loss": 0.0816, "step": 72175 }, { "epoch": 1.063312764171367, "grad_norm": 1.7665126323699951, "learning_rate": 1.7498379991752688e-05, "loss": 0.0752, "step": 72200 }, { "epoch": 1.0636809472614541, "grad_norm": 1.6299313306808472, "learning_rate": 1.7496743619785702e-05, "loss": 0.0829, "step": 72225 }, { "epoch": 1.0640491303515411, "grad_norm": 1.3616055250167847, "learning_rate": 1.7495107247818717e-05, "loss": 0.076, "step": 72250 }, { "epoch": 1.0644173134416282, "grad_norm": 1.0546361207962036, "learning_rate": 1.749347087585173e-05, "loss": 0.0764, "step": 72275 }, { "epoch": 1.0647854965317154, "grad_norm": 1.6121575832366943, "learning_rate": 1.749183450388475e-05, "loss": 0.0845, "step": 72300 }, { "epoch": 1.0651536796218024, "grad_norm": 1.368516206741333, "learning_rate": 1.7490198131917764e-05, "loss": 0.0795, "step": 72325 }, { "epoch": 1.0655218627118894, "grad_norm": 1.8296114206314087, "learning_rate": 1.7488561759950778e-05, "loss": 0.0835, "step": 72350 }, { "epoch": 1.0658900458019764, "grad_norm": 1.780468463897705, "learning_rate": 1.7486925387983796e-05, "loss": 0.0869, "step": 72375 }, { "epoch": 1.0662582288920635, "grad_norm": 1.5264418125152588, "learning_rate": 1.748528901601681e-05, "loss": 0.0816, "step": 72400 }, { "epoch": 1.0666264119821505, "grad_norm": 1.9545817375183105, "learning_rate": 1.7483718098928508e-05, "loss": 0.0909, "step": 72425 }, { "epoch": 1.0669945950722375, "grad_norm": 1.7828044891357422, "learning_rate": 1.748208172696152e-05, "loss": 0.08, "step": 72450 }, { "epoch": 1.0673627781623245, "grad_norm": 1.3515057563781738, "learning_rate": 1.7480445354994536e-05, "loss": 0.0847, "step": 72475 }, { "epoch": 1.0677309612524115, "grad_norm": 1.1792912483215332, "learning_rate": 1.747880898302755e-05, "loss": 0.0793, "step": 72500 }, { "epoch": 1.0680991443424985, "grad_norm": 1.4954952001571655, "learning_rate": 1.7477172611060565e-05, "loss": 0.0868, "step": 72525 }, { "epoch": 1.0684673274325858, "grad_norm": 1.7793201208114624, "learning_rate": 1.7475536239093583e-05, "loss": 0.0758, "step": 72550 }, { "epoch": 1.0688355105226728, "grad_norm": 1.258103847503662, "learning_rate": 1.7473899867126598e-05, "loss": 0.081, "step": 72575 }, { "epoch": 1.0692036936127598, "grad_norm": 0.9954408407211304, "learning_rate": 1.7472263495159612e-05, "loss": 0.0834, "step": 72600 }, { "epoch": 1.0695718767028468, "grad_norm": 1.409990668296814, "learning_rate": 1.747062712319263e-05, "loss": 0.0819, "step": 72625 }, { "epoch": 1.0699400597929338, "grad_norm": 0.9933326244354248, "learning_rate": 1.7468990751225644e-05, "loss": 0.0728, "step": 72650 }, { "epoch": 1.0703082428830208, "grad_norm": 1.8262516260147095, "learning_rate": 1.746735437925866e-05, "loss": 0.0837, "step": 72675 }, { "epoch": 1.0706764259731079, "grad_norm": 1.5246831178665161, "learning_rate": 1.7465718007291673e-05, "loss": 0.0867, "step": 72700 }, { "epoch": 1.0710446090631949, "grad_norm": 1.4583308696746826, "learning_rate": 1.746408163532469e-05, "loss": 0.0815, "step": 72725 }, { "epoch": 1.071412792153282, "grad_norm": 1.1462247371673584, "learning_rate": 1.7462445263357706e-05, "loss": 0.0849, "step": 72750 }, { "epoch": 1.0717809752433691, "grad_norm": 1.4642852544784546, "learning_rate": 1.746080889139072e-05, "loss": 0.0826, "step": 72775 }, { "epoch": 1.0721491583334561, "grad_norm": 1.5686918497085571, "learning_rate": 1.7459172519423738e-05, "loss": 0.0756, "step": 72800 }, { "epoch": 1.0725173414235432, "grad_norm": 1.3278517723083496, "learning_rate": 1.7457536147456752e-05, "loss": 0.0816, "step": 72825 }, { "epoch": 1.0728855245136302, "grad_norm": 1.6725236177444458, "learning_rate": 1.7455899775489767e-05, "loss": 0.0771, "step": 72850 }, { "epoch": 1.0732537076037172, "grad_norm": 1.555825114250183, "learning_rate": 1.745426340352278e-05, "loss": 0.08, "step": 72875 }, { "epoch": 1.0736218906938042, "grad_norm": 1.4804408550262451, "learning_rate": 1.74526270315558e-05, "loss": 0.081, "step": 72900 }, { "epoch": 1.0739900737838912, "grad_norm": 1.2189922332763672, "learning_rate": 1.7450990659588814e-05, "loss": 0.0677, "step": 72925 }, { "epoch": 1.0743582568739782, "grad_norm": 1.2289925813674927, "learning_rate": 1.7449354287621828e-05, "loss": 0.0749, "step": 72950 }, { "epoch": 1.0747264399640652, "grad_norm": 1.0798590183258057, "learning_rate": 1.7447717915654846e-05, "loss": 0.0846, "step": 72975 }, { "epoch": 1.0750946230541523, "grad_norm": 1.6045472621917725, "learning_rate": 1.744608154368786e-05, "loss": 0.0822, "step": 73000 }, { "epoch": 1.0754628061442395, "grad_norm": 1.045710563659668, "learning_rate": 1.7444445171720875e-05, "loss": 0.0757, "step": 73025 }, { "epoch": 1.0758309892343265, "grad_norm": 1.7396941184997559, "learning_rate": 1.7442808799753893e-05, "loss": 0.087, "step": 73050 }, { "epoch": 1.0761991723244135, "grad_norm": 1.4701290130615234, "learning_rate": 1.7441172427786904e-05, "loss": 0.0876, "step": 73075 }, { "epoch": 1.0765673554145005, "grad_norm": 1.0782220363616943, "learning_rate": 1.743953605581992e-05, "loss": 0.0839, "step": 73100 }, { "epoch": 1.0769355385045876, "grad_norm": 1.6876006126403809, "learning_rate": 1.7437899683852936e-05, "loss": 0.0782, "step": 73125 }, { "epoch": 1.0773037215946746, "grad_norm": 1.6829878091812134, "learning_rate": 1.7436263311885954e-05, "loss": 0.0791, "step": 73150 }, { "epoch": 1.0776719046847616, "grad_norm": 1.1190736293792725, "learning_rate": 1.743462693991897e-05, "loss": 0.0823, "step": 73175 }, { "epoch": 1.0780400877748486, "grad_norm": 1.127810001373291, "learning_rate": 1.7432990567951983e-05, "loss": 0.0718, "step": 73200 }, { "epoch": 1.0784082708649356, "grad_norm": 1.4211413860321045, "learning_rate": 1.7431354195985e-05, "loss": 0.0825, "step": 73225 }, { "epoch": 1.0787764539550229, "grad_norm": 1.359724760055542, "learning_rate": 1.7429717824018015e-05, "loss": 0.0859, "step": 73250 }, { "epoch": 1.0791446370451099, "grad_norm": 1.8260948657989502, "learning_rate": 1.742808145205103e-05, "loss": 0.0819, "step": 73275 }, { "epoch": 1.079512820135197, "grad_norm": 1.6171700954437256, "learning_rate": 1.7426445080084044e-05, "loss": 0.0749, "step": 73300 }, { "epoch": 1.079881003225284, "grad_norm": 1.9867669343948364, "learning_rate": 1.7424808708117062e-05, "loss": 0.0858, "step": 73325 }, { "epoch": 1.080249186315371, "grad_norm": 1.6073391437530518, "learning_rate": 1.7423172336150076e-05, "loss": 0.0897, "step": 73350 }, { "epoch": 1.080617369405458, "grad_norm": 1.0877889394760132, "learning_rate": 1.742153596418309e-05, "loss": 0.0748, "step": 73375 }, { "epoch": 1.080985552495545, "grad_norm": 1.6490763425827026, "learning_rate": 1.741989959221611e-05, "loss": 0.086, "step": 73400 }, { "epoch": 1.081353735585632, "grad_norm": 1.2874717712402344, "learning_rate": 1.7418263220249123e-05, "loss": 0.079, "step": 73425 }, { "epoch": 1.081721918675719, "grad_norm": 1.2430524826049805, "learning_rate": 1.7416626848282138e-05, "loss": 0.0806, "step": 73450 }, { "epoch": 1.082090101765806, "grad_norm": 1.2100410461425781, "learning_rate": 1.7414990476315155e-05, "loss": 0.0776, "step": 73475 }, { "epoch": 1.0824582848558932, "grad_norm": 1.1482142210006714, "learning_rate": 1.7413354104348167e-05, "loss": 0.0781, "step": 73500 }, { "epoch": 1.0828264679459803, "grad_norm": 1.4018893241882324, "learning_rate": 1.7411717732381184e-05, "loss": 0.0784, "step": 73525 }, { "epoch": 1.0831946510360673, "grad_norm": 1.5947740077972412, "learning_rate": 1.74100813604142e-05, "loss": 0.0759, "step": 73550 }, { "epoch": 1.0835628341261543, "grad_norm": 1.4407312870025635, "learning_rate": 1.7408444988447217e-05, "loss": 0.0718, "step": 73575 }, { "epoch": 1.0839310172162413, "grad_norm": 1.9396814107894897, "learning_rate": 1.740680861648023e-05, "loss": 0.0775, "step": 73600 }, { "epoch": 1.0842992003063283, "grad_norm": 1.4494119882583618, "learning_rate": 1.7405172244513246e-05, "loss": 0.081, "step": 73625 }, { "epoch": 1.0846673833964153, "grad_norm": 1.6604697704315186, "learning_rate": 1.7403535872546263e-05, "loss": 0.078, "step": 73650 }, { "epoch": 1.0850355664865023, "grad_norm": 1.3195675611495972, "learning_rate": 1.7401899500579278e-05, "loss": 0.0791, "step": 73675 }, { "epoch": 1.0854037495765894, "grad_norm": 1.4423450231552124, "learning_rate": 1.7400263128612292e-05, "loss": 0.0791, "step": 73700 }, { "epoch": 1.0857719326666766, "grad_norm": 1.245192289352417, "learning_rate": 1.7398626756645307e-05, "loss": 0.086, "step": 73725 }, { "epoch": 1.0861401157567636, "grad_norm": 1.1297852993011475, "learning_rate": 1.739699038467832e-05, "loss": 0.0861, "step": 73750 }, { "epoch": 1.0865082988468506, "grad_norm": 1.7305914163589478, "learning_rate": 1.739535401271134e-05, "loss": 0.0853, "step": 73775 }, { "epoch": 1.0868764819369376, "grad_norm": 1.5151560306549072, "learning_rate": 1.7393717640744354e-05, "loss": 0.0822, "step": 73800 }, { "epoch": 1.0872446650270247, "grad_norm": 1.3996646404266357, "learning_rate": 1.739208126877737e-05, "loss": 0.0816, "step": 73825 }, { "epoch": 1.0876128481171117, "grad_norm": 1.3975486755371094, "learning_rate": 1.7390444896810386e-05, "loss": 0.0831, "step": 73850 }, { "epoch": 1.0879810312071987, "grad_norm": 1.7937424182891846, "learning_rate": 1.73888085248434e-05, "loss": 0.0845, "step": 73875 }, { "epoch": 1.0883492142972857, "grad_norm": 1.3818455934524536, "learning_rate": 1.7387172152876418e-05, "loss": 0.0865, "step": 73900 }, { "epoch": 1.0887173973873727, "grad_norm": 1.3593707084655762, "learning_rate": 1.738553578090943e-05, "loss": 0.0877, "step": 73925 }, { "epoch": 1.0890855804774597, "grad_norm": 1.2562730312347412, "learning_rate": 1.7383899408942447e-05, "loss": 0.0815, "step": 73950 }, { "epoch": 1.089453763567547, "grad_norm": 1.4753555059432983, "learning_rate": 1.738226303697546e-05, "loss": 0.0756, "step": 73975 }, { "epoch": 1.089821946657634, "grad_norm": 0.9510453343391418, "learning_rate": 1.7380626665008476e-05, "loss": 0.085, "step": 74000 }, { "epoch": 1.090190129747721, "grad_norm": 1.5919740200042725, "learning_rate": 1.7378990293041494e-05, "loss": 0.0778, "step": 74025 }, { "epoch": 1.090558312837808, "grad_norm": 1.4968332052230835, "learning_rate": 1.7377353921074508e-05, "loss": 0.0733, "step": 74050 }, { "epoch": 1.090926495927895, "grad_norm": 1.4238145351409912, "learning_rate": 1.7375717549107526e-05, "loss": 0.0732, "step": 74075 }, { "epoch": 1.091294679017982, "grad_norm": 1.6327730417251587, "learning_rate": 1.737408117714054e-05, "loss": 0.0813, "step": 74100 }, { "epoch": 1.091662862108069, "grad_norm": 1.4643621444702148, "learning_rate": 1.7372444805173555e-05, "loss": 0.0828, "step": 74125 }, { "epoch": 1.092031045198156, "grad_norm": 1.0110392570495605, "learning_rate": 1.737080843320657e-05, "loss": 0.0856, "step": 74150 }, { "epoch": 1.092399228288243, "grad_norm": 1.255319356918335, "learning_rate": 1.7369172061239584e-05, "loss": 0.082, "step": 74175 }, { "epoch": 1.0927674113783303, "grad_norm": 1.1562066078186035, "learning_rate": 1.7367535689272602e-05, "loss": 0.0776, "step": 74200 }, { "epoch": 1.0931355944684173, "grad_norm": 1.4678603410720825, "learning_rate": 1.7365899317305616e-05, "loss": 0.0799, "step": 74225 }, { "epoch": 1.0935037775585044, "grad_norm": 1.4903764724731445, "learning_rate": 1.7364262945338634e-05, "loss": 0.0806, "step": 74250 }, { "epoch": 1.0938719606485914, "grad_norm": 1.6453461647033691, "learning_rate": 1.736262657337165e-05, "loss": 0.0857, "step": 74275 }, { "epoch": 1.0942401437386784, "grad_norm": 1.5547235012054443, "learning_rate": 1.7360990201404663e-05, "loss": 0.0788, "step": 74300 }, { "epoch": 1.0946083268287654, "grad_norm": 1.6805888414382935, "learning_rate": 1.735935382943768e-05, "loss": 0.0849, "step": 74325 }, { "epoch": 1.0949765099188524, "grad_norm": 1.1416271924972534, "learning_rate": 1.7357717457470692e-05, "loss": 0.0826, "step": 74350 }, { "epoch": 1.0953446930089394, "grad_norm": 1.2809503078460693, "learning_rate": 1.735608108550371e-05, "loss": 0.0852, "step": 74375 }, { "epoch": 1.0957128760990265, "grad_norm": 1.764859676361084, "learning_rate": 1.7354444713536724e-05, "loss": 0.0797, "step": 74400 }, { "epoch": 1.0960810591891135, "grad_norm": 1.3929708003997803, "learning_rate": 1.735280834156974e-05, "loss": 0.0701, "step": 74425 }, { "epoch": 1.0964492422792005, "grad_norm": 1.5949950218200684, "learning_rate": 1.7351171969602757e-05, "loss": 0.083, "step": 74450 }, { "epoch": 1.0968174253692877, "grad_norm": 1.3857530355453491, "learning_rate": 1.734953559763577e-05, "loss": 0.0809, "step": 74475 }, { "epoch": 1.0971856084593747, "grad_norm": 1.1845228672027588, "learning_rate": 1.734789922566879e-05, "loss": 0.0719, "step": 74500 }, { "epoch": 1.0975537915494618, "grad_norm": 1.6475591659545898, "learning_rate": 1.7346262853701803e-05, "loss": 0.0746, "step": 74525 }, { "epoch": 1.0979219746395488, "grad_norm": 1.65035080909729, "learning_rate": 1.7344691936613497e-05, "loss": 0.0803, "step": 74550 }, { "epoch": 1.0982901577296358, "grad_norm": 1.9141035079956055, "learning_rate": 1.734305556464651e-05, "loss": 0.0887, "step": 74575 }, { "epoch": 1.0986583408197228, "grad_norm": 1.7077194452285767, "learning_rate": 1.7341419192679526e-05, "loss": 0.0851, "step": 74600 }, { "epoch": 1.0990265239098098, "grad_norm": 1.180472493171692, "learning_rate": 1.7339782820712544e-05, "loss": 0.0837, "step": 74625 }, { "epoch": 1.0993947069998968, "grad_norm": 1.4680033922195435, "learning_rate": 1.733814644874556e-05, "loss": 0.0846, "step": 74650 }, { "epoch": 1.0997628900899838, "grad_norm": 1.3979564905166626, "learning_rate": 1.7336510076778573e-05, "loss": 0.0795, "step": 74675 }, { "epoch": 1.100131073180071, "grad_norm": 1.307181477546692, "learning_rate": 1.733487370481159e-05, "loss": 0.0826, "step": 74700 }, { "epoch": 1.100499256270158, "grad_norm": 1.5092790126800537, "learning_rate": 1.7333237332844605e-05, "loss": 0.0822, "step": 74725 }, { "epoch": 1.1008674393602451, "grad_norm": 1.431704044342041, "learning_rate": 1.7331600960877623e-05, "loss": 0.0891, "step": 74750 }, { "epoch": 1.1012356224503321, "grad_norm": 0.9238422513008118, "learning_rate": 1.7329964588910634e-05, "loss": 0.0797, "step": 74775 }, { "epoch": 1.1016038055404191, "grad_norm": 2.129912853240967, "learning_rate": 1.7328328216943652e-05, "loss": 0.0755, "step": 74800 }, { "epoch": 1.1019719886305062, "grad_norm": 1.3834251165390015, "learning_rate": 1.7326691844976666e-05, "loss": 0.0779, "step": 74825 }, { "epoch": 1.1023401717205932, "grad_norm": 1.7270833253860474, "learning_rate": 1.732505547300968e-05, "loss": 0.0762, "step": 74850 }, { "epoch": 1.1027083548106802, "grad_norm": 1.3919258117675781, "learning_rate": 1.73234191010427e-05, "loss": 0.0846, "step": 74875 }, { "epoch": 1.1030765379007672, "grad_norm": 1.6441198587417603, "learning_rate": 1.7321782729075713e-05, "loss": 0.0834, "step": 74900 }, { "epoch": 1.1034447209908542, "grad_norm": 1.5029547214508057, "learning_rate": 1.7320146357108728e-05, "loss": 0.0783, "step": 74925 }, { "epoch": 1.1038129040809415, "grad_norm": 2.0855712890625, "learning_rate": 1.7318509985141745e-05, "loss": 0.0766, "step": 74950 }, { "epoch": 1.1041810871710285, "grad_norm": 1.2368886470794678, "learning_rate": 1.731687361317476e-05, "loss": 0.0853, "step": 74975 }, { "epoch": 1.1045492702611155, "grad_norm": 1.6182918548583984, "learning_rate": 1.7315237241207774e-05, "loss": 0.0825, "step": 75000 }, { "epoch": 1.1049174533512025, "grad_norm": 1.0082389116287231, "learning_rate": 1.731360086924079e-05, "loss": 0.0822, "step": 75025 }, { "epoch": 1.1052856364412895, "grad_norm": 1.4351855516433716, "learning_rate": 1.7311964497273807e-05, "loss": 0.0837, "step": 75050 }, { "epoch": 1.1056538195313765, "grad_norm": 1.5235116481781006, "learning_rate": 1.731032812530682e-05, "loss": 0.0847, "step": 75075 }, { "epoch": 1.1060220026214636, "grad_norm": 1.17570161819458, "learning_rate": 1.7308691753339836e-05, "loss": 0.0774, "step": 75100 }, { "epoch": 1.1063901857115506, "grad_norm": 1.96731698513031, "learning_rate": 1.7307055381372853e-05, "loss": 0.0817, "step": 75125 }, { "epoch": 1.1067583688016376, "grad_norm": 1.8333358764648438, "learning_rate": 1.7305419009405868e-05, "loss": 0.0921, "step": 75150 }, { "epoch": 1.1071265518917248, "grad_norm": 1.3284287452697754, "learning_rate": 1.7303782637438882e-05, "loss": 0.0867, "step": 75175 }, { "epoch": 1.1074947349818118, "grad_norm": 1.3460052013397217, "learning_rate": 1.7302146265471897e-05, "loss": 0.0828, "step": 75200 }, { "epoch": 1.1078629180718989, "grad_norm": 1.5569969415664673, "learning_rate": 1.7300509893504915e-05, "loss": 0.0854, "step": 75225 }, { "epoch": 1.1082311011619859, "grad_norm": 1.6549465656280518, "learning_rate": 1.729887352153793e-05, "loss": 0.0867, "step": 75250 }, { "epoch": 1.1085992842520729, "grad_norm": 1.4125876426696777, "learning_rate": 1.7297237149570944e-05, "loss": 0.0756, "step": 75275 }, { "epoch": 1.10896746734216, "grad_norm": 1.3274352550506592, "learning_rate": 1.729560077760396e-05, "loss": 0.0843, "step": 75300 }, { "epoch": 1.109335650432247, "grad_norm": 1.4729095697402954, "learning_rate": 1.7293964405636976e-05, "loss": 0.081, "step": 75325 }, { "epoch": 1.109703833522334, "grad_norm": 1.2580407857894897, "learning_rate": 1.729232803366999e-05, "loss": 0.081, "step": 75350 }, { "epoch": 1.110072016612421, "grad_norm": 1.4386241436004639, "learning_rate": 1.7290691661703008e-05, "loss": 0.078, "step": 75375 }, { "epoch": 1.110440199702508, "grad_norm": 1.2250170707702637, "learning_rate": 1.728905528973602e-05, "loss": 0.0737, "step": 75400 }, { "epoch": 1.1108083827925952, "grad_norm": 1.6921952962875366, "learning_rate": 1.7287418917769037e-05, "loss": 0.0823, "step": 75425 }, { "epoch": 1.1111765658826822, "grad_norm": 1.3983107805252075, "learning_rate": 1.728578254580205e-05, "loss": 0.0847, "step": 75450 }, { "epoch": 1.1115447489727692, "grad_norm": 1.6122878789901733, "learning_rate": 1.728414617383507e-05, "loss": 0.0845, "step": 75475 }, { "epoch": 1.1119129320628562, "grad_norm": 1.3357850313186646, "learning_rate": 1.7282509801868084e-05, "loss": 0.083, "step": 75500 }, { "epoch": 1.1122811151529433, "grad_norm": 1.0580809116363525, "learning_rate": 1.7280873429901098e-05, "loss": 0.0815, "step": 75525 }, { "epoch": 1.1126492982430303, "grad_norm": 1.527647852897644, "learning_rate": 1.7279237057934116e-05, "loss": 0.0813, "step": 75550 }, { "epoch": 1.1130174813331173, "grad_norm": 0.9379507899284363, "learning_rate": 1.727760068596713e-05, "loss": 0.076, "step": 75575 }, { "epoch": 1.1133856644232043, "grad_norm": 1.5256074666976929, "learning_rate": 1.7275964314000145e-05, "loss": 0.0804, "step": 75600 }, { "epoch": 1.1137538475132913, "grad_norm": 1.0808801651000977, "learning_rate": 1.727432794203316e-05, "loss": 0.0784, "step": 75625 }, { "epoch": 1.1141220306033786, "grad_norm": 1.0583561658859253, "learning_rate": 1.7272691570066177e-05, "loss": 0.0717, "step": 75650 }, { "epoch": 1.1144902136934656, "grad_norm": 1.167378306388855, "learning_rate": 1.7271055198099192e-05, "loss": 0.0794, "step": 75675 }, { "epoch": 1.1148583967835526, "grad_norm": 1.806620478630066, "learning_rate": 1.7269418826132206e-05, "loss": 0.0878, "step": 75700 }, { "epoch": 1.1152265798736396, "grad_norm": 1.5262364149093628, "learning_rate": 1.7267782454165224e-05, "loss": 0.0824, "step": 75725 }, { "epoch": 1.1155947629637266, "grad_norm": 1.6135478019714355, "learning_rate": 1.726614608219824e-05, "loss": 0.084, "step": 75750 }, { "epoch": 1.1159629460538136, "grad_norm": 1.9469832181930542, "learning_rate": 1.7264509710231253e-05, "loss": 0.0846, "step": 75775 }, { "epoch": 1.1163311291439006, "grad_norm": 1.2898473739624023, "learning_rate": 1.726287333826427e-05, "loss": 0.0758, "step": 75800 }, { "epoch": 1.1166993122339877, "grad_norm": 1.1968735456466675, "learning_rate": 1.7261236966297282e-05, "loss": 0.0836, "step": 75825 }, { "epoch": 1.1170674953240747, "grad_norm": 1.5359967947006226, "learning_rate": 1.72596005943303e-05, "loss": 0.0792, "step": 75850 }, { "epoch": 1.1174356784141617, "grad_norm": 1.64653480052948, "learning_rate": 1.7257964222363314e-05, "loss": 0.0858, "step": 75875 }, { "epoch": 1.117803861504249, "grad_norm": 1.3337204456329346, "learning_rate": 1.7256327850396332e-05, "loss": 0.0883, "step": 75900 }, { "epoch": 1.118172044594336, "grad_norm": 1.4658782482147217, "learning_rate": 1.7254691478429347e-05, "loss": 0.0836, "step": 75925 }, { "epoch": 1.118540227684423, "grad_norm": 1.0437543392181396, "learning_rate": 1.725305510646236e-05, "loss": 0.082, "step": 75950 }, { "epoch": 1.11890841077451, "grad_norm": 1.3824712038040161, "learning_rate": 1.725141873449538e-05, "loss": 0.0894, "step": 75975 }, { "epoch": 1.119276593864597, "grad_norm": 1.7544254064559937, "learning_rate": 1.7249782362528393e-05, "loss": 0.077, "step": 76000 }, { "epoch": 1.119644776954684, "grad_norm": 1.0734038352966309, "learning_rate": 1.7248145990561408e-05, "loss": 0.0827, "step": 76025 }, { "epoch": 1.120012960044771, "grad_norm": 1.267118215560913, "learning_rate": 1.7246509618594422e-05, "loss": 0.0801, "step": 76050 }, { "epoch": 1.120381143134858, "grad_norm": 1.6631799936294556, "learning_rate": 1.7244873246627437e-05, "loss": 0.0821, "step": 76075 }, { "epoch": 1.120749326224945, "grad_norm": 1.2830431461334229, "learning_rate": 1.7243236874660455e-05, "loss": 0.0744, "step": 76100 }, { "epoch": 1.1211175093150323, "grad_norm": 1.292281150817871, "learning_rate": 1.724160050269347e-05, "loss": 0.0866, "step": 76125 }, { "epoch": 1.1214856924051193, "grad_norm": 1.574506163597107, "learning_rate": 1.7239964130726487e-05, "loss": 0.081, "step": 76150 }, { "epoch": 1.1218538754952063, "grad_norm": 1.2235420942306519, "learning_rate": 1.72383277587595e-05, "loss": 0.077, "step": 76175 }, { "epoch": 1.1222220585852933, "grad_norm": 1.723737359046936, "learning_rate": 1.7236691386792516e-05, "loss": 0.082, "step": 76200 }, { "epoch": 1.1225902416753804, "grad_norm": 1.4583799839019775, "learning_rate": 1.7235055014825534e-05, "loss": 0.0793, "step": 76225 }, { "epoch": 1.1229584247654674, "grad_norm": 1.1304832696914673, "learning_rate": 1.7233418642858545e-05, "loss": 0.0793, "step": 76250 }, { "epoch": 1.1233266078555544, "grad_norm": 0.9958282709121704, "learning_rate": 1.7231782270891563e-05, "loss": 0.0898, "step": 76275 }, { "epoch": 1.1236947909456414, "grad_norm": 1.4184434413909912, "learning_rate": 1.7230145898924577e-05, "loss": 0.0811, "step": 76300 }, { "epoch": 1.1240629740357284, "grad_norm": 1.0592827796936035, "learning_rate": 1.722850952695759e-05, "loss": 0.0754, "step": 76325 }, { "epoch": 1.1244311571258154, "grad_norm": 1.2309232950210571, "learning_rate": 1.722687315499061e-05, "loss": 0.0733, "step": 76350 }, { "epoch": 1.1247993402159027, "grad_norm": 1.6619325876235962, "learning_rate": 1.7225236783023624e-05, "loss": 0.0726, "step": 76375 }, { "epoch": 1.1251675233059897, "grad_norm": 0.8825359344482422, "learning_rate": 1.722360041105664e-05, "loss": 0.0762, "step": 76400 }, { "epoch": 1.1255357063960767, "grad_norm": 1.4367597103118896, "learning_rate": 1.7221964039089656e-05, "loss": 0.0835, "step": 76425 }, { "epoch": 1.1259038894861637, "grad_norm": 1.4315481185913086, "learning_rate": 1.722032766712267e-05, "loss": 0.0818, "step": 76450 }, { "epoch": 1.1262720725762507, "grad_norm": 1.7525913715362549, "learning_rate": 1.7218691295155685e-05, "loss": 0.0829, "step": 76475 }, { "epoch": 1.1266402556663377, "grad_norm": 1.1785986423492432, "learning_rate": 1.72170549231887e-05, "loss": 0.0756, "step": 76500 }, { "epoch": 1.1270084387564248, "grad_norm": 1.0775268077850342, "learning_rate": 1.7215418551221717e-05, "loss": 0.0831, "step": 76525 }, { "epoch": 1.1273766218465118, "grad_norm": 1.243326187133789, "learning_rate": 1.7213782179254732e-05, "loss": 0.0794, "step": 76550 }, { "epoch": 1.1277448049365988, "grad_norm": 1.255817174911499, "learning_rate": 1.721214580728775e-05, "loss": 0.0801, "step": 76575 }, { "epoch": 1.128112988026686, "grad_norm": 1.1211082935333252, "learning_rate": 1.7210509435320764e-05, "loss": 0.0827, "step": 76600 }, { "epoch": 1.128481171116773, "grad_norm": 1.6233681440353394, "learning_rate": 1.720887306335378e-05, "loss": 0.0792, "step": 76625 }, { "epoch": 1.12884935420686, "grad_norm": 1.3273680210113525, "learning_rate": 1.7207236691386796e-05, "loss": 0.0862, "step": 76650 }, { "epoch": 1.129217537296947, "grad_norm": 1.525869607925415, "learning_rate": 1.7205600319419807e-05, "loss": 0.0734, "step": 76675 }, { "epoch": 1.129585720387034, "grad_norm": 1.7137657403945923, "learning_rate": 1.7203963947452825e-05, "loss": 0.0803, "step": 76700 }, { "epoch": 1.129953903477121, "grad_norm": 1.5917319059371948, "learning_rate": 1.720232757548584e-05, "loss": 0.0847, "step": 76725 }, { "epoch": 1.1303220865672081, "grad_norm": 1.3040844202041626, "learning_rate": 1.7200691203518854e-05, "loss": 0.0687, "step": 76750 }, { "epoch": 1.1306902696572951, "grad_norm": 1.1933627128601074, "learning_rate": 1.7199054831551872e-05, "loss": 0.0811, "step": 76775 }, { "epoch": 1.1310584527473821, "grad_norm": 1.637626051902771, "learning_rate": 1.7197418459584886e-05, "loss": 0.084, "step": 76800 }, { "epoch": 1.1314266358374692, "grad_norm": 1.5711268186569214, "learning_rate": 1.7195782087617904e-05, "loss": 0.0797, "step": 76825 }, { "epoch": 1.1317948189275562, "grad_norm": 1.3153433799743652, "learning_rate": 1.719414571565092e-05, "loss": 0.0853, "step": 76850 }, { "epoch": 1.1321630020176434, "grad_norm": 1.6022580862045288, "learning_rate": 1.7192509343683933e-05, "loss": 0.0732, "step": 76875 }, { "epoch": 1.1325311851077304, "grad_norm": 1.388688564300537, "learning_rate": 1.7190872971716948e-05, "loss": 0.0836, "step": 76900 }, { "epoch": 1.1328993681978174, "grad_norm": 1.6517606973648071, "learning_rate": 1.718930205462864e-05, "loss": 0.088, "step": 76925 }, { "epoch": 1.1332675512879045, "grad_norm": 1.35842764377594, "learning_rate": 1.718766568266166e-05, "loss": 0.0839, "step": 76950 }, { "epoch": 1.1336357343779915, "grad_norm": 1.1529228687286377, "learning_rate": 1.7186029310694674e-05, "loss": 0.0804, "step": 76975 }, { "epoch": 1.1340039174680785, "grad_norm": 1.8080562353134155, "learning_rate": 1.7184392938727688e-05, "loss": 0.0748, "step": 77000 }, { "epoch": 1.1343721005581655, "grad_norm": 1.4784293174743652, "learning_rate": 1.7182756566760706e-05, "loss": 0.0808, "step": 77025 }, { "epoch": 1.1347402836482525, "grad_norm": 1.4406383037567139, "learning_rate": 1.718112019479372e-05, "loss": 0.0843, "step": 77050 }, { "epoch": 1.1351084667383398, "grad_norm": 1.3282133340835571, "learning_rate": 1.717948382282674e-05, "loss": 0.0777, "step": 77075 }, { "epoch": 1.1354766498284268, "grad_norm": 1.4475878477096558, "learning_rate": 1.717784745085975e-05, "loss": 0.0832, "step": 77100 }, { "epoch": 1.1358448329185138, "grad_norm": 1.152169108390808, "learning_rate": 1.7176211078892767e-05, "loss": 0.0846, "step": 77125 }, { "epoch": 1.1362130160086008, "grad_norm": 1.4110019207000732, "learning_rate": 1.7174574706925782e-05, "loss": 0.0781, "step": 77150 }, { "epoch": 1.1365811990986878, "grad_norm": 1.5411064624786377, "learning_rate": 1.7172938334958796e-05, "loss": 0.0828, "step": 77175 }, { "epoch": 1.1369493821887748, "grad_norm": 1.9236423969268799, "learning_rate": 1.7171301962991814e-05, "loss": 0.0763, "step": 77200 }, { "epoch": 1.1373175652788619, "grad_norm": 1.0170892477035522, "learning_rate": 1.716966559102483e-05, "loss": 0.0841, "step": 77225 }, { "epoch": 1.1376857483689489, "grad_norm": 1.2837483882904053, "learning_rate": 1.7168029219057843e-05, "loss": 0.0797, "step": 77250 }, { "epoch": 1.1380539314590359, "grad_norm": 1.3705393075942993, "learning_rate": 1.716639284709086e-05, "loss": 0.0709, "step": 77275 }, { "epoch": 1.138422114549123, "grad_norm": 1.2568047046661377, "learning_rate": 1.7164756475123875e-05, "loss": 0.0761, "step": 77300 }, { "epoch": 1.13879029763921, "grad_norm": 1.4926201105117798, "learning_rate": 1.716312010315689e-05, "loss": 0.0834, "step": 77325 }, { "epoch": 1.1391584807292972, "grad_norm": 1.5288939476013184, "learning_rate": 1.7161483731189904e-05, "loss": 0.0818, "step": 77350 }, { "epoch": 1.1395266638193842, "grad_norm": 1.317007303237915, "learning_rate": 1.7159847359222922e-05, "loss": 0.0751, "step": 77375 }, { "epoch": 1.1398948469094712, "grad_norm": 1.464195966720581, "learning_rate": 1.7158210987255937e-05, "loss": 0.0836, "step": 77400 }, { "epoch": 1.1402630299995582, "grad_norm": 1.6872813701629639, "learning_rate": 1.715657461528895e-05, "loss": 0.0775, "step": 77425 }, { "epoch": 1.1406312130896452, "grad_norm": 1.4622498750686646, "learning_rate": 1.715493824332197e-05, "loss": 0.0826, "step": 77450 }, { "epoch": 1.1409993961797322, "grad_norm": 1.582440972328186, "learning_rate": 1.7153301871354983e-05, "loss": 0.0773, "step": 77475 }, { "epoch": 1.1413675792698192, "grad_norm": 1.4376102685928345, "learning_rate": 1.7151665499387998e-05, "loss": 0.0787, "step": 77500 }, { "epoch": 1.1417357623599063, "grad_norm": 1.6611912250518799, "learning_rate": 1.7150029127421012e-05, "loss": 0.0797, "step": 77525 }, { "epoch": 1.1421039454499935, "grad_norm": 1.2961845397949219, "learning_rate": 1.714839275545403e-05, "loss": 0.0709, "step": 77550 }, { "epoch": 1.1424721285400805, "grad_norm": 1.6757700443267822, "learning_rate": 1.7146756383487045e-05, "loss": 0.0801, "step": 77575 }, { "epoch": 1.1428403116301675, "grad_norm": 1.1338918209075928, "learning_rate": 1.714512001152006e-05, "loss": 0.076, "step": 77600 }, { "epoch": 1.1432084947202545, "grad_norm": 1.7071967124938965, "learning_rate": 1.7143483639553077e-05, "loss": 0.079, "step": 77625 }, { "epoch": 1.1435766778103416, "grad_norm": 1.3826173543930054, "learning_rate": 1.714184726758609e-05, "loss": 0.0789, "step": 77650 }, { "epoch": 1.1439448609004286, "grad_norm": 0.780609130859375, "learning_rate": 1.7140210895619106e-05, "loss": 0.076, "step": 77675 }, { "epoch": 1.1443130439905156, "grad_norm": 1.3437656164169312, "learning_rate": 1.7138574523652124e-05, "loss": 0.0801, "step": 77700 }, { "epoch": 1.1446812270806026, "grad_norm": 1.351599931716919, "learning_rate": 1.7136938151685135e-05, "loss": 0.0804, "step": 77725 }, { "epoch": 1.1450494101706896, "grad_norm": 1.117048978805542, "learning_rate": 1.7135301779718153e-05, "loss": 0.08, "step": 77750 }, { "epoch": 1.1454175932607766, "grad_norm": 1.6592435836791992, "learning_rate": 1.7133665407751167e-05, "loss": 0.0825, "step": 77775 }, { "epoch": 1.1457857763508636, "grad_norm": 1.3648903369903564, "learning_rate": 1.7132029035784185e-05, "loss": 0.0677, "step": 77800 }, { "epoch": 1.1461539594409509, "grad_norm": 1.1514414548873901, "learning_rate": 1.71303926638172e-05, "loss": 0.0723, "step": 77825 }, { "epoch": 1.146522142531038, "grad_norm": 1.6848379373550415, "learning_rate": 1.7128756291850214e-05, "loss": 0.0843, "step": 77850 }, { "epoch": 1.146890325621125, "grad_norm": 1.2856078147888184, "learning_rate": 1.712711991988323e-05, "loss": 0.0747, "step": 77875 }, { "epoch": 1.147258508711212, "grad_norm": 1.5376570224761963, "learning_rate": 1.7125483547916246e-05, "loss": 0.0846, "step": 77900 }, { "epoch": 1.147626691801299, "grad_norm": 1.2517526149749756, "learning_rate": 1.712384717594926e-05, "loss": 0.0805, "step": 77925 }, { "epoch": 1.147994874891386, "grad_norm": 1.3925846815109253, "learning_rate": 1.7122210803982275e-05, "loss": 0.0845, "step": 77950 }, { "epoch": 1.148363057981473, "grad_norm": 1.7829298973083496, "learning_rate": 1.7120574432015293e-05, "loss": 0.0804, "step": 77975 }, { "epoch": 1.14873124107156, "grad_norm": 1.0492154359817505, "learning_rate": 1.7118938060048307e-05, "loss": 0.0744, "step": 78000 }, { "epoch": 1.149099424161647, "grad_norm": 1.22791588306427, "learning_rate": 1.7117301688081322e-05, "loss": 0.0798, "step": 78025 }, { "epoch": 1.1494676072517342, "grad_norm": 1.5338963270187378, "learning_rate": 1.711566531611434e-05, "loss": 0.0729, "step": 78050 }, { "epoch": 1.1498357903418213, "grad_norm": 1.3798317909240723, "learning_rate": 1.7114028944147354e-05, "loss": 0.082, "step": 78075 }, { "epoch": 1.1502039734319083, "grad_norm": 1.2627533674240112, "learning_rate": 1.711239257218037e-05, "loss": 0.0806, "step": 78100 }, { "epoch": 1.1505721565219953, "grad_norm": 1.3761323690414429, "learning_rate": 1.7110756200213386e-05, "loss": 0.079, "step": 78125 }, { "epoch": 1.1509403396120823, "grad_norm": 1.0720791816711426, "learning_rate": 1.7109119828246397e-05, "loss": 0.076, "step": 78150 }, { "epoch": 1.1513085227021693, "grad_norm": 1.3640034198760986, "learning_rate": 1.7107483456279415e-05, "loss": 0.0777, "step": 78175 }, { "epoch": 1.1516767057922563, "grad_norm": 1.560114860534668, "learning_rate": 1.710584708431243e-05, "loss": 0.0805, "step": 78200 }, { "epoch": 1.1520448888823434, "grad_norm": 1.338734745979309, "learning_rate": 1.7104210712345448e-05, "loss": 0.0737, "step": 78225 }, { "epoch": 1.1524130719724304, "grad_norm": 1.5766642093658447, "learning_rate": 1.7102574340378462e-05, "loss": 0.0846, "step": 78250 }, { "epoch": 1.1527812550625174, "grad_norm": 0.9298288822174072, "learning_rate": 1.7100937968411476e-05, "loss": 0.083, "step": 78275 }, { "epoch": 1.1531494381526046, "grad_norm": 1.1307923793792725, "learning_rate": 1.7099301596444494e-05, "loss": 0.0701, "step": 78300 }, { "epoch": 1.1535176212426916, "grad_norm": 1.4159973859786987, "learning_rate": 1.709766522447751e-05, "loss": 0.0827, "step": 78325 }, { "epoch": 1.1538858043327787, "grad_norm": 1.4030858278274536, "learning_rate": 1.7096028852510523e-05, "loss": 0.0818, "step": 78350 }, { "epoch": 1.1542539874228657, "grad_norm": 1.3141252994537354, "learning_rate": 1.7094392480543538e-05, "loss": 0.078, "step": 78375 }, { "epoch": 1.1546221705129527, "grad_norm": 1.3796144723892212, "learning_rate": 1.7092756108576552e-05, "loss": 0.0788, "step": 78400 }, { "epoch": 1.1549903536030397, "grad_norm": 1.6221325397491455, "learning_rate": 1.709111973660957e-05, "loss": 0.087, "step": 78425 }, { "epoch": 1.1553585366931267, "grad_norm": 1.1633723974227905, "learning_rate": 1.7089483364642584e-05, "loss": 0.0802, "step": 78450 }, { "epoch": 1.1557267197832137, "grad_norm": 0.8109989762306213, "learning_rate": 1.7087846992675602e-05, "loss": 0.0828, "step": 78475 }, { "epoch": 1.1560949028733007, "grad_norm": 1.7017933130264282, "learning_rate": 1.7086210620708617e-05, "loss": 0.08, "step": 78500 }, { "epoch": 1.156463085963388, "grad_norm": 1.5921801328659058, "learning_rate": 1.708457424874163e-05, "loss": 0.0789, "step": 78525 }, { "epoch": 1.156831269053475, "grad_norm": 1.5675138235092163, "learning_rate": 1.708293787677465e-05, "loss": 0.0795, "step": 78550 }, { "epoch": 1.157199452143562, "grad_norm": 1.6273598670959473, "learning_rate": 1.708130150480766e-05, "loss": 0.0745, "step": 78575 }, { "epoch": 1.157567635233649, "grad_norm": 1.4480699300765991, "learning_rate": 1.7079665132840678e-05, "loss": 0.0779, "step": 78600 }, { "epoch": 1.157935818323736, "grad_norm": 1.0813771486282349, "learning_rate": 1.7078028760873692e-05, "loss": 0.0761, "step": 78625 }, { "epoch": 1.158304001413823, "grad_norm": 1.6519935131072998, "learning_rate": 1.7076392388906707e-05, "loss": 0.0796, "step": 78650 }, { "epoch": 1.15867218450391, "grad_norm": 1.2597001791000366, "learning_rate": 1.7074756016939725e-05, "loss": 0.0781, "step": 78675 }, { "epoch": 1.159040367593997, "grad_norm": 1.4550304412841797, "learning_rate": 1.707311964497274e-05, "loss": 0.075, "step": 78700 }, { "epoch": 1.159408550684084, "grad_norm": 1.2241061925888062, "learning_rate": 1.7071483273005757e-05, "loss": 0.0777, "step": 78725 }, { "epoch": 1.1597767337741711, "grad_norm": 1.7053532600402832, "learning_rate": 1.706984690103877e-05, "loss": 0.0832, "step": 78750 }, { "epoch": 1.1601449168642581, "grad_norm": 1.2709039449691772, "learning_rate": 1.7068210529071786e-05, "loss": 0.0769, "step": 78775 }, { "epoch": 1.1605130999543454, "grad_norm": 1.2542306184768677, "learning_rate": 1.70665741571048e-05, "loss": 0.084, "step": 78800 }, { "epoch": 1.1608812830444324, "grad_norm": 1.642019271850586, "learning_rate": 1.7064937785137815e-05, "loss": 0.0748, "step": 78825 }, { "epoch": 1.1612494661345194, "grad_norm": 1.9131805896759033, "learning_rate": 1.7063301413170833e-05, "loss": 0.0777, "step": 78850 }, { "epoch": 1.1616176492246064, "grad_norm": 1.2671501636505127, "learning_rate": 1.7061665041203847e-05, "loss": 0.0753, "step": 78875 }, { "epoch": 1.1619858323146934, "grad_norm": 1.4796531200408936, "learning_rate": 1.7060028669236865e-05, "loss": 0.0837, "step": 78900 }, { "epoch": 1.1623540154047804, "grad_norm": 1.3053607940673828, "learning_rate": 1.705839229726988e-05, "loss": 0.0769, "step": 78925 }, { "epoch": 1.1627221984948675, "grad_norm": 1.5992932319641113, "learning_rate": 1.7056755925302894e-05, "loss": 0.0898, "step": 78950 }, { "epoch": 1.1630903815849545, "grad_norm": 1.2795113325119019, "learning_rate": 1.705511955333591e-05, "loss": 0.0839, "step": 78975 }, { "epoch": 1.1634585646750417, "grad_norm": 1.5682183504104614, "learning_rate": 1.7053483181368923e-05, "loss": 0.0806, "step": 79000 }, { "epoch": 1.1638267477651287, "grad_norm": 1.5702452659606934, "learning_rate": 1.705191226428062e-05, "loss": 0.0745, "step": 79025 }, { "epoch": 1.1641949308552157, "grad_norm": 1.4337961673736572, "learning_rate": 1.7050275892313635e-05, "loss": 0.0761, "step": 79050 }, { "epoch": 1.1645631139453028, "grad_norm": 1.2402743101119995, "learning_rate": 1.704863952034665e-05, "loss": 0.0868, "step": 79075 }, { "epoch": 1.1649312970353898, "grad_norm": 1.2124780416488647, "learning_rate": 1.7047003148379667e-05, "loss": 0.0879, "step": 79100 }, { "epoch": 1.1652994801254768, "grad_norm": 1.216910719871521, "learning_rate": 1.704536677641268e-05, "loss": 0.0734, "step": 79125 }, { "epoch": 1.1656676632155638, "grad_norm": 1.1891676187515259, "learning_rate": 1.70437304044457e-05, "loss": 0.0838, "step": 79150 }, { "epoch": 1.1660358463056508, "grad_norm": 1.1938279867172241, "learning_rate": 1.7042094032478714e-05, "loss": 0.0838, "step": 79175 }, { "epoch": 1.1664040293957378, "grad_norm": 1.1487699747085571, "learning_rate": 1.7040457660511728e-05, "loss": 0.0892, "step": 79200 }, { "epoch": 1.1667722124858249, "grad_norm": 1.526025652885437, "learning_rate": 1.7038821288544743e-05, "loss": 0.0788, "step": 79225 }, { "epoch": 1.1671403955759119, "grad_norm": 1.1601793766021729, "learning_rate": 1.7037184916577757e-05, "loss": 0.0819, "step": 79250 }, { "epoch": 1.167508578665999, "grad_norm": 0.9991488456726074, "learning_rate": 1.7035548544610775e-05, "loss": 0.0813, "step": 79275 }, { "epoch": 1.1678767617560861, "grad_norm": 1.7756083011627197, "learning_rate": 1.703391217264379e-05, "loss": 0.0766, "step": 79300 }, { "epoch": 1.1682449448461731, "grad_norm": 1.1006495952606201, "learning_rate": 1.7032275800676804e-05, "loss": 0.0721, "step": 79325 }, { "epoch": 1.1686131279362602, "grad_norm": 1.1231677532196045, "learning_rate": 1.703063942870982e-05, "loss": 0.0805, "step": 79350 }, { "epoch": 1.1689813110263472, "grad_norm": 1.5433217287063599, "learning_rate": 1.7029003056742836e-05, "loss": 0.0744, "step": 79375 }, { "epoch": 1.1693494941164342, "grad_norm": 1.6323466300964355, "learning_rate": 1.702736668477585e-05, "loss": 0.075, "step": 79400 }, { "epoch": 1.1697176772065212, "grad_norm": 1.1453145742416382, "learning_rate": 1.7025730312808865e-05, "loss": 0.0756, "step": 79425 }, { "epoch": 1.1700858602966082, "grad_norm": 1.7219243049621582, "learning_rate": 1.7024093940841883e-05, "loss": 0.0812, "step": 79450 }, { "epoch": 1.1704540433866955, "grad_norm": 1.5594786405563354, "learning_rate": 1.7022457568874897e-05, "loss": 0.0763, "step": 79475 }, { "epoch": 1.1708222264767825, "grad_norm": 1.3090931177139282, "learning_rate": 1.7020821196907912e-05, "loss": 0.0804, "step": 79500 }, { "epoch": 1.1711904095668695, "grad_norm": 1.0847352743148804, "learning_rate": 1.701918482494093e-05, "loss": 0.0759, "step": 79525 }, { "epoch": 1.1715585926569565, "grad_norm": 1.6062463521957397, "learning_rate": 1.7017548452973944e-05, "loss": 0.0796, "step": 79550 }, { "epoch": 1.1719267757470435, "grad_norm": 1.486147403717041, "learning_rate": 1.701591208100696e-05, "loss": 0.0778, "step": 79575 }, { "epoch": 1.1722949588371305, "grad_norm": 1.7430894374847412, "learning_rate": 1.7014275709039976e-05, "loss": 0.0825, "step": 79600 }, { "epoch": 1.1726631419272175, "grad_norm": 1.096242070198059, "learning_rate": 1.701263933707299e-05, "loss": 0.0761, "step": 79625 }, { "epoch": 1.1730313250173046, "grad_norm": 0.9886026382446289, "learning_rate": 1.7011002965106005e-05, "loss": 0.0805, "step": 79650 }, { "epoch": 1.1733995081073916, "grad_norm": 1.458372712135315, "learning_rate": 1.700936659313902e-05, "loss": 0.076, "step": 79675 }, { "epoch": 1.1737676911974786, "grad_norm": 1.2455376386642456, "learning_rate": 1.7007730221172038e-05, "loss": 0.0703, "step": 79700 }, { "epoch": 1.1741358742875656, "grad_norm": 1.4454127550125122, "learning_rate": 1.7006093849205052e-05, "loss": 0.0792, "step": 79725 }, { "epoch": 1.1745040573776528, "grad_norm": 2.116722345352173, "learning_rate": 1.7004457477238066e-05, "loss": 0.0859, "step": 79750 }, { "epoch": 1.1748722404677399, "grad_norm": 1.438284993171692, "learning_rate": 1.7002821105271084e-05, "loss": 0.0767, "step": 79775 }, { "epoch": 1.1752404235578269, "grad_norm": 1.1335705518722534, "learning_rate": 1.70011847333041e-05, "loss": 0.0725, "step": 79800 }, { "epoch": 1.175608606647914, "grad_norm": 0.9960853457450867, "learning_rate": 1.6999548361337113e-05, "loss": 0.0834, "step": 79825 }, { "epoch": 1.175976789738001, "grad_norm": 1.3727959394454956, "learning_rate": 1.6997911989370128e-05, "loss": 0.0745, "step": 79850 }, { "epoch": 1.176344972828088, "grad_norm": 1.1427618265151978, "learning_rate": 1.6996275617403146e-05, "loss": 0.0817, "step": 79875 }, { "epoch": 1.176713155918175, "grad_norm": 1.2711044549942017, "learning_rate": 1.699463924543616e-05, "loss": 0.0756, "step": 79900 }, { "epoch": 1.177081339008262, "grad_norm": 1.662916660308838, "learning_rate": 1.6993002873469174e-05, "loss": 0.0805, "step": 79925 }, { "epoch": 1.1774495220983492, "grad_norm": 1.486621618270874, "learning_rate": 1.6991366501502192e-05, "loss": 0.0776, "step": 79950 }, { "epoch": 1.1778177051884362, "grad_norm": 1.3024013042449951, "learning_rate": 1.6989730129535207e-05, "loss": 0.0795, "step": 79975 }, { "epoch": 1.1781858882785232, "grad_norm": 1.5854506492614746, "learning_rate": 1.698809375756822e-05, "loss": 0.0702, "step": 80000 }, { "epoch": 1.1785540713686102, "grad_norm": 1.459588885307312, "learning_rate": 1.698645738560124e-05, "loss": 0.0752, "step": 80025 }, { "epoch": 1.1789222544586972, "grad_norm": 1.313154697418213, "learning_rate": 1.698482101363425e-05, "loss": 0.0747, "step": 80050 }, { "epoch": 1.1792904375487843, "grad_norm": 1.4655460119247437, "learning_rate": 1.6983184641667268e-05, "loss": 0.0765, "step": 80075 }, { "epoch": 1.1796586206388713, "grad_norm": 2.191974401473999, "learning_rate": 1.6981548269700282e-05, "loss": 0.0862, "step": 80100 }, { "epoch": 1.1800268037289583, "grad_norm": 1.5848077535629272, "learning_rate": 1.69799118977333e-05, "loss": 0.0793, "step": 80125 }, { "epoch": 1.1803949868190453, "grad_norm": 1.5642589330673218, "learning_rate": 1.6978275525766315e-05, "loss": 0.0833, "step": 80150 }, { "epoch": 1.1807631699091323, "grad_norm": 1.6908992528915405, "learning_rate": 1.697663915379933e-05, "loss": 0.0823, "step": 80175 }, { "epoch": 1.1811313529992193, "grad_norm": 1.2724112272262573, "learning_rate": 1.6975002781832347e-05, "loss": 0.0803, "step": 80200 }, { "epoch": 1.1814995360893066, "grad_norm": 1.3824703693389893, "learning_rate": 1.697336640986536e-05, "loss": 0.0837, "step": 80225 }, { "epoch": 1.1818677191793936, "grad_norm": 1.66892671585083, "learning_rate": 1.6971730037898376e-05, "loss": 0.0775, "step": 80250 }, { "epoch": 1.1822359022694806, "grad_norm": 1.3801589012145996, "learning_rate": 1.697009366593139e-05, "loss": 0.0768, "step": 80275 }, { "epoch": 1.1826040853595676, "grad_norm": 1.526690125465393, "learning_rate": 1.6968457293964408e-05, "loss": 0.0818, "step": 80300 }, { "epoch": 1.1829722684496546, "grad_norm": 1.5594807863235474, "learning_rate": 1.6966820921997423e-05, "loss": 0.0797, "step": 80325 }, { "epoch": 1.1833404515397417, "grad_norm": 1.5034478902816772, "learning_rate": 1.6965184550030437e-05, "loss": 0.0815, "step": 80350 }, { "epoch": 1.1837086346298287, "grad_norm": 1.3898206949234009, "learning_rate": 1.6963548178063455e-05, "loss": 0.0819, "step": 80375 }, { "epoch": 1.1840768177199157, "grad_norm": 1.6028202772140503, "learning_rate": 1.696191180609647e-05, "loss": 0.0815, "step": 80400 }, { "epoch": 1.1844450008100027, "grad_norm": 1.446126103401184, "learning_rate": 1.6960275434129484e-05, "loss": 0.075, "step": 80425 }, { "epoch": 1.18481318390009, "grad_norm": 0.9129065275192261, "learning_rate": 1.6958639062162502e-05, "loss": 0.0761, "step": 80450 }, { "epoch": 1.185181366990177, "grad_norm": 1.6920312643051147, "learning_rate": 1.6957002690195513e-05, "loss": 0.0817, "step": 80475 }, { "epoch": 1.185549550080264, "grad_norm": 1.3888028860092163, "learning_rate": 1.695536631822853e-05, "loss": 0.0728, "step": 80500 }, { "epoch": 1.185917733170351, "grad_norm": 1.6679725646972656, "learning_rate": 1.6953729946261545e-05, "loss": 0.0908, "step": 80525 }, { "epoch": 1.186285916260438, "grad_norm": 1.0872584581375122, "learning_rate": 1.6952093574294563e-05, "loss": 0.0811, "step": 80550 }, { "epoch": 1.186654099350525, "grad_norm": 1.3815594911575317, "learning_rate": 1.6950457202327577e-05, "loss": 0.0748, "step": 80575 }, { "epoch": 1.187022282440612, "grad_norm": 1.1395231485366821, "learning_rate": 1.6948820830360592e-05, "loss": 0.083, "step": 80600 }, { "epoch": 1.187390465530699, "grad_norm": 1.7929774522781372, "learning_rate": 1.694718445839361e-05, "loss": 0.0777, "step": 80625 }, { "epoch": 1.187758648620786, "grad_norm": 1.1024836301803589, "learning_rate": 1.6945548086426624e-05, "loss": 0.0779, "step": 80650 }, { "epoch": 1.188126831710873, "grad_norm": 1.374184489250183, "learning_rate": 1.694391171445964e-05, "loss": 0.0776, "step": 80675 }, { "epoch": 1.1884950148009603, "grad_norm": 1.7337616682052612, "learning_rate": 1.6942275342492653e-05, "loss": 0.08, "step": 80700 }, { "epoch": 1.1888631978910473, "grad_norm": 1.402324914932251, "learning_rate": 1.6940638970525668e-05, "loss": 0.0796, "step": 80725 }, { "epoch": 1.1892313809811343, "grad_norm": 1.525404453277588, "learning_rate": 1.6939002598558685e-05, "loss": 0.0861, "step": 80750 }, { "epoch": 1.1895995640712214, "grad_norm": 1.3129658699035645, "learning_rate": 1.69373662265917e-05, "loss": 0.0761, "step": 80775 }, { "epoch": 1.1899677471613084, "grad_norm": 1.0589076280593872, "learning_rate": 1.6935729854624718e-05, "loss": 0.0813, "step": 80800 }, { "epoch": 1.1903359302513954, "grad_norm": 1.4902567863464355, "learning_rate": 1.6934093482657732e-05, "loss": 0.0741, "step": 80825 }, { "epoch": 1.1907041133414824, "grad_norm": 1.9505789279937744, "learning_rate": 1.6932457110690747e-05, "loss": 0.0757, "step": 80850 }, { "epoch": 1.1910722964315694, "grad_norm": 1.4526811838150024, "learning_rate": 1.693082073872376e-05, "loss": 0.0771, "step": 80875 }, { "epoch": 1.1914404795216564, "grad_norm": 1.4940719604492188, "learning_rate": 1.6929184366756776e-05, "loss": 0.0744, "step": 80900 }, { "epoch": 1.1918086626117437, "grad_norm": 1.354565978050232, "learning_rate": 1.6927547994789793e-05, "loss": 0.0773, "step": 80925 }, { "epoch": 1.1921768457018307, "grad_norm": 1.7248589992523193, "learning_rate": 1.6925911622822808e-05, "loss": 0.0785, "step": 80950 }, { "epoch": 1.1925450287919177, "grad_norm": 1.612230896949768, "learning_rate": 1.6924275250855822e-05, "loss": 0.0815, "step": 80975 }, { "epoch": 1.1929132118820047, "grad_norm": 1.3987089395523071, "learning_rate": 1.692263887888884e-05, "loss": 0.0812, "step": 81000 }, { "epoch": 1.1932813949720917, "grad_norm": 1.4682742357254028, "learning_rate": 1.6921002506921855e-05, "loss": 0.0703, "step": 81025 }, { "epoch": 1.1936495780621788, "grad_norm": 1.3240442276000977, "learning_rate": 1.6919366134954872e-05, "loss": 0.0781, "step": 81050 }, { "epoch": 1.1940177611522658, "grad_norm": 1.2376230955123901, "learning_rate": 1.6917729762987887e-05, "loss": 0.0774, "step": 81075 }, { "epoch": 1.1943859442423528, "grad_norm": 1.812394618988037, "learning_rate": 1.69160933910209e-05, "loss": 0.083, "step": 81100 }, { "epoch": 1.1947541273324398, "grad_norm": 1.4060970544815063, "learning_rate": 1.6914457019053916e-05, "loss": 0.0816, "step": 81125 }, { "epoch": 1.1951223104225268, "grad_norm": 1.2166340351104736, "learning_rate": 1.691282064708693e-05, "loss": 0.0738, "step": 81150 }, { "epoch": 1.195490493512614, "grad_norm": 1.5965485572814941, "learning_rate": 1.6911184275119948e-05, "loss": 0.0772, "step": 81175 }, { "epoch": 1.195858676602701, "grad_norm": 1.1460227966308594, "learning_rate": 1.6909547903152963e-05, "loss": 0.0852, "step": 81200 }, { "epoch": 1.196226859692788, "grad_norm": 1.0590081214904785, "learning_rate": 1.6907911531185977e-05, "loss": 0.0746, "step": 81225 }, { "epoch": 1.196595042782875, "grad_norm": 1.3020614385604858, "learning_rate": 1.6906275159218995e-05, "loss": 0.0705, "step": 81250 }, { "epoch": 1.1969632258729621, "grad_norm": 1.3257240056991577, "learning_rate": 1.690463878725201e-05, "loss": 0.0785, "step": 81275 }, { "epoch": 1.1973314089630491, "grad_norm": 1.8245090246200562, "learning_rate": 1.6903002415285024e-05, "loss": 0.0788, "step": 81300 }, { "epoch": 1.1976995920531361, "grad_norm": 1.2296271324157715, "learning_rate": 1.6901366043318038e-05, "loss": 0.0747, "step": 81325 }, { "epoch": 1.1980677751432232, "grad_norm": 1.7340651750564575, "learning_rate": 1.6899729671351056e-05, "loss": 0.0846, "step": 81350 }, { "epoch": 1.1984359582333102, "grad_norm": 1.7498055696487427, "learning_rate": 1.689809329938407e-05, "loss": 0.0787, "step": 81375 }, { "epoch": 1.1988041413233974, "grad_norm": 1.2333828210830688, "learning_rate": 1.6896456927417085e-05, "loss": 0.078, "step": 81400 }, { "epoch": 1.1991723244134844, "grad_norm": 1.2235738039016724, "learning_rate": 1.6894820555450103e-05, "loss": 0.0763, "step": 81425 }, { "epoch": 1.1995405075035714, "grad_norm": 1.2170321941375732, "learning_rate": 1.6893184183483117e-05, "loss": 0.0801, "step": 81450 }, { "epoch": 1.1999086905936585, "grad_norm": 1.5779669284820557, "learning_rate": 1.6891547811516135e-05, "loss": 0.0784, "step": 81475 }, { "epoch": 1.2002768736837455, "grad_norm": 0.9630232453346252, "learning_rate": 1.688991143954915e-05, "loss": 0.0748, "step": 81500 }, { "epoch": 1.2006450567738325, "grad_norm": 1.3928108215332031, "learning_rate": 1.6888275067582164e-05, "loss": 0.0679, "step": 81525 }, { "epoch": 1.2010132398639195, "grad_norm": 1.3765554428100586, "learning_rate": 1.688663869561518e-05, "loss": 0.0748, "step": 81550 }, { "epoch": 1.2013814229540065, "grad_norm": 1.31941556930542, "learning_rate": 1.6885067778526872e-05, "loss": 0.0749, "step": 81575 }, { "epoch": 1.2017496060440935, "grad_norm": 1.8676040172576904, "learning_rate": 1.688343140655989e-05, "loss": 0.0778, "step": 81600 }, { "epoch": 1.2021177891341805, "grad_norm": 1.0498555898666382, "learning_rate": 1.6881795034592905e-05, "loss": 0.075, "step": 81625 }, { "epoch": 1.2024859722242676, "grad_norm": 1.543021559715271, "learning_rate": 1.688015866262592e-05, "loss": 0.0791, "step": 81650 }, { "epoch": 1.2028541553143548, "grad_norm": 1.4632128477096558, "learning_rate": 1.6878522290658937e-05, "loss": 0.0677, "step": 81675 }, { "epoch": 1.2032223384044418, "grad_norm": 1.2469948530197144, "learning_rate": 1.687688591869195e-05, "loss": 0.0846, "step": 81700 }, { "epoch": 1.2035905214945288, "grad_norm": 1.282270908355713, "learning_rate": 1.6875249546724966e-05, "loss": 0.0826, "step": 81725 }, { "epoch": 1.2039587045846158, "grad_norm": 1.6557286977767944, "learning_rate": 1.687361317475798e-05, "loss": 0.074, "step": 81750 }, { "epoch": 1.2043268876747029, "grad_norm": 1.2044891119003296, "learning_rate": 1.6871976802790998e-05, "loss": 0.0768, "step": 81775 }, { "epoch": 1.2046950707647899, "grad_norm": 1.0072743892669678, "learning_rate": 1.6870340430824013e-05, "loss": 0.0724, "step": 81800 }, { "epoch": 1.205063253854877, "grad_norm": 1.2253731489181519, "learning_rate": 1.6868704058857027e-05, "loss": 0.0742, "step": 81825 }, { "epoch": 1.205431436944964, "grad_norm": 1.1172629594802856, "learning_rate": 1.6867067686890045e-05, "loss": 0.0746, "step": 81850 }, { "epoch": 1.2057996200350511, "grad_norm": 1.8046514987945557, "learning_rate": 1.686543131492306e-05, "loss": 0.0853, "step": 81875 }, { "epoch": 1.2061678031251382, "grad_norm": 1.565524935722351, "learning_rate": 1.6863794942956074e-05, "loss": 0.0923, "step": 81900 }, { "epoch": 1.2065359862152252, "grad_norm": 1.6693696975708008, "learning_rate": 1.6862158570989092e-05, "loss": 0.0756, "step": 81925 }, { "epoch": 1.2069041693053122, "grad_norm": 1.2145295143127441, "learning_rate": 1.6860522199022106e-05, "loss": 0.0802, "step": 81950 }, { "epoch": 1.2072723523953992, "grad_norm": 1.9580086469650269, "learning_rate": 1.685888582705512e-05, "loss": 0.0802, "step": 81975 }, { "epoch": 1.2076405354854862, "grad_norm": 1.2828919887542725, "learning_rate": 1.6857249455088135e-05, "loss": 0.0715, "step": 82000 }, { "epoch": 1.2080087185755732, "grad_norm": 1.1126285791397095, "learning_rate": 1.6855613083121153e-05, "loss": 0.0723, "step": 82025 }, { "epoch": 1.2083769016656603, "grad_norm": 1.360633373260498, "learning_rate": 1.6853976711154167e-05, "loss": 0.0749, "step": 82050 }, { "epoch": 1.2087450847557473, "grad_norm": 1.452627420425415, "learning_rate": 1.6852340339187182e-05, "loss": 0.0825, "step": 82075 }, { "epoch": 1.2091132678458343, "grad_norm": 1.4565470218658447, "learning_rate": 1.68507039672202e-05, "loss": 0.0739, "step": 82100 }, { "epoch": 1.2094814509359213, "grad_norm": 1.4982283115386963, "learning_rate": 1.6849067595253214e-05, "loss": 0.0731, "step": 82125 }, { "epoch": 1.2098496340260085, "grad_norm": 1.1828523874282837, "learning_rate": 1.684743122328623e-05, "loss": 0.0747, "step": 82150 }, { "epoch": 1.2102178171160956, "grad_norm": 0.8931102752685547, "learning_rate": 1.6845794851319243e-05, "loss": 0.0687, "step": 82175 }, { "epoch": 1.2105860002061826, "grad_norm": 1.536070704460144, "learning_rate": 1.684415847935226e-05, "loss": 0.0774, "step": 82200 }, { "epoch": 1.2109541832962696, "grad_norm": 0.6760857105255127, "learning_rate": 1.6842522107385275e-05, "loss": 0.0739, "step": 82225 }, { "epoch": 1.2113223663863566, "grad_norm": 1.686180830001831, "learning_rate": 1.684088573541829e-05, "loss": 0.0721, "step": 82250 }, { "epoch": 1.2116905494764436, "grad_norm": 1.1491360664367676, "learning_rate": 1.6839249363451308e-05, "loss": 0.075, "step": 82275 }, { "epoch": 1.2120587325665306, "grad_norm": 1.8029288053512573, "learning_rate": 1.6837612991484322e-05, "loss": 0.0759, "step": 82300 }, { "epoch": 1.2124269156566176, "grad_norm": 1.4963215589523315, "learning_rate": 1.6835976619517337e-05, "loss": 0.0798, "step": 82325 }, { "epoch": 1.2127950987467049, "grad_norm": 1.3388421535491943, "learning_rate": 1.683434024755035e-05, "loss": 0.0718, "step": 82350 }, { "epoch": 1.213163281836792, "grad_norm": 0.9847051501274109, "learning_rate": 1.6832703875583366e-05, "loss": 0.0791, "step": 82375 }, { "epoch": 1.213531464926879, "grad_norm": 1.8953324556350708, "learning_rate": 1.6831067503616383e-05, "loss": 0.0777, "step": 82400 }, { "epoch": 1.213899648016966, "grad_norm": 1.5321537256240845, "learning_rate": 1.6829431131649398e-05, "loss": 0.0768, "step": 82425 }, { "epoch": 1.214267831107053, "grad_norm": 1.463422417640686, "learning_rate": 1.6827794759682416e-05, "loss": 0.0672, "step": 82450 }, { "epoch": 1.21463601419714, "grad_norm": 1.3408867120742798, "learning_rate": 1.682615838771543e-05, "loss": 0.0805, "step": 82475 }, { "epoch": 1.215004197287227, "grad_norm": 1.4994094371795654, "learning_rate": 1.6824522015748445e-05, "loss": 0.0741, "step": 82500 }, { "epoch": 1.215372380377314, "grad_norm": 1.6190303564071655, "learning_rate": 1.6822885643781462e-05, "loss": 0.0759, "step": 82525 }, { "epoch": 1.215740563467401, "grad_norm": 1.7373073101043701, "learning_rate": 1.6821249271814477e-05, "loss": 0.0754, "step": 82550 }, { "epoch": 1.216108746557488, "grad_norm": 1.3519747257232666, "learning_rate": 1.681961289984749e-05, "loss": 0.0713, "step": 82575 }, { "epoch": 1.216476929647575, "grad_norm": 1.243880033493042, "learning_rate": 1.6817976527880506e-05, "loss": 0.0708, "step": 82600 }, { "epoch": 1.2168451127376623, "grad_norm": 1.8985295295715332, "learning_rate": 1.6816340155913524e-05, "loss": 0.0836, "step": 82625 }, { "epoch": 1.2172132958277493, "grad_norm": 1.1356552839279175, "learning_rate": 1.6814703783946538e-05, "loss": 0.0736, "step": 82650 }, { "epoch": 1.2175814789178363, "grad_norm": 1.2517777681350708, "learning_rate": 1.6813067411979553e-05, "loss": 0.0846, "step": 82675 }, { "epoch": 1.2179496620079233, "grad_norm": 1.986774206161499, "learning_rate": 1.681143104001257e-05, "loss": 0.0798, "step": 82700 }, { "epoch": 1.2183178450980103, "grad_norm": 1.293954849243164, "learning_rate": 1.6809794668045585e-05, "loss": 0.0786, "step": 82725 }, { "epoch": 1.2186860281880973, "grad_norm": 1.5015208721160889, "learning_rate": 1.68081582960786e-05, "loss": 0.0785, "step": 82750 }, { "epoch": 1.2190542112781844, "grad_norm": 1.105033040046692, "learning_rate": 1.6806521924111614e-05, "loss": 0.0769, "step": 82775 }, { "epoch": 1.2194223943682714, "grad_norm": 1.5143176317214966, "learning_rate": 1.6804885552144628e-05, "loss": 0.077, "step": 82800 }, { "epoch": 1.2197905774583586, "grad_norm": 1.419572353363037, "learning_rate": 1.6803249180177646e-05, "loss": 0.075, "step": 82825 }, { "epoch": 1.2201587605484456, "grad_norm": 2.0381019115448, "learning_rate": 1.680161280821066e-05, "loss": 0.0826, "step": 82850 }, { "epoch": 1.2205269436385326, "grad_norm": 1.2022831439971924, "learning_rate": 1.679997643624368e-05, "loss": 0.0801, "step": 82875 }, { "epoch": 1.2208951267286197, "grad_norm": 1.3925647735595703, "learning_rate": 1.6798340064276693e-05, "loss": 0.0734, "step": 82900 }, { "epoch": 1.2212633098187067, "grad_norm": 1.3878875970840454, "learning_rate": 1.6796703692309707e-05, "loss": 0.0789, "step": 82925 }, { "epoch": 1.2216314929087937, "grad_norm": 1.5642000436782837, "learning_rate": 1.6795067320342725e-05, "loss": 0.0822, "step": 82950 }, { "epoch": 1.2219996759988807, "grad_norm": 1.860133409500122, "learning_rate": 1.679343094837574e-05, "loss": 0.0789, "step": 82975 }, { "epoch": 1.2223678590889677, "grad_norm": 1.5566082000732422, "learning_rate": 1.6791794576408754e-05, "loss": 0.0701, "step": 83000 }, { "epoch": 1.2227360421790547, "grad_norm": 1.350318193435669, "learning_rate": 1.679015820444177e-05, "loss": 0.0747, "step": 83025 }, { "epoch": 1.2231042252691418, "grad_norm": 1.1439335346221924, "learning_rate": 1.6788521832474783e-05, "loss": 0.0764, "step": 83050 }, { "epoch": 1.2234724083592288, "grad_norm": 1.71803617477417, "learning_rate": 1.67868854605078e-05, "loss": 0.078, "step": 83075 }, { "epoch": 1.223840591449316, "grad_norm": 1.3117272853851318, "learning_rate": 1.6785249088540815e-05, "loss": 0.0759, "step": 83100 }, { "epoch": 1.224208774539403, "grad_norm": 1.5179363489151, "learning_rate": 1.6783612716573833e-05, "loss": 0.0764, "step": 83125 }, { "epoch": 1.22457695762949, "grad_norm": 1.1158641576766968, "learning_rate": 1.6781976344606848e-05, "loss": 0.0755, "step": 83150 }, { "epoch": 1.224945140719577, "grad_norm": 1.7928496599197388, "learning_rate": 1.6780339972639862e-05, "loss": 0.0787, "step": 83175 }, { "epoch": 1.225313323809664, "grad_norm": 1.5240285396575928, "learning_rate": 1.6778703600672877e-05, "loss": 0.0719, "step": 83200 }, { "epoch": 1.225681506899751, "grad_norm": 1.5038111209869385, "learning_rate": 1.677706722870589e-05, "loss": 0.0817, "step": 83225 }, { "epoch": 1.226049689989838, "grad_norm": 1.4744718074798584, "learning_rate": 1.677543085673891e-05, "loss": 0.0789, "step": 83250 }, { "epoch": 1.2264178730799251, "grad_norm": 1.0492841005325317, "learning_rate": 1.6773794484771923e-05, "loss": 0.0735, "step": 83275 }, { "epoch": 1.2267860561700121, "grad_norm": 1.5979945659637451, "learning_rate": 1.6772158112804938e-05, "loss": 0.079, "step": 83300 }, { "epoch": 1.2271542392600994, "grad_norm": 1.1743595600128174, "learning_rate": 1.6770521740837956e-05, "loss": 0.0766, "step": 83325 }, { "epoch": 1.2275224223501864, "grad_norm": 1.5478302240371704, "learning_rate": 1.676888536887097e-05, "loss": 0.0719, "step": 83350 }, { "epoch": 1.2278906054402734, "grad_norm": 1.2269262075424194, "learning_rate": 1.6767248996903988e-05, "loss": 0.075, "step": 83375 }, { "epoch": 1.2282587885303604, "grad_norm": 1.1477184295654297, "learning_rate": 1.6765612624937002e-05, "loss": 0.0697, "step": 83400 }, { "epoch": 1.2286269716204474, "grad_norm": 1.6375607252120972, "learning_rate": 1.6763976252970017e-05, "loss": 0.0815, "step": 83425 }, { "epoch": 1.2289951547105344, "grad_norm": 1.5812549591064453, "learning_rate": 1.676233988100303e-05, "loss": 0.0805, "step": 83450 }, { "epoch": 1.2293633378006215, "grad_norm": 1.5101075172424316, "learning_rate": 1.6760703509036046e-05, "loss": 0.0734, "step": 83475 }, { "epoch": 1.2297315208907085, "grad_norm": 1.4215960502624512, "learning_rate": 1.6759067137069064e-05, "loss": 0.0803, "step": 83500 }, { "epoch": 1.2300997039807955, "grad_norm": 1.4927319288253784, "learning_rate": 1.6757430765102078e-05, "loss": 0.0705, "step": 83525 }, { "epoch": 1.2304678870708825, "grad_norm": 1.6058988571166992, "learning_rate": 1.6755794393135092e-05, "loss": 0.0798, "step": 83550 }, { "epoch": 1.2308360701609697, "grad_norm": 1.5166833400726318, "learning_rate": 1.675415802116811e-05, "loss": 0.0788, "step": 83575 }, { "epoch": 1.2312042532510568, "grad_norm": 0.8335561752319336, "learning_rate": 1.6752521649201125e-05, "loss": 0.0774, "step": 83600 }, { "epoch": 1.2315724363411438, "grad_norm": 1.466392159461975, "learning_rate": 1.675088527723414e-05, "loss": 0.0749, "step": 83625 }, { "epoch": 1.2319406194312308, "grad_norm": 1.4224926233291626, "learning_rate": 1.6749248905267154e-05, "loss": 0.0791, "step": 83650 }, { "epoch": 1.2323088025213178, "grad_norm": 1.4799292087554932, "learning_rate": 1.674761253330017e-05, "loss": 0.0789, "step": 83675 }, { "epoch": 1.2326769856114048, "grad_norm": 0.98297119140625, "learning_rate": 1.6745976161333186e-05, "loss": 0.0749, "step": 83700 }, { "epoch": 1.2330451687014918, "grad_norm": 1.6584978103637695, "learning_rate": 1.67443397893662e-05, "loss": 0.0703, "step": 83725 }, { "epoch": 1.2334133517915788, "grad_norm": 1.3904367685317993, "learning_rate": 1.6742703417399218e-05, "loss": 0.0796, "step": 83750 }, { "epoch": 1.2337815348816659, "grad_norm": 1.6995155811309814, "learning_rate": 1.6741067045432233e-05, "loss": 0.0689, "step": 83775 }, { "epoch": 1.234149717971753, "grad_norm": 0.8751583695411682, "learning_rate": 1.673943067346525e-05, "loss": 0.0747, "step": 83800 }, { "epoch": 1.2345179010618401, "grad_norm": 1.6611430644989014, "learning_rate": 1.673785975637694e-05, "loss": 0.073, "step": 83825 }, { "epoch": 1.2348860841519271, "grad_norm": 1.1409391164779663, "learning_rate": 1.673622338440996e-05, "loss": 0.0754, "step": 83850 }, { "epoch": 1.2352542672420141, "grad_norm": 1.663902997970581, "learning_rate": 1.6734587012442973e-05, "loss": 0.0827, "step": 83875 }, { "epoch": 1.2356224503321012, "grad_norm": 1.592111349105835, "learning_rate": 1.6732950640475988e-05, "loss": 0.0795, "step": 83900 }, { "epoch": 1.2359906334221882, "grad_norm": 1.6744433641433716, "learning_rate": 1.6731314268509006e-05, "loss": 0.0761, "step": 83925 }, { "epoch": 1.2363588165122752, "grad_norm": 1.1632553339004517, "learning_rate": 1.672967789654202e-05, "loss": 0.0768, "step": 83950 }, { "epoch": 1.2367269996023622, "grad_norm": 1.0109707117080688, "learning_rate": 1.6728041524575035e-05, "loss": 0.0764, "step": 83975 }, { "epoch": 1.2370951826924492, "grad_norm": 1.5003042221069336, "learning_rate": 1.6726405152608052e-05, "loss": 0.0745, "step": 84000 }, { "epoch": 1.2374633657825362, "grad_norm": 1.63449227809906, "learning_rate": 1.6724768780641067e-05, "loss": 0.075, "step": 84025 }, { "epoch": 1.2378315488726233, "grad_norm": 1.4061245918273926, "learning_rate": 1.672313240867408e-05, "loss": 0.0745, "step": 84050 }, { "epoch": 1.2381997319627105, "grad_norm": 1.9697695970535278, "learning_rate": 1.6721496036707096e-05, "loss": 0.0803, "step": 84075 }, { "epoch": 1.2385679150527975, "grad_norm": 1.6412981748580933, "learning_rate": 1.6719859664740114e-05, "loss": 0.0777, "step": 84100 }, { "epoch": 1.2389360981428845, "grad_norm": 1.7949827909469604, "learning_rate": 1.6718223292773128e-05, "loss": 0.0775, "step": 84125 }, { "epoch": 1.2393042812329715, "grad_norm": 1.202728509902954, "learning_rate": 1.6716586920806143e-05, "loss": 0.0766, "step": 84150 }, { "epoch": 1.2396724643230586, "grad_norm": 1.2260409593582153, "learning_rate": 1.671495054883916e-05, "loss": 0.0745, "step": 84175 }, { "epoch": 1.2400406474131456, "grad_norm": 1.3268578052520752, "learning_rate": 1.6713314176872175e-05, "loss": 0.0744, "step": 84200 }, { "epoch": 1.2404088305032326, "grad_norm": 1.253363847732544, "learning_rate": 1.671167780490519e-05, "loss": 0.0689, "step": 84225 }, { "epoch": 1.2407770135933196, "grad_norm": 1.3394206762313843, "learning_rate": 1.6710041432938204e-05, "loss": 0.0818, "step": 84250 }, { "epoch": 1.2411451966834068, "grad_norm": 1.2930598258972168, "learning_rate": 1.670840506097122e-05, "loss": 0.0715, "step": 84275 }, { "epoch": 1.2415133797734939, "grad_norm": 1.5809293985366821, "learning_rate": 1.6706768689004236e-05, "loss": 0.0721, "step": 84300 }, { "epoch": 1.2418815628635809, "grad_norm": 1.3983064889907837, "learning_rate": 1.670513231703725e-05, "loss": 0.0745, "step": 84325 }, { "epoch": 1.2422497459536679, "grad_norm": 1.2547566890716553, "learning_rate": 1.670349594507027e-05, "loss": 0.0841, "step": 84350 }, { "epoch": 1.242617929043755, "grad_norm": 1.2029368877410889, "learning_rate": 1.6701859573103283e-05, "loss": 0.0753, "step": 84375 }, { "epoch": 1.242986112133842, "grad_norm": 1.7967641353607178, "learning_rate": 1.6700223201136297e-05, "loss": 0.0829, "step": 84400 }, { "epoch": 1.243354295223929, "grad_norm": 1.3531262874603271, "learning_rate": 1.6698586829169315e-05, "loss": 0.079, "step": 84425 }, { "epoch": 1.243722478314016, "grad_norm": 1.3004512786865234, "learning_rate": 1.669695045720233e-05, "loss": 0.0766, "step": 84450 }, { "epoch": 1.244090661404103, "grad_norm": 1.967198371887207, "learning_rate": 1.6695314085235344e-05, "loss": 0.0836, "step": 84475 }, { "epoch": 1.24445884449419, "grad_norm": 0.9700692296028137, "learning_rate": 1.669367771326836e-05, "loss": 0.0746, "step": 84500 }, { "epoch": 1.244827027584277, "grad_norm": 1.2876149415969849, "learning_rate": 1.6692041341301376e-05, "loss": 0.079, "step": 84525 }, { "epoch": 1.2451952106743642, "grad_norm": 1.5020794868469238, "learning_rate": 1.669040496933439e-05, "loss": 0.0699, "step": 84550 }, { "epoch": 1.2455633937644512, "grad_norm": 1.4075846672058105, "learning_rate": 1.6688768597367405e-05, "loss": 0.0793, "step": 84575 }, { "epoch": 1.2459315768545383, "grad_norm": 1.3767808675765991, "learning_rate": 1.6687132225400423e-05, "loss": 0.0792, "step": 84600 }, { "epoch": 1.2462997599446253, "grad_norm": 0.9941942691802979, "learning_rate": 1.6685495853433438e-05, "loss": 0.073, "step": 84625 }, { "epoch": 1.2466679430347123, "grad_norm": 1.1486207246780396, "learning_rate": 1.6683859481466452e-05, "loss": 0.076, "step": 84650 }, { "epoch": 1.2470361261247993, "grad_norm": 1.2689430713653564, "learning_rate": 1.6682223109499467e-05, "loss": 0.0777, "step": 84675 }, { "epoch": 1.2474043092148863, "grad_norm": 1.8343433141708374, "learning_rate": 1.668058673753248e-05, "loss": 0.0913, "step": 84700 }, { "epoch": 1.2477724923049733, "grad_norm": 1.5430928468704224, "learning_rate": 1.66789503655655e-05, "loss": 0.0836, "step": 84725 }, { "epoch": 1.2481406753950606, "grad_norm": 0.7241131663322449, "learning_rate": 1.6677313993598513e-05, "loss": 0.0722, "step": 84750 }, { "epoch": 1.2485088584851476, "grad_norm": 1.7381606101989746, "learning_rate": 1.667567762163153e-05, "loss": 0.0818, "step": 84775 }, { "epoch": 1.2488770415752346, "grad_norm": 1.4786512851715088, "learning_rate": 1.6674041249664546e-05, "loss": 0.071, "step": 84800 }, { "epoch": 1.2492452246653216, "grad_norm": 1.4238206148147583, "learning_rate": 1.667240487769756e-05, "loss": 0.0771, "step": 84825 }, { "epoch": 1.2496134077554086, "grad_norm": 1.4153484106063843, "learning_rate": 1.6670768505730578e-05, "loss": 0.0746, "step": 84850 }, { "epoch": 1.2499815908454956, "grad_norm": 1.505297064781189, "learning_rate": 1.6669132133763592e-05, "loss": 0.0743, "step": 84875 }, { "epoch": 1.2503497739355827, "grad_norm": 1.3245985507965088, "learning_rate": 1.6667495761796607e-05, "loss": 0.0695, "step": 84900 }, { "epoch": 1.2507179570256697, "grad_norm": 1.737091064453125, "learning_rate": 1.666585938982962e-05, "loss": 0.0752, "step": 84925 }, { "epoch": 1.2510861401157567, "grad_norm": 1.7985749244689941, "learning_rate": 1.6664223017862636e-05, "loss": 0.0742, "step": 84950 }, { "epoch": 1.2514543232058437, "grad_norm": 0.9943830966949463, "learning_rate": 1.6662586645895654e-05, "loss": 0.0744, "step": 84975 }, { "epoch": 1.2518225062959307, "grad_norm": 1.1452761888504028, "learning_rate": 1.6660950273928668e-05, "loss": 0.0845, "step": 85000 }, { "epoch": 1.2521906893860177, "grad_norm": 1.0950809717178345, "learning_rate": 1.6659313901961686e-05, "loss": 0.0722, "step": 85025 }, { "epoch": 1.252558872476105, "grad_norm": 1.2069015502929688, "learning_rate": 1.66576775299947e-05, "loss": 0.0816, "step": 85050 }, { "epoch": 1.252927055566192, "grad_norm": 1.7069686651229858, "learning_rate": 1.6656041158027715e-05, "loss": 0.0764, "step": 85075 }, { "epoch": 1.253295238656279, "grad_norm": 1.6437897682189941, "learning_rate": 1.665440478606073e-05, "loss": 0.0744, "step": 85100 }, { "epoch": 1.253663421746366, "grad_norm": 1.4827040433883667, "learning_rate": 1.6652768414093744e-05, "loss": 0.0774, "step": 85125 }, { "epoch": 1.254031604836453, "grad_norm": 1.4733189344406128, "learning_rate": 1.665113204212676e-05, "loss": 0.08, "step": 85150 }, { "epoch": 1.25439978792654, "grad_norm": 2.1713707447052, "learning_rate": 1.6649495670159776e-05, "loss": 0.0747, "step": 85175 }, { "epoch": 1.254767971016627, "grad_norm": 1.3723907470703125, "learning_rate": 1.6647859298192794e-05, "loss": 0.0778, "step": 85200 }, { "epoch": 1.2551361541067143, "grad_norm": 1.3837295770645142, "learning_rate": 1.6646222926225808e-05, "loss": 0.0764, "step": 85225 }, { "epoch": 1.2555043371968013, "grad_norm": 1.6274299621582031, "learning_rate": 1.6644586554258823e-05, "loss": 0.0764, "step": 85250 }, { "epoch": 1.2558725202868883, "grad_norm": 1.6906304359436035, "learning_rate": 1.664295018229184e-05, "loss": 0.0783, "step": 85275 }, { "epoch": 1.2562407033769754, "grad_norm": 1.3737411499023438, "learning_rate": 1.664131381032485e-05, "loss": 0.0753, "step": 85300 }, { "epoch": 1.2566088864670624, "grad_norm": 1.203748106956482, "learning_rate": 1.663967743835787e-05, "loss": 0.0721, "step": 85325 }, { "epoch": 1.2569770695571494, "grad_norm": 1.1847542524337769, "learning_rate": 1.6638041066390884e-05, "loss": 0.0771, "step": 85350 }, { "epoch": 1.2573452526472364, "grad_norm": 1.7089818716049194, "learning_rate": 1.66364046944239e-05, "loss": 0.0826, "step": 85375 }, { "epoch": 1.2577134357373234, "grad_norm": 1.32912278175354, "learning_rate": 1.6634768322456916e-05, "loss": 0.0795, "step": 85400 }, { "epoch": 1.2580816188274104, "grad_norm": 1.3430333137512207, "learning_rate": 1.663313195048993e-05, "loss": 0.0755, "step": 85425 }, { "epoch": 1.2584498019174974, "grad_norm": 1.7742334604263306, "learning_rate": 1.663149557852295e-05, "loss": 0.0813, "step": 85450 }, { "epoch": 1.2588179850075845, "grad_norm": 1.0110199451446533, "learning_rate": 1.6629859206555963e-05, "loss": 0.0759, "step": 85475 }, { "epoch": 1.2591861680976715, "grad_norm": 1.4277589321136475, "learning_rate": 1.6628222834588978e-05, "loss": 0.0724, "step": 85500 }, { "epoch": 1.2595543511877587, "grad_norm": 1.7643581628799438, "learning_rate": 1.6626586462621992e-05, "loss": 0.0808, "step": 85525 }, { "epoch": 1.2599225342778457, "grad_norm": 1.358424425125122, "learning_rate": 1.6624950090655006e-05, "loss": 0.0746, "step": 85550 }, { "epoch": 1.2602907173679327, "grad_norm": 1.502968192100525, "learning_rate": 1.6623313718688024e-05, "loss": 0.0787, "step": 85575 }, { "epoch": 1.2606589004580198, "grad_norm": 1.6026368141174316, "learning_rate": 1.662167734672104e-05, "loss": 0.0753, "step": 85600 }, { "epoch": 1.2610270835481068, "grad_norm": 1.2125557661056519, "learning_rate": 1.6620040974754053e-05, "loss": 0.0786, "step": 85625 }, { "epoch": 1.2613952666381938, "grad_norm": 1.624697208404541, "learning_rate": 1.661840460278707e-05, "loss": 0.0813, "step": 85650 }, { "epoch": 1.2617634497282808, "grad_norm": 1.8684204816818237, "learning_rate": 1.6616768230820085e-05, "loss": 0.0823, "step": 85675 }, { "epoch": 1.262131632818368, "grad_norm": 1.1944184303283691, "learning_rate": 1.6615131858853103e-05, "loss": 0.0773, "step": 85700 }, { "epoch": 1.262499815908455, "grad_norm": 1.3293205499649048, "learning_rate": 1.6613495486886114e-05, "loss": 0.0754, "step": 85725 }, { "epoch": 1.262867998998542, "grad_norm": 1.7817025184631348, "learning_rate": 1.6611859114919132e-05, "loss": 0.0836, "step": 85750 }, { "epoch": 1.263236182088629, "grad_norm": 1.3254010677337646, "learning_rate": 1.6610222742952147e-05, "loss": 0.0792, "step": 85775 }, { "epoch": 1.263604365178716, "grad_norm": 1.5083951950073242, "learning_rate": 1.660858637098516e-05, "loss": 0.0799, "step": 85800 }, { "epoch": 1.2639725482688031, "grad_norm": 1.5599150657653809, "learning_rate": 1.660694999901818e-05, "loss": 0.0726, "step": 85825 }, { "epoch": 1.2643407313588901, "grad_norm": 1.8269362449645996, "learning_rate": 1.6605313627051193e-05, "loss": 0.0835, "step": 85850 }, { "epoch": 1.2647089144489772, "grad_norm": 1.7932054996490479, "learning_rate": 1.6603677255084208e-05, "loss": 0.0668, "step": 85875 }, { "epoch": 1.2650770975390642, "grad_norm": 1.5463165044784546, "learning_rate": 1.6602040883117226e-05, "loss": 0.0743, "step": 85900 }, { "epoch": 1.2654452806291512, "grad_norm": 1.6358023881912231, "learning_rate": 1.660040451115024e-05, "loss": 0.0823, "step": 85925 }, { "epoch": 1.2658134637192382, "grad_norm": 1.7893600463867188, "learning_rate": 1.6598768139183255e-05, "loss": 0.076, "step": 85950 }, { "epoch": 1.2661816468093252, "grad_norm": 1.8993571996688843, "learning_rate": 1.659713176721627e-05, "loss": 0.082, "step": 85975 }, { "epoch": 1.2665498298994124, "grad_norm": 1.109728455543518, "learning_rate": 1.6595495395249287e-05, "loss": 0.0789, "step": 86000 }, { "epoch": 1.2669180129894995, "grad_norm": 1.1833776235580444, "learning_rate": 1.65938590232823e-05, "loss": 0.0756, "step": 86025 }, { "epoch": 1.2672861960795865, "grad_norm": 1.4657959938049316, "learning_rate": 1.6592222651315316e-05, "loss": 0.0772, "step": 86050 }, { "epoch": 1.2676543791696735, "grad_norm": 1.0066750049591064, "learning_rate": 1.6590586279348334e-05, "loss": 0.0813, "step": 86075 }, { "epoch": 1.2680225622597605, "grad_norm": 1.5638108253479004, "learning_rate": 1.6588949907381348e-05, "loss": 0.0755, "step": 86100 }, { "epoch": 1.2683907453498475, "grad_norm": 1.938494324684143, "learning_rate": 1.6587313535414366e-05, "loss": 0.0768, "step": 86125 }, { "epoch": 1.2687589284399345, "grad_norm": 1.5280470848083496, "learning_rate": 1.6585677163447377e-05, "loss": 0.0831, "step": 86150 }, { "epoch": 1.2691271115300218, "grad_norm": 1.5612305402755737, "learning_rate": 1.6584040791480395e-05, "loss": 0.085, "step": 86175 }, { "epoch": 1.2694952946201088, "grad_norm": 1.2244549989700317, "learning_rate": 1.658240441951341e-05, "loss": 0.0804, "step": 86200 }, { "epoch": 1.2698634777101958, "grad_norm": 1.543442726135254, "learning_rate": 1.6580768047546424e-05, "loss": 0.0783, "step": 86225 }, { "epoch": 1.2702316608002828, "grad_norm": 0.8403768539428711, "learning_rate": 1.6579131675579442e-05, "loss": 0.0834, "step": 86250 }, { "epoch": 1.2705998438903698, "grad_norm": 1.3470231294631958, "learning_rate": 1.6577495303612456e-05, "loss": 0.0807, "step": 86275 }, { "epoch": 1.2709680269804569, "grad_norm": 1.3456494808197021, "learning_rate": 1.657585893164547e-05, "loss": 0.0715, "step": 86300 }, { "epoch": 1.2713362100705439, "grad_norm": 1.604638934135437, "learning_rate": 1.657422255967849e-05, "loss": 0.0746, "step": 86325 }, { "epoch": 1.2717043931606309, "grad_norm": 1.5782653093338013, "learning_rate": 1.6572586187711503e-05, "loss": 0.0749, "step": 86350 }, { "epoch": 1.272072576250718, "grad_norm": 1.2369173765182495, "learning_rate": 1.6570949815744517e-05, "loss": 0.0734, "step": 86375 }, { "epoch": 1.272440759340805, "grad_norm": 1.6135135889053345, "learning_rate": 1.6569313443777532e-05, "loss": 0.0769, "step": 86400 }, { "epoch": 1.272808942430892, "grad_norm": 1.2767653465270996, "learning_rate": 1.656767707181055e-05, "loss": 0.0781, "step": 86425 }, { "epoch": 1.273177125520979, "grad_norm": 1.665022850036621, "learning_rate": 1.6566040699843564e-05, "loss": 0.0756, "step": 86450 }, { "epoch": 1.2735453086110662, "grad_norm": 1.2414312362670898, "learning_rate": 1.6564469782755258e-05, "loss": 0.0702, "step": 86475 }, { "epoch": 1.2739134917011532, "grad_norm": 1.4579577445983887, "learning_rate": 1.6562833410788276e-05, "loss": 0.0786, "step": 86500 }, { "epoch": 1.2742816747912402, "grad_norm": 1.1833775043487549, "learning_rate": 1.656119703882129e-05, "loss": 0.0758, "step": 86525 }, { "epoch": 1.2746498578813272, "grad_norm": 1.3675172328948975, "learning_rate": 1.6559560666854305e-05, "loss": 0.068, "step": 86550 }, { "epoch": 1.2750180409714142, "grad_norm": 1.6105674505233765, "learning_rate": 1.655792429488732e-05, "loss": 0.0749, "step": 86575 }, { "epoch": 1.2753862240615013, "grad_norm": 1.4817944765090942, "learning_rate": 1.6556287922920337e-05, "loss": 0.0767, "step": 86600 }, { "epoch": 1.2757544071515883, "grad_norm": 1.2722810506820679, "learning_rate": 1.655465155095335e-05, "loss": 0.0788, "step": 86625 }, { "epoch": 1.2761225902416755, "grad_norm": 1.7131518125534058, "learning_rate": 1.6553015178986366e-05, "loss": 0.0802, "step": 86650 }, { "epoch": 1.2764907733317625, "grad_norm": 0.9292106032371521, "learning_rate": 1.6551378807019384e-05, "loss": 0.0774, "step": 86675 }, { "epoch": 1.2768589564218495, "grad_norm": 1.4344921112060547, "learning_rate": 1.65497424350524e-05, "loss": 0.0754, "step": 86700 }, { "epoch": 1.2772271395119366, "grad_norm": 0.9840370416641235, "learning_rate": 1.6548106063085413e-05, "loss": 0.0745, "step": 86725 }, { "epoch": 1.2775953226020236, "grad_norm": 1.4911800622940063, "learning_rate": 1.654646969111843e-05, "loss": 0.0777, "step": 86750 }, { "epoch": 1.2779635056921106, "grad_norm": 1.4150290489196777, "learning_rate": 1.6544833319151445e-05, "loss": 0.077, "step": 86775 }, { "epoch": 1.2783316887821976, "grad_norm": 1.4077993631362915, "learning_rate": 1.654319694718446e-05, "loss": 0.0745, "step": 86800 }, { "epoch": 1.2786998718722846, "grad_norm": 1.453959345817566, "learning_rate": 1.6541560575217474e-05, "loss": 0.0801, "step": 86825 }, { "epoch": 1.2790680549623716, "grad_norm": 1.2002153396606445, "learning_rate": 1.6539924203250492e-05, "loss": 0.0646, "step": 86850 }, { "epoch": 1.2794362380524587, "grad_norm": 1.3084107637405396, "learning_rate": 1.6538287831283506e-05, "loss": 0.0697, "step": 86875 }, { "epoch": 1.2798044211425457, "grad_norm": 1.1693037748336792, "learning_rate": 1.653665145931652e-05, "loss": 0.0817, "step": 86900 }, { "epoch": 1.2801726042326327, "grad_norm": 1.7182427644729614, "learning_rate": 1.653501508734954e-05, "loss": 0.0742, "step": 86925 }, { "epoch": 1.28054078732272, "grad_norm": 1.6868401765823364, "learning_rate": 1.6533378715382553e-05, "loss": 0.0786, "step": 86950 }, { "epoch": 1.280908970412807, "grad_norm": 1.5034259557724, "learning_rate": 1.6531742343415568e-05, "loss": 0.0782, "step": 86975 }, { "epoch": 1.281277153502894, "grad_norm": 1.451076626777649, "learning_rate": 1.6530105971448582e-05, "loss": 0.0834, "step": 87000 }, { "epoch": 1.281645336592981, "grad_norm": 1.09324049949646, "learning_rate": 1.6528469599481596e-05, "loss": 0.0761, "step": 87025 }, { "epoch": 1.282013519683068, "grad_norm": 1.338484287261963, "learning_rate": 1.6526833227514614e-05, "loss": 0.0745, "step": 87050 }, { "epoch": 1.282381702773155, "grad_norm": 1.4144775867462158, "learning_rate": 1.652519685554763e-05, "loss": 0.0789, "step": 87075 }, { "epoch": 1.282749885863242, "grad_norm": 2.0029828548431396, "learning_rate": 1.6523560483580647e-05, "loss": 0.0768, "step": 87100 }, { "epoch": 1.2831180689533292, "grad_norm": 1.4517130851745605, "learning_rate": 1.652192411161366e-05, "loss": 0.0833, "step": 87125 }, { "epoch": 1.2834862520434163, "grad_norm": 1.5844374895095825, "learning_rate": 1.6520287739646675e-05, "loss": 0.0791, "step": 87150 }, { "epoch": 1.2838544351335033, "grad_norm": 0.8651666641235352, "learning_rate": 1.6518651367679693e-05, "loss": 0.0762, "step": 87175 }, { "epoch": 1.2842226182235903, "grad_norm": 0.9691341519355774, "learning_rate": 1.6517014995712704e-05, "loss": 0.0829, "step": 87200 }, { "epoch": 1.2845908013136773, "grad_norm": 1.4265964031219482, "learning_rate": 1.6515378623745722e-05, "loss": 0.0762, "step": 87225 }, { "epoch": 1.2849589844037643, "grad_norm": 1.5003539323806763, "learning_rate": 1.6513742251778737e-05, "loss": 0.0776, "step": 87250 }, { "epoch": 1.2853271674938513, "grad_norm": 1.250416874885559, "learning_rate": 1.651210587981175e-05, "loss": 0.0765, "step": 87275 }, { "epoch": 1.2856953505839384, "grad_norm": 1.2348967790603638, "learning_rate": 1.651046950784477e-05, "loss": 0.0771, "step": 87300 }, { "epoch": 1.2860635336740254, "grad_norm": 1.6736505031585693, "learning_rate": 1.6508833135877783e-05, "loss": 0.0869, "step": 87325 }, { "epoch": 1.2864317167641124, "grad_norm": 1.2044596672058105, "learning_rate": 1.65071967639108e-05, "loss": 0.0735, "step": 87350 }, { "epoch": 1.2867998998541994, "grad_norm": 1.6578843593597412, "learning_rate": 1.6505560391943816e-05, "loss": 0.0821, "step": 87375 }, { "epoch": 1.2871680829442864, "grad_norm": 1.4457515478134155, "learning_rate": 1.650392401997683e-05, "loss": 0.0755, "step": 87400 }, { "epoch": 1.2875362660343737, "grad_norm": 1.2665610313415527, "learning_rate": 1.6502287648009845e-05, "loss": 0.0786, "step": 87425 }, { "epoch": 1.2879044491244607, "grad_norm": 1.1844627857208252, "learning_rate": 1.650065127604286e-05, "loss": 0.0763, "step": 87450 }, { "epoch": 1.2882726322145477, "grad_norm": 1.7764798402786255, "learning_rate": 1.6499014904075877e-05, "loss": 0.0793, "step": 87475 }, { "epoch": 1.2886408153046347, "grad_norm": 1.5905070304870605, "learning_rate": 1.649737853210889e-05, "loss": 0.0742, "step": 87500 }, { "epoch": 1.2890089983947217, "grad_norm": 1.3501182794570923, "learning_rate": 1.649574216014191e-05, "loss": 0.0752, "step": 87525 }, { "epoch": 1.2893771814848087, "grad_norm": 1.1950966119766235, "learning_rate": 1.6494105788174924e-05, "loss": 0.0703, "step": 87550 }, { "epoch": 1.2897453645748957, "grad_norm": 1.5274617671966553, "learning_rate": 1.6492469416207938e-05, "loss": 0.0786, "step": 87575 }, { "epoch": 1.2901135476649828, "grad_norm": 1.3678492307662964, "learning_rate": 1.6490833044240956e-05, "loss": 0.0773, "step": 87600 }, { "epoch": 1.29048173075507, "grad_norm": 1.5120971202850342, "learning_rate": 1.6489196672273967e-05, "loss": 0.0864, "step": 87625 }, { "epoch": 1.290849913845157, "grad_norm": 0.7234932780265808, "learning_rate": 1.6487560300306985e-05, "loss": 0.0782, "step": 87650 }, { "epoch": 1.291218096935244, "grad_norm": 1.4716856479644775, "learning_rate": 1.648592392834e-05, "loss": 0.089, "step": 87675 }, { "epoch": 1.291586280025331, "grad_norm": 1.5008195638656616, "learning_rate": 1.6484287556373014e-05, "loss": 0.0723, "step": 87700 }, { "epoch": 1.291954463115418, "grad_norm": 1.3393083810806274, "learning_rate": 1.6482651184406032e-05, "loss": 0.0762, "step": 87725 }, { "epoch": 1.292322646205505, "grad_norm": 0.8557813167572021, "learning_rate": 1.6481014812439046e-05, "loss": 0.0783, "step": 87750 }, { "epoch": 1.292690829295592, "grad_norm": 1.3396013975143433, "learning_rate": 1.6479378440472064e-05, "loss": 0.0793, "step": 87775 }, { "epoch": 1.293059012385679, "grad_norm": 1.4153445959091187, "learning_rate": 1.647774206850508e-05, "loss": 0.078, "step": 87800 }, { "epoch": 1.2934271954757661, "grad_norm": 1.2082234621047974, "learning_rate": 1.6476105696538093e-05, "loss": 0.0772, "step": 87825 }, { "epoch": 1.2937953785658531, "grad_norm": 1.2681392431259155, "learning_rate": 1.6474469324571107e-05, "loss": 0.0861, "step": 87850 }, { "epoch": 1.2941635616559402, "grad_norm": 1.1526434421539307, "learning_rate": 1.6472832952604122e-05, "loss": 0.0637, "step": 87875 }, { "epoch": 1.2945317447460272, "grad_norm": 1.3793410062789917, "learning_rate": 1.647119658063714e-05, "loss": 0.0804, "step": 87900 }, { "epoch": 1.2948999278361144, "grad_norm": 1.4132964611053467, "learning_rate": 1.6469560208670154e-05, "loss": 0.0758, "step": 87925 }, { "epoch": 1.2952681109262014, "grad_norm": 1.4679826498031616, "learning_rate": 1.646792383670317e-05, "loss": 0.0768, "step": 87950 }, { "epoch": 1.2956362940162884, "grad_norm": 1.176544189453125, "learning_rate": 1.6466287464736186e-05, "loss": 0.0775, "step": 87975 }, { "epoch": 1.2960044771063755, "grad_norm": 1.8251954317092896, "learning_rate": 1.64646510927692e-05, "loss": 0.069, "step": 88000 }, { "epoch": 1.2963726601964625, "grad_norm": 1.5406668186187744, "learning_rate": 1.646301472080222e-05, "loss": 0.0695, "step": 88025 }, { "epoch": 1.2967408432865495, "grad_norm": 1.1207826137542725, "learning_rate": 1.646137834883523e-05, "loss": 0.0774, "step": 88050 }, { "epoch": 1.2971090263766365, "grad_norm": 1.6414769887924194, "learning_rate": 1.6459741976868248e-05, "loss": 0.0873, "step": 88075 }, { "epoch": 1.2974772094667237, "grad_norm": 1.776997685432434, "learning_rate": 1.6458105604901262e-05, "loss": 0.0722, "step": 88100 }, { "epoch": 1.2978453925568108, "grad_norm": 1.0320223569869995, "learning_rate": 1.6456469232934277e-05, "loss": 0.0775, "step": 88125 }, { "epoch": 1.2982135756468978, "grad_norm": 1.1254173517227173, "learning_rate": 1.6454832860967294e-05, "loss": 0.0783, "step": 88150 }, { "epoch": 1.2985817587369848, "grad_norm": 1.4797089099884033, "learning_rate": 1.645319648900031e-05, "loss": 0.074, "step": 88175 }, { "epoch": 1.2989499418270718, "grad_norm": 1.3233765363693237, "learning_rate": 1.6451560117033323e-05, "loss": 0.079, "step": 88200 }, { "epoch": 1.2993181249171588, "grad_norm": 1.6980239152908325, "learning_rate": 1.644992374506634e-05, "loss": 0.0821, "step": 88225 }, { "epoch": 1.2996863080072458, "grad_norm": 1.635428547859192, "learning_rate": 1.6448287373099356e-05, "loss": 0.0716, "step": 88250 }, { "epoch": 1.3000544910973328, "grad_norm": 0.9146542549133301, "learning_rate": 1.644665100113237e-05, "loss": 0.0787, "step": 88275 }, { "epoch": 1.3004226741874199, "grad_norm": 1.1425437927246094, "learning_rate": 1.6445014629165385e-05, "loss": 0.0803, "step": 88300 }, { "epoch": 1.3007908572775069, "grad_norm": 1.1494662761688232, "learning_rate": 1.6443378257198402e-05, "loss": 0.0714, "step": 88325 }, { "epoch": 1.3011590403675939, "grad_norm": 1.2007160186767578, "learning_rate": 1.6441741885231417e-05, "loss": 0.072, "step": 88350 }, { "epoch": 1.301527223457681, "grad_norm": 1.711274266242981, "learning_rate": 1.644010551326443e-05, "loss": 0.083, "step": 88375 }, { "epoch": 1.3018954065477681, "grad_norm": 1.4959429502487183, "learning_rate": 1.643846914129745e-05, "loss": 0.0794, "step": 88400 }, { "epoch": 1.3022635896378552, "grad_norm": 1.783243179321289, "learning_rate": 1.6436832769330464e-05, "loss": 0.0719, "step": 88425 }, { "epoch": 1.3026317727279422, "grad_norm": 1.0835564136505127, "learning_rate": 1.643519639736348e-05, "loss": 0.076, "step": 88450 }, { "epoch": 1.3029999558180292, "grad_norm": 1.370273470878601, "learning_rate": 1.6433560025396493e-05, "loss": 0.0679, "step": 88475 }, { "epoch": 1.3033681389081162, "grad_norm": 1.517225980758667, "learning_rate": 1.643192365342951e-05, "loss": 0.0791, "step": 88500 }, { "epoch": 1.3037363219982032, "grad_norm": 1.0269920825958252, "learning_rate": 1.6430287281462525e-05, "loss": 0.0733, "step": 88525 }, { "epoch": 1.3041045050882902, "grad_norm": 1.5639657974243164, "learning_rate": 1.642865090949554e-05, "loss": 0.079, "step": 88550 }, { "epoch": 1.3044726881783775, "grad_norm": 1.4998055696487427, "learning_rate": 1.6427014537528557e-05, "loss": 0.0742, "step": 88575 }, { "epoch": 1.3048408712684645, "grad_norm": 1.509335994720459, "learning_rate": 1.642537816556157e-05, "loss": 0.0777, "step": 88600 }, { "epoch": 1.3052090543585515, "grad_norm": 1.7063488960266113, "learning_rate": 1.6423741793594586e-05, "loss": 0.0712, "step": 88625 }, { "epoch": 1.3055772374486385, "grad_norm": 1.6909536123275757, "learning_rate": 1.6422105421627604e-05, "loss": 0.0822, "step": 88650 }, { "epoch": 1.3059454205387255, "grad_norm": 1.8001981973648071, "learning_rate": 1.642046904966062e-05, "loss": 0.0766, "step": 88675 }, { "epoch": 1.3063136036288125, "grad_norm": 1.5624608993530273, "learning_rate": 1.6418832677693633e-05, "loss": 0.0719, "step": 88700 }, { "epoch": 1.3066817867188996, "grad_norm": 1.2220088243484497, "learning_rate": 1.6417196305726647e-05, "loss": 0.0859, "step": 88725 }, { "epoch": 1.3070499698089866, "grad_norm": 1.4403575658798218, "learning_rate": 1.6415559933759665e-05, "loss": 0.083, "step": 88750 }, { "epoch": 1.3074181528990736, "grad_norm": 1.5407556295394897, "learning_rate": 1.641392356179268e-05, "loss": 0.0781, "step": 88775 }, { "epoch": 1.3077863359891606, "grad_norm": 1.4624788761138916, "learning_rate": 1.6412287189825694e-05, "loss": 0.0682, "step": 88800 }, { "epoch": 1.3081545190792476, "grad_norm": 1.3351502418518066, "learning_rate": 1.6410650817858712e-05, "loss": 0.0803, "step": 88825 }, { "epoch": 1.3085227021693346, "grad_norm": 1.2985202074050903, "learning_rate": 1.6409014445891726e-05, "loss": 0.0748, "step": 88850 }, { "epoch": 1.3088908852594219, "grad_norm": 1.2473987340927124, "learning_rate": 1.640737807392474e-05, "loss": 0.0784, "step": 88875 }, { "epoch": 1.309259068349509, "grad_norm": 1.087683916091919, "learning_rate": 1.6405741701957755e-05, "loss": 0.0746, "step": 88900 }, { "epoch": 1.309627251439596, "grad_norm": 1.5098973512649536, "learning_rate": 1.6404105329990773e-05, "loss": 0.0749, "step": 88925 }, { "epoch": 1.309995434529683, "grad_norm": 1.2901890277862549, "learning_rate": 1.6402534412902467e-05, "loss": 0.0818, "step": 88950 }, { "epoch": 1.31036361761977, "grad_norm": 1.1922364234924316, "learning_rate": 1.640089804093548e-05, "loss": 0.0744, "step": 88975 }, { "epoch": 1.310731800709857, "grad_norm": 1.3412866592407227, "learning_rate": 1.63992616689685e-05, "loss": 0.0691, "step": 89000 }, { "epoch": 1.311099983799944, "grad_norm": 1.465410828590393, "learning_rate": 1.6397625297001514e-05, "loss": 0.0823, "step": 89025 }, { "epoch": 1.3114681668900312, "grad_norm": 1.3778318166732788, "learning_rate": 1.6395988925034528e-05, "loss": 0.0771, "step": 89050 }, { "epoch": 1.3118363499801182, "grad_norm": 1.2208822965621948, "learning_rate": 1.6394352553067546e-05, "loss": 0.0789, "step": 89075 }, { "epoch": 1.3122045330702052, "grad_norm": 1.6019970178604126, "learning_rate": 1.6392716181100557e-05, "loss": 0.0782, "step": 89100 }, { "epoch": 1.3125727161602923, "grad_norm": 1.160643458366394, "learning_rate": 1.6391079809133575e-05, "loss": 0.0796, "step": 89125 }, { "epoch": 1.3129408992503793, "grad_norm": 1.4744532108306885, "learning_rate": 1.638944343716659e-05, "loss": 0.0728, "step": 89150 }, { "epoch": 1.3133090823404663, "grad_norm": 1.6466758251190186, "learning_rate": 1.6387807065199607e-05, "loss": 0.0705, "step": 89175 }, { "epoch": 1.3136772654305533, "grad_norm": 1.6497390270233154, "learning_rate": 1.6386170693232622e-05, "loss": 0.0828, "step": 89200 }, { "epoch": 1.3140454485206403, "grad_norm": 1.3599823713302612, "learning_rate": 1.6384534321265636e-05, "loss": 0.0781, "step": 89225 }, { "epoch": 1.3144136316107273, "grad_norm": 1.6341114044189453, "learning_rate": 1.6382897949298654e-05, "loss": 0.0708, "step": 89250 }, { "epoch": 1.3147818147008143, "grad_norm": 1.1234710216522217, "learning_rate": 1.638126157733167e-05, "loss": 0.0679, "step": 89275 }, { "epoch": 1.3151499977909014, "grad_norm": 1.6428710222244263, "learning_rate": 1.6379625205364683e-05, "loss": 0.0829, "step": 89300 }, { "epoch": 1.3155181808809884, "grad_norm": 0.9147220849990845, "learning_rate": 1.6377988833397697e-05, "loss": 0.0782, "step": 89325 }, { "epoch": 1.3158863639710756, "grad_norm": 1.658098578453064, "learning_rate": 1.6376352461430712e-05, "loss": 0.0709, "step": 89350 }, { "epoch": 1.3162545470611626, "grad_norm": 1.2410835027694702, "learning_rate": 1.637471608946373e-05, "loss": 0.0762, "step": 89375 }, { "epoch": 1.3166227301512496, "grad_norm": 1.0997791290283203, "learning_rate": 1.6373079717496744e-05, "loss": 0.0737, "step": 89400 }, { "epoch": 1.3169909132413367, "grad_norm": 1.3778128623962402, "learning_rate": 1.6371443345529762e-05, "loss": 0.0805, "step": 89425 }, { "epoch": 1.3173590963314237, "grad_norm": 1.4913908243179321, "learning_rate": 1.6369806973562776e-05, "loss": 0.0823, "step": 89450 }, { "epoch": 1.3177272794215107, "grad_norm": 1.7200385332107544, "learning_rate": 1.636817060159579e-05, "loss": 0.0852, "step": 89475 }, { "epoch": 1.3180954625115977, "grad_norm": 1.5541396141052246, "learning_rate": 1.636653422962881e-05, "loss": 0.0824, "step": 89500 }, { "epoch": 1.318463645601685, "grad_norm": 1.4841601848602295, "learning_rate": 1.636489785766182e-05, "loss": 0.0769, "step": 89525 }, { "epoch": 1.318831828691772, "grad_norm": 1.5176092386245728, "learning_rate": 1.6363261485694838e-05, "loss": 0.0707, "step": 89550 }, { "epoch": 1.319200011781859, "grad_norm": 1.509604573249817, "learning_rate": 1.6361625113727852e-05, "loss": 0.0783, "step": 89575 }, { "epoch": 1.319568194871946, "grad_norm": 1.4398568868637085, "learning_rate": 1.6359988741760867e-05, "loss": 0.0708, "step": 89600 }, { "epoch": 1.319936377962033, "grad_norm": 1.6234606504440308, "learning_rate": 1.6358352369793884e-05, "loss": 0.0736, "step": 89625 }, { "epoch": 1.32030456105212, "grad_norm": 1.5677322149276733, "learning_rate": 1.63567159978269e-05, "loss": 0.0757, "step": 89650 }, { "epoch": 1.320672744142207, "grad_norm": 1.727790117263794, "learning_rate": 1.6355079625859917e-05, "loss": 0.0689, "step": 89675 }, { "epoch": 1.321040927232294, "grad_norm": 1.2676259279251099, "learning_rate": 1.635344325389293e-05, "loss": 0.0735, "step": 89700 }, { "epoch": 1.321409110322381, "grad_norm": 1.7587922811508179, "learning_rate": 1.6351806881925946e-05, "loss": 0.0738, "step": 89725 }, { "epoch": 1.321777293412468, "grad_norm": 2.1331818103790283, "learning_rate": 1.635017050995896e-05, "loss": 0.0736, "step": 89750 }, { "epoch": 1.322145476502555, "grad_norm": 1.1653778553009033, "learning_rate": 1.6348534137991975e-05, "loss": 0.0753, "step": 89775 }, { "epoch": 1.322513659592642, "grad_norm": 1.4680769443511963, "learning_rate": 1.6346897766024992e-05, "loss": 0.0764, "step": 89800 }, { "epoch": 1.3228818426827293, "grad_norm": 1.3933496475219727, "learning_rate": 1.6345261394058007e-05, "loss": 0.0657, "step": 89825 }, { "epoch": 1.3232500257728164, "grad_norm": 1.3332090377807617, "learning_rate": 1.6343625022091025e-05, "loss": 0.0761, "step": 89850 }, { "epoch": 1.3236182088629034, "grad_norm": 0.8797059059143066, "learning_rate": 1.634198865012404e-05, "loss": 0.0643, "step": 89875 }, { "epoch": 1.3239863919529904, "grad_norm": 1.4298036098480225, "learning_rate": 1.6340352278157054e-05, "loss": 0.072, "step": 89900 }, { "epoch": 1.3243545750430774, "grad_norm": 1.0826364755630493, "learning_rate": 1.633871590619007e-05, "loss": 0.077, "step": 89925 }, { "epoch": 1.3247227581331644, "grad_norm": 1.3877052068710327, "learning_rate": 1.6337079534223083e-05, "loss": 0.074, "step": 89950 }, { "epoch": 1.3250909412232514, "grad_norm": 1.8560642004013062, "learning_rate": 1.63354431622561e-05, "loss": 0.0769, "step": 89975 }, { "epoch": 1.3254591243133385, "grad_norm": 1.0066782236099243, "learning_rate": 1.6333806790289115e-05, "loss": 0.0772, "step": 90000 }, { "epoch": 1.3258273074034257, "grad_norm": 1.5684863328933716, "learning_rate": 1.633217041832213e-05, "loss": 0.0712, "step": 90025 }, { "epoch": 1.3261954904935127, "grad_norm": 1.609997272491455, "learning_rate": 1.6330534046355147e-05, "loss": 0.0695, "step": 90050 }, { "epoch": 1.3265636735835997, "grad_norm": 1.3524547815322876, "learning_rate": 1.632889767438816e-05, "loss": 0.0913, "step": 90075 }, { "epoch": 1.3269318566736867, "grad_norm": 1.690234661102295, "learning_rate": 1.632726130242118e-05, "loss": 0.0831, "step": 90100 }, { "epoch": 1.3273000397637738, "grad_norm": 1.0316253900527954, "learning_rate": 1.6325624930454194e-05, "loss": 0.0693, "step": 90125 }, { "epoch": 1.3276682228538608, "grad_norm": 1.8638652563095093, "learning_rate": 1.632398855848721e-05, "loss": 0.0868, "step": 90150 }, { "epoch": 1.3280364059439478, "grad_norm": 1.6013383865356445, "learning_rate": 1.6322352186520223e-05, "loss": 0.081, "step": 90175 }, { "epoch": 1.3284045890340348, "grad_norm": 1.2709286212921143, "learning_rate": 1.6320715814553237e-05, "loss": 0.0777, "step": 90200 }, { "epoch": 1.3287727721241218, "grad_norm": 1.9263944625854492, "learning_rate": 1.6319079442586255e-05, "loss": 0.0773, "step": 90225 }, { "epoch": 1.3291409552142088, "grad_norm": 1.0371692180633545, "learning_rate": 1.631744307061927e-05, "loss": 0.077, "step": 90250 }, { "epoch": 1.3295091383042958, "grad_norm": 1.599334478378296, "learning_rate": 1.6315806698652284e-05, "loss": 0.0781, "step": 90275 }, { "epoch": 1.3298773213943829, "grad_norm": 1.292439579963684, "learning_rate": 1.6314170326685302e-05, "loss": 0.0724, "step": 90300 }, { "epoch": 1.33024550448447, "grad_norm": 1.5270636081695557, "learning_rate": 1.6312533954718316e-05, "loss": 0.0755, "step": 90325 }, { "epoch": 1.3306136875745571, "grad_norm": 2.00880765914917, "learning_rate": 1.6310897582751334e-05, "loss": 0.0775, "step": 90350 }, { "epoch": 1.3309818706646441, "grad_norm": 1.344334602355957, "learning_rate": 1.6309261210784345e-05, "loss": 0.0749, "step": 90375 }, { "epoch": 1.3313500537547311, "grad_norm": 1.0912142992019653, "learning_rate": 1.6307624838817363e-05, "loss": 0.0706, "step": 90400 }, { "epoch": 1.3317182368448182, "grad_norm": 1.29023277759552, "learning_rate": 1.6305988466850378e-05, "loss": 0.0729, "step": 90425 }, { "epoch": 1.3320864199349052, "grad_norm": 2.004547595977783, "learning_rate": 1.6304352094883392e-05, "loss": 0.0809, "step": 90450 }, { "epoch": 1.3324546030249922, "grad_norm": 1.2141296863555908, "learning_rate": 1.630271572291641e-05, "loss": 0.0754, "step": 90475 }, { "epoch": 1.3328227861150794, "grad_norm": 1.1595202684402466, "learning_rate": 1.6301079350949424e-05, "loss": 0.0762, "step": 90500 }, { "epoch": 1.3331909692051664, "grad_norm": 1.522676944732666, "learning_rate": 1.629944297898244e-05, "loss": 0.0819, "step": 90525 }, { "epoch": 1.3335591522952535, "grad_norm": 1.645128607749939, "learning_rate": 1.6297806607015457e-05, "loss": 0.0789, "step": 90550 }, { "epoch": 1.3339273353853405, "grad_norm": 1.6275830268859863, "learning_rate": 1.629617023504847e-05, "loss": 0.0777, "step": 90575 }, { "epoch": 1.3342955184754275, "grad_norm": 1.7780475616455078, "learning_rate": 1.6294533863081486e-05, "loss": 0.0751, "step": 90600 }, { "epoch": 1.3346637015655145, "grad_norm": 0.8597611784934998, "learning_rate": 1.62928974911145e-05, "loss": 0.0712, "step": 90625 }, { "epoch": 1.3350318846556015, "grad_norm": 1.6526169776916504, "learning_rate": 1.6291261119147518e-05, "loss": 0.0727, "step": 90650 }, { "epoch": 1.3354000677456885, "grad_norm": 1.4190505743026733, "learning_rate": 1.6289624747180532e-05, "loss": 0.0819, "step": 90675 }, { "epoch": 1.3357682508357756, "grad_norm": 1.3069881200790405, "learning_rate": 1.6287988375213547e-05, "loss": 0.0752, "step": 90700 }, { "epoch": 1.3361364339258626, "grad_norm": 1.3888797760009766, "learning_rate": 1.6286352003246565e-05, "loss": 0.0782, "step": 90725 }, { "epoch": 1.3365046170159496, "grad_norm": 1.2229448556900024, "learning_rate": 1.628471563127958e-05, "loss": 0.0749, "step": 90750 }, { "epoch": 1.3368728001060366, "grad_norm": 1.2446955442428589, "learning_rate": 1.6283079259312597e-05, "loss": 0.072, "step": 90775 }, { "epoch": 1.3372409831961238, "grad_norm": 1.2332305908203125, "learning_rate": 1.6281442887345608e-05, "loss": 0.069, "step": 90800 }, { "epoch": 1.3376091662862108, "grad_norm": 1.3013314008712769, "learning_rate": 1.6279806515378626e-05, "loss": 0.0788, "step": 90825 }, { "epoch": 1.3379773493762979, "grad_norm": 1.5836565494537354, "learning_rate": 1.627817014341164e-05, "loss": 0.0801, "step": 90850 }, { "epoch": 1.3383455324663849, "grad_norm": 1.5280481576919556, "learning_rate": 1.6276533771444655e-05, "loss": 0.0801, "step": 90875 }, { "epoch": 1.338713715556472, "grad_norm": 1.3406683206558228, "learning_rate": 1.6274897399477673e-05, "loss": 0.0767, "step": 90900 }, { "epoch": 1.339081898646559, "grad_norm": 0.9091733694076538, "learning_rate": 1.6273261027510687e-05, "loss": 0.0775, "step": 90925 }, { "epoch": 1.339450081736646, "grad_norm": 1.2953158617019653, "learning_rate": 1.62716246555437e-05, "loss": 0.0733, "step": 90950 }, { "epoch": 1.3398182648267332, "grad_norm": 1.3101142644882202, "learning_rate": 1.626998828357672e-05, "loss": 0.0724, "step": 90975 }, { "epoch": 1.3401864479168202, "grad_norm": 1.5352545976638794, "learning_rate": 1.626841736648841e-05, "loss": 0.0706, "step": 91000 }, { "epoch": 1.3405546310069072, "grad_norm": 1.4464690685272217, "learning_rate": 1.6266780994521428e-05, "loss": 0.0747, "step": 91025 }, { "epoch": 1.3409228140969942, "grad_norm": 1.049544334411621, "learning_rate": 1.6265144622554442e-05, "loss": 0.0731, "step": 91050 }, { "epoch": 1.3412909971870812, "grad_norm": 1.006972312927246, "learning_rate": 1.626350825058746e-05, "loss": 0.0717, "step": 91075 }, { "epoch": 1.3416591802771682, "grad_norm": 1.0202698707580566, "learning_rate": 1.6261871878620474e-05, "loss": 0.0759, "step": 91100 }, { "epoch": 1.3420273633672553, "grad_norm": 2.213733434677124, "learning_rate": 1.626023550665349e-05, "loss": 0.0839, "step": 91125 }, { "epoch": 1.3423955464573423, "grad_norm": 1.620505690574646, "learning_rate": 1.6258599134686507e-05, "loss": 0.0717, "step": 91150 }, { "epoch": 1.3427637295474293, "grad_norm": 1.2301806211471558, "learning_rate": 1.625696276271952e-05, "loss": 0.0726, "step": 91175 }, { "epoch": 1.3431319126375163, "grad_norm": 1.664406657218933, "learning_rate": 1.6255326390752536e-05, "loss": 0.076, "step": 91200 }, { "epoch": 1.3435000957276033, "grad_norm": 1.6831588745117188, "learning_rate": 1.625369001878555e-05, "loss": 0.0744, "step": 91225 }, { "epoch": 1.3438682788176903, "grad_norm": 1.3554997444152832, "learning_rate": 1.6252053646818568e-05, "loss": 0.0735, "step": 91250 }, { "epoch": 1.3442364619077776, "grad_norm": 1.3595879077911377, "learning_rate": 1.6250417274851582e-05, "loss": 0.0796, "step": 91275 }, { "epoch": 1.3446046449978646, "grad_norm": 2.183323383331299, "learning_rate": 1.6248780902884597e-05, "loss": 0.0683, "step": 91300 }, { "epoch": 1.3449728280879516, "grad_norm": 1.9501228332519531, "learning_rate": 1.6247144530917615e-05, "loss": 0.0856, "step": 91325 }, { "epoch": 1.3453410111780386, "grad_norm": 1.6886019706726074, "learning_rate": 1.624550815895063e-05, "loss": 0.0756, "step": 91350 }, { "epoch": 1.3457091942681256, "grad_norm": 1.3946233987808228, "learning_rate": 1.6243871786983644e-05, "loss": 0.0789, "step": 91375 }, { "epoch": 1.3460773773582126, "grad_norm": 1.2149133682250977, "learning_rate": 1.624223541501666e-05, "loss": 0.075, "step": 91400 }, { "epoch": 1.3464455604482997, "grad_norm": 1.3756582736968994, "learning_rate": 1.6240599043049673e-05, "loss": 0.0718, "step": 91425 }, { "epoch": 1.346813743538387, "grad_norm": 1.1662389039993286, "learning_rate": 1.623896267108269e-05, "loss": 0.0749, "step": 91450 }, { "epoch": 1.347181926628474, "grad_norm": 1.3211029767990112, "learning_rate": 1.6237326299115705e-05, "loss": 0.0788, "step": 91475 }, { "epoch": 1.347550109718561, "grad_norm": 1.4627496004104614, "learning_rate": 1.6235689927148723e-05, "loss": 0.0796, "step": 91500 }, { "epoch": 1.347918292808648, "grad_norm": 1.4493077993392944, "learning_rate": 1.6234053555181737e-05, "loss": 0.0742, "step": 91525 }, { "epoch": 1.348286475898735, "grad_norm": 1.291533350944519, "learning_rate": 1.623241718321475e-05, "loss": 0.0705, "step": 91550 }, { "epoch": 1.348654658988822, "grad_norm": 1.622793197631836, "learning_rate": 1.623078081124777e-05, "loss": 0.0708, "step": 91575 }, { "epoch": 1.349022842078909, "grad_norm": 1.5103604793548584, "learning_rate": 1.6229144439280784e-05, "loss": 0.0731, "step": 91600 }, { "epoch": 1.349391025168996, "grad_norm": 1.8477883338928223, "learning_rate": 1.62275080673138e-05, "loss": 0.0825, "step": 91625 }, { "epoch": 1.349759208259083, "grad_norm": 1.430349588394165, "learning_rate": 1.6225871695346813e-05, "loss": 0.0652, "step": 91650 }, { "epoch": 1.35012739134917, "grad_norm": 0.872810423374176, "learning_rate": 1.6224235323379827e-05, "loss": 0.0708, "step": 91675 }, { "epoch": 1.350495574439257, "grad_norm": 1.4663913249969482, "learning_rate": 1.6222598951412845e-05, "loss": 0.0798, "step": 91700 }, { "epoch": 1.350863757529344, "grad_norm": 1.6600351333618164, "learning_rate": 1.622096257944586e-05, "loss": 0.0739, "step": 91725 }, { "epoch": 1.3512319406194313, "grad_norm": 1.317004680633545, "learning_rate": 1.6219326207478877e-05, "loss": 0.0754, "step": 91750 }, { "epoch": 1.3516001237095183, "grad_norm": 1.6911221742630005, "learning_rate": 1.6217689835511892e-05, "loss": 0.0784, "step": 91775 }, { "epoch": 1.3519683067996053, "grad_norm": 1.226335048675537, "learning_rate": 1.6216053463544906e-05, "loss": 0.0776, "step": 91800 }, { "epoch": 1.3523364898896924, "grad_norm": 1.7652842998504639, "learning_rate": 1.6214417091577924e-05, "loss": 0.0767, "step": 91825 }, { "epoch": 1.3527046729797794, "grad_norm": 1.5158592462539673, "learning_rate": 1.6212780719610935e-05, "loss": 0.0793, "step": 91850 }, { "epoch": 1.3530728560698664, "grad_norm": 1.1897499561309814, "learning_rate": 1.6211144347643953e-05, "loss": 0.0741, "step": 91875 }, { "epoch": 1.3534410391599534, "grad_norm": 1.5610355138778687, "learning_rate": 1.6209507975676968e-05, "loss": 0.079, "step": 91900 }, { "epoch": 1.3538092222500406, "grad_norm": 1.5645991563796997, "learning_rate": 1.6207871603709982e-05, "loss": 0.0773, "step": 91925 }, { "epoch": 1.3541774053401276, "grad_norm": 1.8131879568099976, "learning_rate": 1.6206235231743e-05, "loss": 0.0769, "step": 91950 }, { "epoch": 1.3545455884302147, "grad_norm": 1.5232982635498047, "learning_rate": 1.6204598859776014e-05, "loss": 0.0716, "step": 91975 }, { "epoch": 1.3549137715203017, "grad_norm": 1.3999429941177368, "learning_rate": 1.6202962487809032e-05, "loss": 0.0752, "step": 92000 }, { "epoch": 1.3552819546103887, "grad_norm": 1.7154897451400757, "learning_rate": 1.6201326115842047e-05, "loss": 0.0711, "step": 92025 }, { "epoch": 1.3556501377004757, "grad_norm": 1.5103822946548462, "learning_rate": 1.619968974387506e-05, "loss": 0.0788, "step": 92050 }, { "epoch": 1.3560183207905627, "grad_norm": 1.2476588487625122, "learning_rate": 1.6198053371908076e-05, "loss": 0.0808, "step": 92075 }, { "epoch": 1.3563865038806497, "grad_norm": 1.5079420804977417, "learning_rate": 1.619641699994109e-05, "loss": 0.0815, "step": 92100 }, { "epoch": 1.3567546869707368, "grad_norm": 1.627220630645752, "learning_rate": 1.6194780627974108e-05, "loss": 0.0782, "step": 92125 }, { "epoch": 1.3571228700608238, "grad_norm": 1.4262968301773071, "learning_rate": 1.6193144256007122e-05, "loss": 0.0717, "step": 92150 }, { "epoch": 1.3574910531509108, "grad_norm": 1.3170403242111206, "learning_rate": 1.619150788404014e-05, "loss": 0.0777, "step": 92175 }, { "epoch": 1.3578592362409978, "grad_norm": 1.3268344402313232, "learning_rate": 1.6189871512073155e-05, "loss": 0.0709, "step": 92200 }, { "epoch": 1.358227419331085, "grad_norm": 1.7530207633972168, "learning_rate": 1.618823514010617e-05, "loss": 0.0733, "step": 92225 }, { "epoch": 1.358595602421172, "grad_norm": 1.6545002460479736, "learning_rate": 1.6186598768139187e-05, "loss": 0.0778, "step": 92250 }, { "epoch": 1.358963785511259, "grad_norm": 1.4598360061645508, "learning_rate": 1.6184962396172198e-05, "loss": 0.0754, "step": 92275 }, { "epoch": 1.359331968601346, "grad_norm": 1.8592778444290161, "learning_rate": 1.6183326024205216e-05, "loss": 0.0745, "step": 92300 }, { "epoch": 1.359700151691433, "grad_norm": 1.261029839515686, "learning_rate": 1.618168965223823e-05, "loss": 0.0841, "step": 92325 }, { "epoch": 1.3600683347815201, "grad_norm": 1.7479112148284912, "learning_rate": 1.6180053280271245e-05, "loss": 0.0735, "step": 92350 }, { "epoch": 1.3604365178716071, "grad_norm": 1.0892970561981201, "learning_rate": 1.6178416908304263e-05, "loss": 0.0666, "step": 92375 }, { "epoch": 1.3608047009616944, "grad_norm": 1.9156078100204468, "learning_rate": 1.6176780536337277e-05, "loss": 0.0744, "step": 92400 }, { "epoch": 1.3611728840517814, "grad_norm": 1.125425100326538, "learning_rate": 1.6175144164370295e-05, "loss": 0.0782, "step": 92425 }, { "epoch": 1.3615410671418684, "grad_norm": 1.3087259531021118, "learning_rate": 1.617350779240331e-05, "loss": 0.0716, "step": 92450 }, { "epoch": 1.3619092502319554, "grad_norm": 1.3223575353622437, "learning_rate": 1.6171871420436324e-05, "loss": 0.0773, "step": 92475 }, { "epoch": 1.3622774333220424, "grad_norm": 1.6559269428253174, "learning_rate": 1.6170235048469338e-05, "loss": 0.0713, "step": 92500 }, { "epoch": 1.3626456164121294, "grad_norm": 1.1485322713851929, "learning_rate": 1.6168598676502353e-05, "loss": 0.0742, "step": 92525 }, { "epoch": 1.3630137995022165, "grad_norm": 1.653432846069336, "learning_rate": 1.616696230453537e-05, "loss": 0.084, "step": 92550 }, { "epoch": 1.3633819825923035, "grad_norm": 1.4452425241470337, "learning_rate": 1.6165325932568385e-05, "loss": 0.0756, "step": 92575 }, { "epoch": 1.3637501656823905, "grad_norm": 1.4553121328353882, "learning_rate": 1.61636895606014e-05, "loss": 0.0805, "step": 92600 }, { "epoch": 1.3641183487724775, "grad_norm": 1.4380722045898438, "learning_rate": 1.6162053188634417e-05, "loss": 0.0734, "step": 92625 }, { "epoch": 1.3644865318625645, "grad_norm": 1.274214506149292, "learning_rate": 1.6160416816667432e-05, "loss": 0.0701, "step": 92650 }, { "epoch": 1.3648547149526515, "grad_norm": 1.813348412513733, "learning_rate": 1.615878044470045e-05, "loss": 0.0768, "step": 92675 }, { "epoch": 1.3652228980427388, "grad_norm": 1.2675639390945435, "learning_rate": 1.615714407273346e-05, "loss": 0.0788, "step": 92700 }, { "epoch": 1.3655910811328258, "grad_norm": 1.216004490852356, "learning_rate": 1.615550770076648e-05, "loss": 0.0688, "step": 92725 }, { "epoch": 1.3659592642229128, "grad_norm": 1.5318211317062378, "learning_rate": 1.6153871328799493e-05, "loss": 0.0754, "step": 92750 }, { "epoch": 1.3663274473129998, "grad_norm": 1.3124147653579712, "learning_rate": 1.6152234956832507e-05, "loss": 0.0728, "step": 92775 }, { "epoch": 1.3666956304030868, "grad_norm": 1.4888821840286255, "learning_rate": 1.6150598584865525e-05, "loss": 0.0708, "step": 92800 }, { "epoch": 1.3670638134931739, "grad_norm": 1.4113868474960327, "learning_rate": 1.614896221289854e-05, "loss": 0.0704, "step": 92825 }, { "epoch": 1.3674319965832609, "grad_norm": 1.3530505895614624, "learning_rate": 1.6147325840931554e-05, "loss": 0.0741, "step": 92850 }, { "epoch": 1.3678001796733479, "grad_norm": 1.3829823732376099, "learning_rate": 1.6145689468964572e-05, "loss": 0.0709, "step": 92875 }, { "epoch": 1.3681683627634351, "grad_norm": 1.209885835647583, "learning_rate": 1.6144053096997587e-05, "loss": 0.0686, "step": 92900 }, { "epoch": 1.3685365458535221, "grad_norm": 1.284749150276184, "learning_rate": 1.61424167250306e-05, "loss": 0.0749, "step": 92925 }, { "epoch": 1.3689047289436092, "grad_norm": 1.4421370029449463, "learning_rate": 1.6140780353063615e-05, "loss": 0.0768, "step": 92950 }, { "epoch": 1.3692729120336962, "grad_norm": 1.3181594610214233, "learning_rate": 1.6139143981096633e-05, "loss": 0.0777, "step": 92975 }, { "epoch": 1.3696410951237832, "grad_norm": 1.1795927286148071, "learning_rate": 1.6137507609129648e-05, "loss": 0.0684, "step": 93000 }, { "epoch": 1.3700092782138702, "grad_norm": 1.5650521516799927, "learning_rate": 1.6135871237162662e-05, "loss": 0.0792, "step": 93025 }, { "epoch": 1.3703774613039572, "grad_norm": 1.4631000757217407, "learning_rate": 1.613423486519568e-05, "loss": 0.0736, "step": 93050 }, { "epoch": 1.3707456443940442, "grad_norm": 1.321958303451538, "learning_rate": 1.6132598493228695e-05, "loss": 0.0761, "step": 93075 }, { "epoch": 1.3711138274841312, "grad_norm": 1.2320996522903442, "learning_rate": 1.613096212126171e-05, "loss": 0.0715, "step": 93100 }, { "epoch": 1.3714820105742183, "grad_norm": 1.4432626962661743, "learning_rate": 1.6129325749294723e-05, "loss": 0.0749, "step": 93125 }, { "epoch": 1.3718501936643053, "grad_norm": 1.2753055095672607, "learning_rate": 1.612768937732774e-05, "loss": 0.0685, "step": 93150 }, { "epoch": 1.3722183767543923, "grad_norm": 1.0910691022872925, "learning_rate": 1.6126053005360756e-05, "loss": 0.0725, "step": 93175 }, { "epoch": 1.3725865598444795, "grad_norm": 1.1770347356796265, "learning_rate": 1.612441663339377e-05, "loss": 0.0727, "step": 93200 }, { "epoch": 1.3729547429345665, "grad_norm": 1.4494625329971313, "learning_rate": 1.6122780261426788e-05, "loss": 0.0761, "step": 93225 }, { "epoch": 1.3733229260246536, "grad_norm": 1.445708990097046, "learning_rate": 1.6121143889459802e-05, "loss": 0.069, "step": 93250 }, { "epoch": 1.3736911091147406, "grad_norm": 1.6922872066497803, "learning_rate": 1.6119507517492817e-05, "loss": 0.0747, "step": 93275 }, { "epoch": 1.3740592922048276, "grad_norm": 1.3769599199295044, "learning_rate": 1.6117871145525835e-05, "loss": 0.075, "step": 93300 }, { "epoch": 1.3744274752949146, "grad_norm": 1.4104198217391968, "learning_rate": 1.611623477355885e-05, "loss": 0.073, "step": 93325 }, { "epoch": 1.3747956583850016, "grad_norm": 1.243233323097229, "learning_rate": 1.6114598401591864e-05, "loss": 0.0773, "step": 93350 }, { "epoch": 1.3751638414750889, "grad_norm": 1.573714017868042, "learning_rate": 1.6112962029624878e-05, "loss": 0.0723, "step": 93375 }, { "epoch": 1.3755320245651759, "grad_norm": 1.5424933433532715, "learning_rate": 1.6111325657657896e-05, "loss": 0.0785, "step": 93400 }, { "epoch": 1.3759002076552629, "grad_norm": 1.051605463027954, "learning_rate": 1.610968928569091e-05, "loss": 0.0681, "step": 93425 }, { "epoch": 1.37626839074535, "grad_norm": 1.186645746231079, "learning_rate": 1.6108052913723925e-05, "loss": 0.0715, "step": 93450 }, { "epoch": 1.376636573835437, "grad_norm": 1.5428658723831177, "learning_rate": 1.6106416541756943e-05, "loss": 0.0714, "step": 93475 }, { "epoch": 1.377004756925524, "grad_norm": 1.181187391281128, "learning_rate": 1.6104780169789957e-05, "loss": 0.0792, "step": 93500 }, { "epoch": 1.377372940015611, "grad_norm": 1.5966691970825195, "learning_rate": 1.610314379782297e-05, "loss": 0.0817, "step": 93525 }, { "epoch": 1.377741123105698, "grad_norm": 1.1199043989181519, "learning_rate": 1.6101507425855986e-05, "loss": 0.0747, "step": 93550 }, { "epoch": 1.378109306195785, "grad_norm": 1.2347215414047241, "learning_rate": 1.6099871053889004e-05, "loss": 0.0691, "step": 93575 }, { "epoch": 1.378477489285872, "grad_norm": 1.4695402383804321, "learning_rate": 1.609823468192202e-05, "loss": 0.0861, "step": 93600 }, { "epoch": 1.378845672375959, "grad_norm": 1.1878154277801514, "learning_rate": 1.6096598309955033e-05, "loss": 0.0681, "step": 93625 }, { "epoch": 1.379213855466046, "grad_norm": 1.849567174911499, "learning_rate": 1.609502739286673e-05, "loss": 0.0814, "step": 93650 }, { "epoch": 1.3795820385561333, "grad_norm": 1.2896862030029297, "learning_rate": 1.6093391020899745e-05, "loss": 0.0805, "step": 93675 }, { "epoch": 1.3799502216462203, "grad_norm": 1.5275253057479858, "learning_rate": 1.609175464893276e-05, "loss": 0.0767, "step": 93700 }, { "epoch": 1.3803184047363073, "grad_norm": 1.5272122621536255, "learning_rate": 1.6090118276965777e-05, "loss": 0.0747, "step": 93725 }, { "epoch": 1.3806865878263943, "grad_norm": 1.2588330507278442, "learning_rate": 1.6088481904998788e-05, "loss": 0.0708, "step": 93750 }, { "epoch": 1.3810547709164813, "grad_norm": 1.479750156402588, "learning_rate": 1.6086845533031806e-05, "loss": 0.0672, "step": 93775 }, { "epoch": 1.3814229540065683, "grad_norm": 1.2768731117248535, "learning_rate": 1.608520916106482e-05, "loss": 0.0725, "step": 93800 }, { "epoch": 1.3817911370966554, "grad_norm": 1.2872675657272339, "learning_rate": 1.6083572789097838e-05, "loss": 0.0685, "step": 93825 }, { "epoch": 1.3821593201867426, "grad_norm": 1.5474956035614014, "learning_rate": 1.6081936417130853e-05, "loss": 0.0742, "step": 93850 }, { "epoch": 1.3825275032768296, "grad_norm": 1.1625415086746216, "learning_rate": 1.6080300045163867e-05, "loss": 0.0769, "step": 93875 }, { "epoch": 1.3828956863669166, "grad_norm": 1.2839393615722656, "learning_rate": 1.6078663673196885e-05, "loss": 0.0778, "step": 93900 }, { "epoch": 1.3832638694570036, "grad_norm": 1.100573182106018, "learning_rate": 1.60770273012299e-05, "loss": 0.0758, "step": 93925 }, { "epoch": 1.3836320525470907, "grad_norm": 1.701767921447754, "learning_rate": 1.6075390929262914e-05, "loss": 0.0822, "step": 93950 }, { "epoch": 1.3840002356371777, "grad_norm": 1.4825780391693115, "learning_rate": 1.6073754557295928e-05, "loss": 0.0727, "step": 93975 }, { "epoch": 1.3843684187272647, "grad_norm": 1.6215708255767822, "learning_rate": 1.6072118185328943e-05, "loss": 0.0696, "step": 94000 }, { "epoch": 1.3847366018173517, "grad_norm": 1.6350444555282593, "learning_rate": 1.607048181336196e-05, "loss": 0.0714, "step": 94025 }, { "epoch": 1.3851047849074387, "grad_norm": 1.4409617185592651, "learning_rate": 1.6068845441394975e-05, "loss": 0.0735, "step": 94050 }, { "epoch": 1.3854729679975257, "grad_norm": 1.2251015901565552, "learning_rate": 1.6067209069427993e-05, "loss": 0.0704, "step": 94075 }, { "epoch": 1.3858411510876127, "grad_norm": 1.1130964756011963, "learning_rate": 1.6065572697461007e-05, "loss": 0.0837, "step": 94100 }, { "epoch": 1.3862093341776998, "grad_norm": 1.407037615776062, "learning_rate": 1.6063936325494022e-05, "loss": 0.0721, "step": 94125 }, { "epoch": 1.386577517267787, "grad_norm": 1.4179117679595947, "learning_rate": 1.606229995352704e-05, "loss": 0.0737, "step": 94150 }, { "epoch": 1.386945700357874, "grad_norm": 1.8100414276123047, "learning_rate": 1.606066358156005e-05, "loss": 0.0806, "step": 94175 }, { "epoch": 1.387313883447961, "grad_norm": 1.518386721611023, "learning_rate": 1.605902720959307e-05, "loss": 0.068, "step": 94200 }, { "epoch": 1.387682066538048, "grad_norm": 1.516484022140503, "learning_rate": 1.6057390837626083e-05, "loss": 0.07, "step": 94225 }, { "epoch": 1.388050249628135, "grad_norm": 1.5263150930404663, "learning_rate": 1.6055754465659097e-05, "loss": 0.0715, "step": 94250 }, { "epoch": 1.388418432718222, "grad_norm": 1.3971295356750488, "learning_rate": 1.6054118093692115e-05, "loss": 0.0763, "step": 94275 }, { "epoch": 1.388786615808309, "grad_norm": 1.7819820642471313, "learning_rate": 1.605248172172513e-05, "loss": 0.0817, "step": 94300 }, { "epoch": 1.3891547988983963, "grad_norm": 1.4265540838241577, "learning_rate": 1.6050845349758148e-05, "loss": 0.0728, "step": 94325 }, { "epoch": 1.3895229819884833, "grad_norm": 1.0876445770263672, "learning_rate": 1.6049208977791162e-05, "loss": 0.0773, "step": 94350 }, { "epoch": 1.3898911650785704, "grad_norm": 1.161629557609558, "learning_rate": 1.6047572605824177e-05, "loss": 0.0757, "step": 94375 }, { "epoch": 1.3902593481686574, "grad_norm": 1.584875226020813, "learning_rate": 1.604593623385719e-05, "loss": 0.0783, "step": 94400 }, { "epoch": 1.3906275312587444, "grad_norm": 0.9503710269927979, "learning_rate": 1.6044299861890205e-05, "loss": 0.073, "step": 94425 }, { "epoch": 1.3909957143488314, "grad_norm": 1.6783994436264038, "learning_rate": 1.6042663489923223e-05, "loss": 0.0716, "step": 94450 }, { "epoch": 1.3913638974389184, "grad_norm": 1.7241624593734741, "learning_rate": 1.6041027117956238e-05, "loss": 0.0767, "step": 94475 }, { "epoch": 1.3917320805290054, "grad_norm": 1.6753015518188477, "learning_rate": 1.6039390745989256e-05, "loss": 0.0767, "step": 94500 }, { "epoch": 1.3921002636190924, "grad_norm": 1.2731801271438599, "learning_rate": 1.603775437402227e-05, "loss": 0.0759, "step": 94525 }, { "epoch": 1.3924684467091795, "grad_norm": 1.4037301540374756, "learning_rate": 1.6036118002055285e-05, "loss": 0.0731, "step": 94550 }, { "epoch": 1.3928366297992665, "grad_norm": 1.5242455005645752, "learning_rate": 1.60344816300883e-05, "loss": 0.075, "step": 94575 }, { "epoch": 1.3932048128893535, "grad_norm": 1.401528000831604, "learning_rate": 1.6032845258121313e-05, "loss": 0.0748, "step": 94600 }, { "epoch": 1.3935729959794407, "grad_norm": 1.6088056564331055, "learning_rate": 1.603120888615433e-05, "loss": 0.0765, "step": 94625 }, { "epoch": 1.3939411790695277, "grad_norm": 1.5187320709228516, "learning_rate": 1.6029572514187346e-05, "loss": 0.0708, "step": 94650 }, { "epoch": 1.3943093621596148, "grad_norm": 1.079283356666565, "learning_rate": 1.602793614222036e-05, "loss": 0.0707, "step": 94675 }, { "epoch": 1.3946775452497018, "grad_norm": 1.1687904596328735, "learning_rate": 1.6026299770253378e-05, "loss": 0.0775, "step": 94700 }, { "epoch": 1.3950457283397888, "grad_norm": 1.5087347030639648, "learning_rate": 1.6024663398286392e-05, "loss": 0.0727, "step": 94725 }, { "epoch": 1.3954139114298758, "grad_norm": 1.6743303537368774, "learning_rate": 1.602302702631941e-05, "loss": 0.0813, "step": 94750 }, { "epoch": 1.3957820945199628, "grad_norm": 1.2112457752227783, "learning_rate": 1.6021390654352425e-05, "loss": 0.0747, "step": 94775 }, { "epoch": 1.39615027761005, "grad_norm": 1.454703450202942, "learning_rate": 1.601975428238544e-05, "loss": 0.0736, "step": 94800 }, { "epoch": 1.396518460700137, "grad_norm": 0.8092736005783081, "learning_rate": 1.6018117910418454e-05, "loss": 0.067, "step": 94825 }, { "epoch": 1.396886643790224, "grad_norm": 1.178327202796936, "learning_rate": 1.6016481538451468e-05, "loss": 0.0704, "step": 94850 }, { "epoch": 1.397254826880311, "grad_norm": 1.6238834857940674, "learning_rate": 1.6014845166484486e-05, "loss": 0.0777, "step": 94875 }, { "epoch": 1.3976230099703981, "grad_norm": 1.8076914548873901, "learning_rate": 1.60132087945175e-05, "loss": 0.0726, "step": 94900 }, { "epoch": 1.3979911930604851, "grad_norm": 1.448445200920105, "learning_rate": 1.6011572422550515e-05, "loss": 0.0774, "step": 94925 }, { "epoch": 1.3983593761505722, "grad_norm": 1.3596991300582886, "learning_rate": 1.6009936050583533e-05, "loss": 0.0755, "step": 94950 }, { "epoch": 1.3987275592406592, "grad_norm": 1.2962175607681274, "learning_rate": 1.6008299678616547e-05, "loss": 0.0713, "step": 94975 }, { "epoch": 1.3990957423307462, "grad_norm": 1.2302311658859253, "learning_rate": 1.6006663306649562e-05, "loss": 0.0713, "step": 95000 }, { "epoch": 1.3994639254208332, "grad_norm": 1.334878921508789, "learning_rate": 1.6005026934682576e-05, "loss": 0.076, "step": 95025 }, { "epoch": 1.3998321085109202, "grad_norm": 1.5879437923431396, "learning_rate": 1.6003390562715594e-05, "loss": 0.0703, "step": 95050 }, { "epoch": 1.4002002916010072, "grad_norm": 1.1994304656982422, "learning_rate": 1.600175419074861e-05, "loss": 0.0802, "step": 95075 }, { "epoch": 1.4005684746910945, "grad_norm": 1.548785924911499, "learning_rate": 1.6000117818781623e-05, "loss": 0.0779, "step": 95100 }, { "epoch": 1.4009366577811815, "grad_norm": 1.2609056234359741, "learning_rate": 1.599848144681464e-05, "loss": 0.0786, "step": 95125 }, { "epoch": 1.4013048408712685, "grad_norm": 1.119028925895691, "learning_rate": 1.5996845074847655e-05, "loss": 0.0681, "step": 95150 }, { "epoch": 1.4016730239613555, "grad_norm": 1.7501965761184692, "learning_rate": 1.599520870288067e-05, "loss": 0.0805, "step": 95175 }, { "epoch": 1.4020412070514425, "grad_norm": 1.458664894104004, "learning_rate": 1.5993572330913688e-05, "loss": 0.073, "step": 95200 }, { "epoch": 1.4024093901415295, "grad_norm": 1.7162827253341675, "learning_rate": 1.5991935958946702e-05, "loss": 0.0723, "step": 95225 }, { "epoch": 1.4027775732316166, "grad_norm": 1.8378931283950806, "learning_rate": 1.5990299586979716e-05, "loss": 0.0756, "step": 95250 }, { "epoch": 1.4031457563217036, "grad_norm": 1.6052461862564087, "learning_rate": 1.598866321501273e-05, "loss": 0.0681, "step": 95275 }, { "epoch": 1.4035139394117908, "grad_norm": 1.5615938901901245, "learning_rate": 1.598702684304575e-05, "loss": 0.0798, "step": 95300 }, { "epoch": 1.4038821225018778, "grad_norm": 1.5934979915618896, "learning_rate": 1.5985390471078763e-05, "loss": 0.0768, "step": 95325 }, { "epoch": 1.4042503055919648, "grad_norm": 1.564051628112793, "learning_rate": 1.5983754099111778e-05, "loss": 0.0668, "step": 95350 }, { "epoch": 1.4046184886820519, "grad_norm": 1.3912962675094604, "learning_rate": 1.5982117727144795e-05, "loss": 0.0692, "step": 95375 }, { "epoch": 1.4049866717721389, "grad_norm": 1.6267430782318115, "learning_rate": 1.598048135517781e-05, "loss": 0.0828, "step": 95400 }, { "epoch": 1.4053548548622259, "grad_norm": 1.5203158855438232, "learning_rate": 1.5978844983210824e-05, "loss": 0.0864, "step": 95425 }, { "epoch": 1.405723037952313, "grad_norm": 1.7149254083633423, "learning_rate": 1.597720861124384e-05, "loss": 0.0742, "step": 95450 }, { "epoch": 1.4060912210424, "grad_norm": 1.3976386785507202, "learning_rate": 1.5975572239276857e-05, "loss": 0.0671, "step": 95475 }, { "epoch": 1.406459404132487, "grad_norm": 1.5091254711151123, "learning_rate": 1.597393586730987e-05, "loss": 0.0752, "step": 95500 }, { "epoch": 1.406827587222574, "grad_norm": 1.513067364692688, "learning_rate": 1.5972299495342886e-05, "loss": 0.0703, "step": 95525 }, { "epoch": 1.407195770312661, "grad_norm": 1.1618019342422485, "learning_rate": 1.5970663123375903e-05, "loss": 0.0776, "step": 95550 }, { "epoch": 1.407563953402748, "grad_norm": 1.1448416709899902, "learning_rate": 1.5969026751408918e-05, "loss": 0.0786, "step": 95575 }, { "epoch": 1.4079321364928352, "grad_norm": 1.1268770694732666, "learning_rate": 1.5967390379441932e-05, "loss": 0.0666, "step": 95600 }, { "epoch": 1.4083003195829222, "grad_norm": 1.377536654472351, "learning_rate": 1.596575400747495e-05, "loss": 0.078, "step": 95625 }, { "epoch": 1.4086685026730092, "grad_norm": 1.4300528764724731, "learning_rate": 1.5964117635507965e-05, "loss": 0.0754, "step": 95650 }, { "epoch": 1.4090366857630963, "grad_norm": 1.8363966941833496, "learning_rate": 1.596248126354098e-05, "loss": 0.0715, "step": 95675 }, { "epoch": 1.4094048688531833, "grad_norm": 1.4692856073379517, "learning_rate": 1.5960844891573994e-05, "loss": 0.075, "step": 95700 }, { "epoch": 1.4097730519432703, "grad_norm": 1.1752272844314575, "learning_rate": 1.595927397448569e-05, "loss": 0.0714, "step": 95725 }, { "epoch": 1.4101412350333573, "grad_norm": 1.2121433019638062, "learning_rate": 1.5957637602518705e-05, "loss": 0.0797, "step": 95750 }, { "epoch": 1.4105094181234445, "grad_norm": 1.3295732736587524, "learning_rate": 1.595600123055172e-05, "loss": 0.0771, "step": 95775 }, { "epoch": 1.4108776012135316, "grad_norm": 1.4161239862442017, "learning_rate": 1.5954364858584738e-05, "loss": 0.0743, "step": 95800 }, { "epoch": 1.4112457843036186, "grad_norm": 1.4361987113952637, "learning_rate": 1.5952728486617752e-05, "loss": 0.0703, "step": 95825 }, { "epoch": 1.4116139673937056, "grad_norm": 1.1932249069213867, "learning_rate": 1.5951092114650767e-05, "loss": 0.0751, "step": 95850 }, { "epoch": 1.4119821504837926, "grad_norm": 1.57851243019104, "learning_rate": 1.594945574268378e-05, "loss": 0.0781, "step": 95875 }, { "epoch": 1.4123503335738796, "grad_norm": 1.181174635887146, "learning_rate": 1.59478193707168e-05, "loss": 0.0754, "step": 95900 }, { "epoch": 1.4127185166639666, "grad_norm": 1.4669899940490723, "learning_rate": 1.5946182998749813e-05, "loss": 0.0714, "step": 95925 }, { "epoch": 1.4130866997540537, "grad_norm": 1.3707056045532227, "learning_rate": 1.5944546626782828e-05, "loss": 0.0736, "step": 95950 }, { "epoch": 1.4134548828441407, "grad_norm": 1.4716074466705322, "learning_rate": 1.5942910254815846e-05, "loss": 0.076, "step": 95975 }, { "epoch": 1.4138230659342277, "grad_norm": 1.4471065998077393, "learning_rate": 1.594127388284886e-05, "loss": 0.0815, "step": 96000 }, { "epoch": 1.4141912490243147, "grad_norm": 1.6333789825439453, "learning_rate": 1.5939637510881875e-05, "loss": 0.0802, "step": 96025 }, { "epoch": 1.4145594321144017, "grad_norm": 1.924338459968567, "learning_rate": 1.5938001138914892e-05, "loss": 0.0705, "step": 96050 }, { "epoch": 1.414927615204489, "grad_norm": 1.0518661737442017, "learning_rate": 1.5936364766947903e-05, "loss": 0.0748, "step": 96075 }, { "epoch": 1.415295798294576, "grad_norm": 1.7915148735046387, "learning_rate": 1.593472839498092e-05, "loss": 0.0792, "step": 96100 }, { "epoch": 1.415663981384663, "grad_norm": 1.7382258176803589, "learning_rate": 1.5933092023013936e-05, "loss": 0.0756, "step": 96125 }, { "epoch": 1.41603216447475, "grad_norm": 1.5208255052566528, "learning_rate": 1.5931455651046954e-05, "loss": 0.0769, "step": 96150 }, { "epoch": 1.416400347564837, "grad_norm": 1.0508660078048706, "learning_rate": 1.5929819279079968e-05, "loss": 0.0625, "step": 96175 }, { "epoch": 1.416768530654924, "grad_norm": 1.800963044166565, "learning_rate": 1.5928182907112983e-05, "loss": 0.0672, "step": 96200 }, { "epoch": 1.417136713745011, "grad_norm": 1.3172813653945923, "learning_rate": 1.5926546535146e-05, "loss": 0.0733, "step": 96225 }, { "epoch": 1.4175048968350983, "grad_norm": 1.6335346698760986, "learning_rate": 1.5924910163179015e-05, "loss": 0.0747, "step": 96250 }, { "epoch": 1.4178730799251853, "grad_norm": 1.1219463348388672, "learning_rate": 1.592327379121203e-05, "loss": 0.0744, "step": 96275 }, { "epoch": 1.4182412630152723, "grad_norm": 1.4757314920425415, "learning_rate": 1.5921637419245044e-05, "loss": 0.0795, "step": 96300 }, { "epoch": 1.4186094461053593, "grad_norm": 1.4103058576583862, "learning_rate": 1.5920001047278058e-05, "loss": 0.0746, "step": 96325 }, { "epoch": 1.4189776291954463, "grad_norm": 1.389687180519104, "learning_rate": 1.5918364675311076e-05, "loss": 0.0709, "step": 96350 }, { "epoch": 1.4193458122855334, "grad_norm": 1.6277658939361572, "learning_rate": 1.591672830334409e-05, "loss": 0.0777, "step": 96375 }, { "epoch": 1.4197139953756204, "grad_norm": 1.535209059715271, "learning_rate": 1.591509193137711e-05, "loss": 0.0716, "step": 96400 }, { "epoch": 1.4200821784657074, "grad_norm": 1.2609444856643677, "learning_rate": 1.5913455559410123e-05, "loss": 0.0773, "step": 96425 }, { "epoch": 1.4204503615557944, "grad_norm": 1.3724538087844849, "learning_rate": 1.5911819187443137e-05, "loss": 0.0747, "step": 96450 }, { "epoch": 1.4208185446458814, "grad_norm": 1.3623946905136108, "learning_rate": 1.5910182815476152e-05, "loss": 0.0729, "step": 96475 }, { "epoch": 1.4211867277359684, "grad_norm": 1.5128545761108398, "learning_rate": 1.5908546443509166e-05, "loss": 0.0776, "step": 96500 }, { "epoch": 1.4215549108260555, "grad_norm": 1.3841090202331543, "learning_rate": 1.5906910071542184e-05, "loss": 0.0742, "step": 96525 }, { "epoch": 1.4219230939161427, "grad_norm": 1.6266148090362549, "learning_rate": 1.59052736995752e-05, "loss": 0.0719, "step": 96550 }, { "epoch": 1.4222912770062297, "grad_norm": 1.0477402210235596, "learning_rate": 1.5903637327608213e-05, "loss": 0.0693, "step": 96575 }, { "epoch": 1.4226594600963167, "grad_norm": 0.8229677081108093, "learning_rate": 1.590200095564123e-05, "loss": 0.0775, "step": 96600 }, { "epoch": 1.4230276431864037, "grad_norm": 1.2981600761413574, "learning_rate": 1.5900364583674245e-05, "loss": 0.0699, "step": 96625 }, { "epoch": 1.4233958262764907, "grad_norm": 1.284982681274414, "learning_rate": 1.5898728211707263e-05, "loss": 0.08, "step": 96650 }, { "epoch": 1.4237640093665778, "grad_norm": 1.4522628784179688, "learning_rate": 1.5897091839740278e-05, "loss": 0.0689, "step": 96675 }, { "epoch": 1.4241321924566648, "grad_norm": 1.519117832183838, "learning_rate": 1.5895455467773292e-05, "loss": 0.0774, "step": 96700 }, { "epoch": 1.424500375546752, "grad_norm": 1.2039984464645386, "learning_rate": 1.5893819095806306e-05, "loss": 0.0763, "step": 96725 }, { "epoch": 1.424868558636839, "grad_norm": 1.3003195524215698, "learning_rate": 1.589218272383932e-05, "loss": 0.0675, "step": 96750 }, { "epoch": 1.425236741726926, "grad_norm": 1.5068836212158203, "learning_rate": 1.589054635187234e-05, "loss": 0.0725, "step": 96775 }, { "epoch": 1.425604924817013, "grad_norm": 1.2453457117080688, "learning_rate": 1.5888909979905353e-05, "loss": 0.0769, "step": 96800 }, { "epoch": 1.4259731079071, "grad_norm": 1.1764041185379028, "learning_rate": 1.588727360793837e-05, "loss": 0.0784, "step": 96825 }, { "epoch": 1.426341290997187, "grad_norm": 1.4045989513397217, "learning_rate": 1.5885637235971386e-05, "loss": 0.0726, "step": 96850 }, { "epoch": 1.426709474087274, "grad_norm": 1.2621376514434814, "learning_rate": 1.58840008640044e-05, "loss": 0.0711, "step": 96875 }, { "epoch": 1.4270776571773611, "grad_norm": 1.2671632766723633, "learning_rate": 1.5882364492037414e-05, "loss": 0.0748, "step": 96900 }, { "epoch": 1.4274458402674481, "grad_norm": 1.3318380117416382, "learning_rate": 1.588072812007043e-05, "loss": 0.0753, "step": 96925 }, { "epoch": 1.4278140233575352, "grad_norm": 1.4506373405456543, "learning_rate": 1.5879091748103447e-05, "loss": 0.0705, "step": 96950 }, { "epoch": 1.4281822064476222, "grad_norm": 1.626297950744629, "learning_rate": 1.587745537613646e-05, "loss": 0.0747, "step": 96975 }, { "epoch": 1.4285503895377092, "grad_norm": 1.2483100891113281, "learning_rate": 1.5875819004169476e-05, "loss": 0.0774, "step": 97000 }, { "epoch": 1.4289185726277964, "grad_norm": 1.2446788549423218, "learning_rate": 1.5874182632202493e-05, "loss": 0.0716, "step": 97025 }, { "epoch": 1.4292867557178834, "grad_norm": 1.527416467666626, "learning_rate": 1.5872546260235508e-05, "loss": 0.0722, "step": 97050 }, { "epoch": 1.4296549388079705, "grad_norm": 1.4439234733581543, "learning_rate": 1.5870909888268526e-05, "loss": 0.0775, "step": 97075 }, { "epoch": 1.4300231218980575, "grad_norm": 1.5147870779037476, "learning_rate": 1.586927351630154e-05, "loss": 0.0714, "step": 97100 }, { "epoch": 1.4303913049881445, "grad_norm": 1.1291157007217407, "learning_rate": 1.5867637144334555e-05, "loss": 0.0753, "step": 97125 }, { "epoch": 1.4307594880782315, "grad_norm": 1.0651323795318604, "learning_rate": 1.586600077236757e-05, "loss": 0.075, "step": 97150 }, { "epoch": 1.4311276711683185, "grad_norm": 1.1143789291381836, "learning_rate": 1.5864364400400584e-05, "loss": 0.0735, "step": 97175 }, { "epoch": 1.4314958542584058, "grad_norm": 1.641231656074524, "learning_rate": 1.58627280284336e-05, "loss": 0.0772, "step": 97200 }, { "epoch": 1.4318640373484928, "grad_norm": 1.8680340051651, "learning_rate": 1.5861091656466616e-05, "loss": 0.0744, "step": 97225 }, { "epoch": 1.4322322204385798, "grad_norm": 1.45437490940094, "learning_rate": 1.585945528449963e-05, "loss": 0.0748, "step": 97250 }, { "epoch": 1.4326004035286668, "grad_norm": 1.2228707075119019, "learning_rate": 1.5857818912532648e-05, "loss": 0.0804, "step": 97275 }, { "epoch": 1.4329685866187538, "grad_norm": 1.3762407302856445, "learning_rate": 1.5856182540565663e-05, "loss": 0.0732, "step": 97300 }, { "epoch": 1.4333367697088408, "grad_norm": 1.4869719743728638, "learning_rate": 1.5854546168598677e-05, "loss": 0.0689, "step": 97325 }, { "epoch": 1.4337049527989278, "grad_norm": 0.9573516249656677, "learning_rate": 1.585290979663169e-05, "loss": 0.0685, "step": 97350 }, { "epoch": 1.4340731358890149, "grad_norm": 1.5761363506317139, "learning_rate": 1.585127342466471e-05, "loss": 0.0693, "step": 97375 }, { "epoch": 1.4344413189791019, "grad_norm": 1.2833172082901, "learning_rate": 1.5849637052697724e-05, "loss": 0.074, "step": 97400 }, { "epoch": 1.434809502069189, "grad_norm": 1.8005805015563965, "learning_rate": 1.584800068073074e-05, "loss": 0.0776, "step": 97425 }, { "epoch": 1.435177685159276, "grad_norm": 1.33426833152771, "learning_rate": 1.5846364308763756e-05, "loss": 0.0737, "step": 97450 }, { "epoch": 1.435545868249363, "grad_norm": 1.4123365879058838, "learning_rate": 1.584472793679677e-05, "loss": 0.0792, "step": 97475 }, { "epoch": 1.4359140513394502, "grad_norm": 1.5246069431304932, "learning_rate": 1.5843091564829785e-05, "loss": 0.0848, "step": 97500 }, { "epoch": 1.4362822344295372, "grad_norm": 1.2983479499816895, "learning_rate": 1.5841455192862803e-05, "loss": 0.0747, "step": 97525 }, { "epoch": 1.4366504175196242, "grad_norm": 0.9336724281311035, "learning_rate": 1.5839818820895817e-05, "loss": 0.0715, "step": 97550 }, { "epoch": 1.4370186006097112, "grad_norm": 1.1193267107009888, "learning_rate": 1.5838182448928832e-05, "loss": 0.0701, "step": 97575 }, { "epoch": 1.4373867836997982, "grad_norm": 1.6512045860290527, "learning_rate": 1.5836546076961846e-05, "loss": 0.0754, "step": 97600 }, { "epoch": 1.4377549667898852, "grad_norm": 1.604440689086914, "learning_rate": 1.5834909704994864e-05, "loss": 0.0748, "step": 97625 }, { "epoch": 1.4381231498799723, "grad_norm": 1.7810752391815186, "learning_rate": 1.583327333302788e-05, "loss": 0.0746, "step": 97650 }, { "epoch": 1.4384913329700595, "grad_norm": 1.3520724773406982, "learning_rate": 1.5831636961060893e-05, "loss": 0.0713, "step": 97675 }, { "epoch": 1.4388595160601465, "grad_norm": 1.6861071586608887, "learning_rate": 1.583000058909391e-05, "loss": 0.0752, "step": 97700 }, { "epoch": 1.4392276991502335, "grad_norm": 1.742470383644104, "learning_rate": 1.5828364217126925e-05, "loss": 0.0672, "step": 97725 }, { "epoch": 1.4395958822403205, "grad_norm": 1.4519072771072388, "learning_rate": 1.582672784515994e-05, "loss": 0.0748, "step": 97750 }, { "epoch": 1.4399640653304075, "grad_norm": 1.3369940519332886, "learning_rate": 1.5825091473192954e-05, "loss": 0.0781, "step": 97775 }, { "epoch": 1.4403322484204946, "grad_norm": 1.3350610733032227, "learning_rate": 1.5823455101225972e-05, "loss": 0.0748, "step": 97800 }, { "epoch": 1.4407004315105816, "grad_norm": 1.5795238018035889, "learning_rate": 1.5821818729258987e-05, "loss": 0.0776, "step": 97825 }, { "epoch": 1.4410686146006686, "grad_norm": 1.3560709953308105, "learning_rate": 1.5820182357292e-05, "loss": 0.0639, "step": 97850 }, { "epoch": 1.4414367976907556, "grad_norm": 1.3758741617202759, "learning_rate": 1.581854598532502e-05, "loss": 0.0782, "step": 97875 }, { "epoch": 1.4418049807808426, "grad_norm": 1.6172165870666504, "learning_rate": 1.5816909613358033e-05, "loss": 0.0776, "step": 97900 }, { "epoch": 1.4421731638709296, "grad_norm": 1.5455402135849, "learning_rate": 1.5815273241391048e-05, "loss": 0.0779, "step": 97925 }, { "epoch": 1.4425413469610167, "grad_norm": 1.3516970872879028, "learning_rate": 1.5813636869424062e-05, "loss": 0.0714, "step": 97950 }, { "epoch": 1.442909530051104, "grad_norm": 1.3554434776306152, "learning_rate": 1.5812000497457077e-05, "loss": 0.0723, "step": 97975 }, { "epoch": 1.443277713141191, "grad_norm": 1.6780847311019897, "learning_rate": 1.5810364125490095e-05, "loss": 0.0789, "step": 98000 }, { "epoch": 1.443645896231278, "grad_norm": 1.2315202951431274, "learning_rate": 1.580872775352311e-05, "loss": 0.0773, "step": 98025 }, { "epoch": 1.444014079321365, "grad_norm": 1.3949530124664307, "learning_rate": 1.5807091381556127e-05, "loss": 0.071, "step": 98050 }, { "epoch": 1.444382262411452, "grad_norm": 1.6291261911392212, "learning_rate": 1.580545500958914e-05, "loss": 0.0813, "step": 98075 }, { "epoch": 1.444750445501539, "grad_norm": 1.2096186876296997, "learning_rate": 1.5803818637622156e-05, "loss": 0.0726, "step": 98100 }, { "epoch": 1.445118628591626, "grad_norm": 1.509542465209961, "learning_rate": 1.5802247720533853e-05, "loss": 0.0743, "step": 98125 }, { "epoch": 1.445486811681713, "grad_norm": 1.62225341796875, "learning_rate": 1.5800611348566868e-05, "loss": 0.0763, "step": 98150 }, { "epoch": 1.4458549947718002, "grad_norm": 1.1660709381103516, "learning_rate": 1.5798974976599882e-05, "loss": 0.0654, "step": 98175 }, { "epoch": 1.4462231778618873, "grad_norm": 1.1793895959854126, "learning_rate": 1.5797338604632896e-05, "loss": 0.0696, "step": 98200 }, { "epoch": 1.4465913609519743, "grad_norm": 1.0981831550598145, "learning_rate": 1.5795702232665914e-05, "loss": 0.077, "step": 98225 }, { "epoch": 1.4469595440420613, "grad_norm": 1.3388949632644653, "learning_rate": 1.579406586069893e-05, "loss": 0.0803, "step": 98250 }, { "epoch": 1.4473277271321483, "grad_norm": 1.572522521018982, "learning_rate": 1.5792429488731943e-05, "loss": 0.0742, "step": 98275 }, { "epoch": 1.4476959102222353, "grad_norm": 1.7953388690948486, "learning_rate": 1.579079311676496e-05, "loss": 0.0715, "step": 98300 }, { "epoch": 1.4480640933123223, "grad_norm": 1.402962327003479, "learning_rate": 1.5789156744797976e-05, "loss": 0.0797, "step": 98325 }, { "epoch": 1.4484322764024093, "grad_norm": 1.1952656507492065, "learning_rate": 1.578752037283099e-05, "loss": 0.0732, "step": 98350 }, { "epoch": 1.4488004594924964, "grad_norm": 1.5005369186401367, "learning_rate": 1.5785884000864004e-05, "loss": 0.0657, "step": 98375 }, { "epoch": 1.4491686425825834, "grad_norm": 1.3722046613693237, "learning_rate": 1.578424762889702e-05, "loss": 0.0741, "step": 98400 }, { "epoch": 1.4495368256726704, "grad_norm": 1.3201954364776611, "learning_rate": 1.5782611256930037e-05, "loss": 0.0702, "step": 98425 }, { "epoch": 1.4499050087627574, "grad_norm": 1.0372276306152344, "learning_rate": 1.578097488496305e-05, "loss": 0.0694, "step": 98450 }, { "epoch": 1.4502731918528446, "grad_norm": 1.4597052335739136, "learning_rate": 1.577933851299607e-05, "loss": 0.0709, "step": 98475 }, { "epoch": 1.4506413749429317, "grad_norm": 1.759572982788086, "learning_rate": 1.5777702141029083e-05, "loss": 0.0742, "step": 98500 }, { "epoch": 1.4510095580330187, "grad_norm": 1.263596773147583, "learning_rate": 1.5776065769062098e-05, "loss": 0.0752, "step": 98525 }, { "epoch": 1.4513777411231057, "grad_norm": 1.363990306854248, "learning_rate": 1.5774429397095116e-05, "loss": 0.0789, "step": 98550 }, { "epoch": 1.4517459242131927, "grad_norm": 1.1553142070770264, "learning_rate": 1.577279302512813e-05, "loss": 0.0785, "step": 98575 }, { "epoch": 1.4521141073032797, "grad_norm": 1.4600054025650024, "learning_rate": 1.5771156653161145e-05, "loss": 0.0775, "step": 98600 }, { "epoch": 1.4524822903933667, "grad_norm": 1.1618602275848389, "learning_rate": 1.576952028119416e-05, "loss": 0.0708, "step": 98625 }, { "epoch": 1.452850473483454, "grad_norm": 1.7354905605316162, "learning_rate": 1.5767883909227174e-05, "loss": 0.0749, "step": 98650 }, { "epoch": 1.453218656573541, "grad_norm": 1.7131768465042114, "learning_rate": 1.576624753726019e-05, "loss": 0.068, "step": 98675 }, { "epoch": 1.453586839663628, "grad_norm": 1.8787506818771362, "learning_rate": 1.5764611165293206e-05, "loss": 0.0718, "step": 98700 }, { "epoch": 1.453955022753715, "grad_norm": 1.067794680595398, "learning_rate": 1.5762974793326224e-05, "loss": 0.0734, "step": 98725 }, { "epoch": 1.454323205843802, "grad_norm": 1.3868683576583862, "learning_rate": 1.5761338421359238e-05, "loss": 0.0735, "step": 98750 }, { "epoch": 1.454691388933889, "grad_norm": 1.3692728281021118, "learning_rate": 1.5759702049392253e-05, "loss": 0.0748, "step": 98775 }, { "epoch": 1.455059572023976, "grad_norm": 1.8034658432006836, "learning_rate": 1.5758065677425267e-05, "loss": 0.0726, "step": 98800 }, { "epoch": 1.455427755114063, "grad_norm": 1.0928919315338135, "learning_rate": 1.575642930545828e-05, "loss": 0.0727, "step": 98825 }, { "epoch": 1.45579593820415, "grad_norm": 1.397918701171875, "learning_rate": 1.57547929334913e-05, "loss": 0.0785, "step": 98850 }, { "epoch": 1.4561641212942371, "grad_norm": 1.60765540599823, "learning_rate": 1.5753156561524314e-05, "loss": 0.0744, "step": 98875 }, { "epoch": 1.4565323043843241, "grad_norm": 1.329447865486145, "learning_rate": 1.575152018955733e-05, "loss": 0.0697, "step": 98900 }, { "epoch": 1.4569004874744111, "grad_norm": 1.276035189628601, "learning_rate": 1.5749883817590346e-05, "loss": 0.0698, "step": 98925 }, { "epoch": 1.4572686705644984, "grad_norm": 1.4128130674362183, "learning_rate": 1.574824744562336e-05, "loss": 0.0733, "step": 98950 }, { "epoch": 1.4576368536545854, "grad_norm": 1.2025258541107178, "learning_rate": 1.574661107365638e-05, "loss": 0.0679, "step": 98975 }, { "epoch": 1.4580050367446724, "grad_norm": 1.434647798538208, "learning_rate": 1.5744974701689393e-05, "loss": 0.0675, "step": 99000 }, { "epoch": 1.4583732198347594, "grad_norm": 1.061908483505249, "learning_rate": 1.5743338329722407e-05, "loss": 0.0791, "step": 99025 }, { "epoch": 1.4587414029248464, "grad_norm": 1.6741228103637695, "learning_rate": 1.5741701957755422e-05, "loss": 0.0755, "step": 99050 }, { "epoch": 1.4591095860149335, "grad_norm": 1.8702377080917358, "learning_rate": 1.5740065585788436e-05, "loss": 0.0736, "step": 99075 }, { "epoch": 1.4594777691050205, "grad_norm": 1.4650659561157227, "learning_rate": 1.5738429213821454e-05, "loss": 0.0767, "step": 99100 }, { "epoch": 1.4598459521951077, "grad_norm": 1.5197646617889404, "learning_rate": 1.573679284185447e-05, "loss": 0.0723, "step": 99125 }, { "epoch": 1.4602141352851947, "grad_norm": 1.5481562614440918, "learning_rate": 1.5735156469887486e-05, "loss": 0.0769, "step": 99150 }, { "epoch": 1.4605823183752817, "grad_norm": 1.1672195196151733, "learning_rate": 1.57335200979205e-05, "loss": 0.0732, "step": 99175 }, { "epoch": 1.4609505014653688, "grad_norm": 1.4054820537567139, "learning_rate": 1.5731883725953515e-05, "loss": 0.0703, "step": 99200 }, { "epoch": 1.4613186845554558, "grad_norm": 1.4027718305587769, "learning_rate": 1.573024735398653e-05, "loss": 0.0793, "step": 99225 }, { "epoch": 1.4616868676455428, "grad_norm": 1.2779954671859741, "learning_rate": 1.5728610982019544e-05, "loss": 0.0731, "step": 99250 }, { "epoch": 1.4620550507356298, "grad_norm": 1.266919732093811, "learning_rate": 1.5726974610052562e-05, "loss": 0.0654, "step": 99275 }, { "epoch": 1.4624232338257168, "grad_norm": 0.884549617767334, "learning_rate": 1.5725338238085577e-05, "loss": 0.0721, "step": 99300 }, { "epoch": 1.4627914169158038, "grad_norm": 1.6032646894454956, "learning_rate": 1.572370186611859e-05, "loss": 0.0726, "step": 99325 }, { "epoch": 1.4631596000058908, "grad_norm": 1.942748785018921, "learning_rate": 1.572206549415161e-05, "loss": 0.0752, "step": 99350 }, { "epoch": 1.4635277830959779, "grad_norm": 1.341539740562439, "learning_rate": 1.5720429122184623e-05, "loss": 0.0765, "step": 99375 }, { "epoch": 1.4638959661860649, "grad_norm": 1.8556504249572754, "learning_rate": 1.571879275021764e-05, "loss": 0.0688, "step": 99400 }, { "epoch": 1.4642641492761521, "grad_norm": 1.3915098905563354, "learning_rate": 1.5717156378250652e-05, "loss": 0.081, "step": 99425 }, { "epoch": 1.4646323323662391, "grad_norm": 1.2533347606658936, "learning_rate": 1.571552000628367e-05, "loss": 0.067, "step": 99450 }, { "epoch": 1.4650005154563261, "grad_norm": 1.229148268699646, "learning_rate": 1.5713883634316685e-05, "loss": 0.0707, "step": 99475 }, { "epoch": 1.4653686985464132, "grad_norm": 1.8005704879760742, "learning_rate": 1.57122472623497e-05, "loss": 0.0785, "step": 99500 }, { "epoch": 1.4657368816365002, "grad_norm": 1.70078706741333, "learning_rate": 1.5710610890382717e-05, "loss": 0.0715, "step": 99525 }, { "epoch": 1.4661050647265872, "grad_norm": 1.2894269227981567, "learning_rate": 1.570897451841573e-05, "loss": 0.075, "step": 99550 }, { "epoch": 1.4664732478166742, "grad_norm": 1.5065540075302124, "learning_rate": 1.5707338146448746e-05, "loss": 0.0765, "step": 99575 }, { "epoch": 1.4668414309067614, "grad_norm": 1.3765555620193481, "learning_rate": 1.5705701774481764e-05, "loss": 0.0656, "step": 99600 }, { "epoch": 1.4672096139968485, "grad_norm": 1.5339314937591553, "learning_rate": 1.5704065402514778e-05, "loss": 0.0695, "step": 99625 }, { "epoch": 1.4675777970869355, "grad_norm": 1.2181205749511719, "learning_rate": 1.5702429030547793e-05, "loss": 0.0708, "step": 99650 }, { "epoch": 1.4679459801770225, "grad_norm": 1.615257740020752, "learning_rate": 1.5700792658580807e-05, "loss": 0.0829, "step": 99675 }, { "epoch": 1.4683141632671095, "grad_norm": 1.094374179840088, "learning_rate": 1.5699156286613825e-05, "loss": 0.0719, "step": 99700 }, { "epoch": 1.4686823463571965, "grad_norm": 1.393290638923645, "learning_rate": 1.569751991464684e-05, "loss": 0.0781, "step": 99725 }, { "epoch": 1.4690505294472835, "grad_norm": 1.2727437019348145, "learning_rate": 1.5695883542679854e-05, "loss": 0.0736, "step": 99750 }, { "epoch": 1.4694187125373706, "grad_norm": 1.0170605182647705, "learning_rate": 1.569424717071287e-05, "loss": 0.0726, "step": 99775 }, { "epoch": 1.4697868956274576, "grad_norm": 1.6831586360931396, "learning_rate": 1.5692610798745886e-05, "loss": 0.0676, "step": 99800 }, { "epoch": 1.4701550787175446, "grad_norm": 1.3987754583358765, "learning_rate": 1.56909744267789e-05, "loss": 0.0711, "step": 99825 }, { "epoch": 1.4705232618076316, "grad_norm": 1.411293387413025, "learning_rate": 1.5689338054811915e-05, "loss": 0.0754, "step": 99850 }, { "epoch": 1.4708914448977186, "grad_norm": 1.3667187690734863, "learning_rate": 1.5687701682844933e-05, "loss": 0.0687, "step": 99875 }, { "epoch": 1.4712596279878059, "grad_norm": 1.6855864524841309, "learning_rate": 1.5686065310877947e-05, "loss": 0.0752, "step": 99900 }, { "epoch": 1.4716278110778929, "grad_norm": 1.7509233951568604, "learning_rate": 1.5684428938910962e-05, "loss": 0.07, "step": 99925 }, { "epoch": 1.4719959941679799, "grad_norm": 1.9707937240600586, "learning_rate": 1.568279256694398e-05, "loss": 0.074, "step": 99950 }, { "epoch": 1.472364177258067, "grad_norm": 1.3641451597213745, "learning_rate": 1.5681156194976994e-05, "loss": 0.0726, "step": 99975 }, { "epoch": 1.472732360348154, "grad_norm": 1.4700021743774414, "learning_rate": 1.567951982301001e-05, "loss": 0.0637, "step": 100000 }, { "epoch": 1.473100543438241, "grad_norm": 1.1824214458465576, "learning_rate": 1.5677883451043026e-05, "loss": 0.0706, "step": 100025 }, { "epoch": 1.473468726528328, "grad_norm": 1.4108469486236572, "learning_rate": 1.567624707907604e-05, "loss": 0.075, "step": 100050 }, { "epoch": 1.4738369096184152, "grad_norm": 1.175336480140686, "learning_rate": 1.5674610707109055e-05, "loss": 0.0706, "step": 100075 }, { "epoch": 1.4742050927085022, "grad_norm": 1.4789600372314453, "learning_rate": 1.567297433514207e-05, "loss": 0.0667, "step": 100100 }, { "epoch": 1.4745732757985892, "grad_norm": 2.167008876800537, "learning_rate": 1.5671337963175088e-05, "loss": 0.0758, "step": 100125 }, { "epoch": 1.4749414588886762, "grad_norm": 1.716241717338562, "learning_rate": 1.5669701591208102e-05, "loss": 0.0793, "step": 100150 }, { "epoch": 1.4753096419787632, "grad_norm": 1.2016184329986572, "learning_rate": 1.5668065219241117e-05, "loss": 0.0689, "step": 100175 }, { "epoch": 1.4756778250688503, "grad_norm": 0.9062038660049438, "learning_rate": 1.5666428847274134e-05, "loss": 0.0727, "step": 100200 }, { "epoch": 1.4760460081589373, "grad_norm": 1.2083696126937866, "learning_rate": 1.566479247530715e-05, "loss": 0.0664, "step": 100225 }, { "epoch": 1.4764141912490243, "grad_norm": 1.3190200328826904, "learning_rate": 1.5663156103340163e-05, "loss": 0.0693, "step": 100250 }, { "epoch": 1.4767823743391113, "grad_norm": 1.0811551809310913, "learning_rate": 1.5661519731373178e-05, "loss": 0.0656, "step": 100275 }, { "epoch": 1.4771505574291983, "grad_norm": 1.033502221107483, "learning_rate": 1.5659883359406192e-05, "loss": 0.0634, "step": 100300 }, { "epoch": 1.4775187405192853, "grad_norm": 0.956745982170105, "learning_rate": 1.565824698743921e-05, "loss": 0.0708, "step": 100325 }, { "epoch": 1.4778869236093723, "grad_norm": 1.2435214519500732, "learning_rate": 1.5656610615472224e-05, "loss": 0.0681, "step": 100350 }, { "epoch": 1.4782551066994596, "grad_norm": 1.8292759656906128, "learning_rate": 1.5654974243505242e-05, "loss": 0.0781, "step": 100375 }, { "epoch": 1.4786232897895466, "grad_norm": 1.631939172744751, "learning_rate": 1.5653337871538257e-05, "loss": 0.0663, "step": 100400 }, { "epoch": 1.4789914728796336, "grad_norm": 0.8636593818664551, "learning_rate": 1.565170149957127e-05, "loss": 0.0727, "step": 100425 }, { "epoch": 1.4793596559697206, "grad_norm": 1.6664172410964966, "learning_rate": 1.565006512760429e-05, "loss": 0.073, "step": 100450 }, { "epoch": 1.4797278390598076, "grad_norm": 1.6664351224899292, "learning_rate": 1.5648428755637304e-05, "loss": 0.076, "step": 100475 }, { "epoch": 1.4800960221498947, "grad_norm": 2.1522881984710693, "learning_rate": 1.5646792383670318e-05, "loss": 0.0765, "step": 100500 }, { "epoch": 1.4804642052399817, "grad_norm": 1.7682160139083862, "learning_rate": 1.5645156011703332e-05, "loss": 0.0716, "step": 100525 }, { "epoch": 1.4808323883300687, "grad_norm": 1.4331692457199097, "learning_rate": 1.564351963973635e-05, "loss": 0.0725, "step": 100550 }, { "epoch": 1.481200571420156, "grad_norm": 1.4662048816680908, "learning_rate": 1.5641883267769365e-05, "loss": 0.0809, "step": 100575 }, { "epoch": 1.481568754510243, "grad_norm": 1.7781012058258057, "learning_rate": 1.564024689580238e-05, "loss": 0.0739, "step": 100600 }, { "epoch": 1.48193693760033, "grad_norm": 1.4978768825531006, "learning_rate": 1.5638610523835397e-05, "loss": 0.0735, "step": 100625 }, { "epoch": 1.482305120690417, "grad_norm": 1.1126130819320679, "learning_rate": 1.563697415186841e-05, "loss": 0.0756, "step": 100650 }, { "epoch": 1.482673303780504, "grad_norm": 1.252961277961731, "learning_rate": 1.5635337779901426e-05, "loss": 0.0745, "step": 100675 }, { "epoch": 1.483041486870591, "grad_norm": 1.9854118824005127, "learning_rate": 1.563370140793444e-05, "loss": 0.0734, "step": 100700 }, { "epoch": 1.483409669960678, "grad_norm": 1.6315757036209106, "learning_rate": 1.5632065035967455e-05, "loss": 0.0727, "step": 100725 }, { "epoch": 1.483777853050765, "grad_norm": 1.4408893585205078, "learning_rate": 1.5630428664000473e-05, "loss": 0.0801, "step": 100750 }, { "epoch": 1.484146036140852, "grad_norm": 1.215943455696106, "learning_rate": 1.5628792292033487e-05, "loss": 0.0655, "step": 100775 }, { "epoch": 1.484514219230939, "grad_norm": 1.3008314371109009, "learning_rate": 1.5627155920066505e-05, "loss": 0.0714, "step": 100800 }, { "epoch": 1.484882402321026, "grad_norm": 1.7536879777908325, "learning_rate": 1.562551954809952e-05, "loss": 0.0692, "step": 100825 }, { "epoch": 1.485250585411113, "grad_norm": 0.9951906204223633, "learning_rate": 1.5623883176132534e-05, "loss": 0.0805, "step": 100850 }, { "epoch": 1.4856187685012003, "grad_norm": 1.1625759601593018, "learning_rate": 1.5622246804165552e-05, "loss": 0.0697, "step": 100875 }, { "epoch": 1.4859869515912874, "grad_norm": 1.129320740699768, "learning_rate": 1.5620675887077242e-05, "loss": 0.0737, "step": 100900 }, { "epoch": 1.4863551346813744, "grad_norm": 1.3437418937683105, "learning_rate": 1.561903951511026e-05, "loss": 0.0706, "step": 100925 }, { "epoch": 1.4867233177714614, "grad_norm": 1.1932883262634277, "learning_rate": 1.5617403143143275e-05, "loss": 0.0731, "step": 100950 }, { "epoch": 1.4870915008615484, "grad_norm": 1.018709421157837, "learning_rate": 1.561576677117629e-05, "loss": 0.074, "step": 100975 }, { "epoch": 1.4874596839516354, "grad_norm": 1.311795949935913, "learning_rate": 1.5614130399209307e-05, "loss": 0.0732, "step": 101000 }, { "epoch": 1.4878278670417224, "grad_norm": 1.0393089056015015, "learning_rate": 1.561249402724232e-05, "loss": 0.0675, "step": 101025 }, { "epoch": 1.4881960501318097, "grad_norm": 1.8284398317337036, "learning_rate": 1.561085765527534e-05, "loss": 0.0684, "step": 101050 }, { "epoch": 1.4885642332218967, "grad_norm": 1.3828203678131104, "learning_rate": 1.5609221283308354e-05, "loss": 0.0733, "step": 101075 }, { "epoch": 1.4889324163119837, "grad_norm": 1.7602605819702148, "learning_rate": 1.5607584911341368e-05, "loss": 0.0748, "step": 101100 }, { "epoch": 1.4893005994020707, "grad_norm": 1.1913679838180542, "learning_rate": 1.5605948539374383e-05, "loss": 0.0809, "step": 101125 }, { "epoch": 1.4896687824921577, "grad_norm": 1.6008341312408447, "learning_rate": 1.5604312167407397e-05, "loss": 0.076, "step": 101150 }, { "epoch": 1.4900369655822447, "grad_norm": 1.4985178709030151, "learning_rate": 1.5602675795440415e-05, "loss": 0.0812, "step": 101175 }, { "epoch": 1.4904051486723318, "grad_norm": 1.3773492574691772, "learning_rate": 1.560103942347343e-05, "loss": 0.0805, "step": 101200 }, { "epoch": 1.4907733317624188, "grad_norm": 1.7686105966567993, "learning_rate": 1.5599403051506444e-05, "loss": 0.0765, "step": 101225 }, { "epoch": 1.4911415148525058, "grad_norm": 1.4000052213668823, "learning_rate": 1.559776667953946e-05, "loss": 0.0697, "step": 101250 }, { "epoch": 1.4915096979425928, "grad_norm": 1.6129021644592285, "learning_rate": 1.5596130307572476e-05, "loss": 0.0766, "step": 101275 }, { "epoch": 1.4918778810326798, "grad_norm": 1.3519954681396484, "learning_rate": 1.5594493935605494e-05, "loss": 0.0759, "step": 101300 }, { "epoch": 1.4922460641227668, "grad_norm": 1.3702207803726196, "learning_rate": 1.5592857563638505e-05, "loss": 0.0751, "step": 101325 }, { "epoch": 1.492614247212854, "grad_norm": 1.0622360706329346, "learning_rate": 1.5591221191671523e-05, "loss": 0.0696, "step": 101350 }, { "epoch": 1.492982430302941, "grad_norm": 1.2110073566436768, "learning_rate": 1.5589584819704537e-05, "loss": 0.0693, "step": 101375 }, { "epoch": 1.493350613393028, "grad_norm": 1.026301383972168, "learning_rate": 1.5587948447737552e-05, "loss": 0.0751, "step": 101400 }, { "epoch": 1.4937187964831151, "grad_norm": 1.6028214693069458, "learning_rate": 1.558631207577057e-05, "loss": 0.0696, "step": 101425 }, { "epoch": 1.4940869795732021, "grad_norm": 1.1455265283584595, "learning_rate": 1.5584675703803584e-05, "loss": 0.0682, "step": 101450 }, { "epoch": 1.4944551626632891, "grad_norm": 1.6425275802612305, "learning_rate": 1.55830393318366e-05, "loss": 0.0745, "step": 101475 }, { "epoch": 1.4948233457533762, "grad_norm": 0.8957460522651672, "learning_rate": 1.5581402959869616e-05, "loss": 0.0761, "step": 101500 }, { "epoch": 1.4951915288434634, "grad_norm": 1.350748062133789, "learning_rate": 1.557976658790263e-05, "loss": 0.0737, "step": 101525 }, { "epoch": 1.4955597119335504, "grad_norm": 1.6226164102554321, "learning_rate": 1.5578130215935645e-05, "loss": 0.0698, "step": 101550 }, { "epoch": 1.4959278950236374, "grad_norm": 1.4175498485565186, "learning_rate": 1.557649384396866e-05, "loss": 0.0714, "step": 101575 }, { "epoch": 1.4962960781137244, "grad_norm": 1.5930166244506836, "learning_rate": 1.5574857472001678e-05, "loss": 0.0827, "step": 101600 }, { "epoch": 1.4966642612038115, "grad_norm": 1.288759469985962, "learning_rate": 1.5573221100034692e-05, "loss": 0.0778, "step": 101625 }, { "epoch": 1.4970324442938985, "grad_norm": 1.360939860343933, "learning_rate": 1.5571584728067707e-05, "loss": 0.067, "step": 101650 }, { "epoch": 1.4974006273839855, "grad_norm": 1.3344268798828125, "learning_rate": 1.5569948356100724e-05, "loss": 0.0806, "step": 101675 }, { "epoch": 1.4977688104740725, "grad_norm": 1.3413747549057007, "learning_rate": 1.556831198413374e-05, "loss": 0.0731, "step": 101700 }, { "epoch": 1.4981369935641595, "grad_norm": 1.6871967315673828, "learning_rate": 1.5566675612166757e-05, "loss": 0.0802, "step": 101725 }, { "epoch": 1.4985051766542465, "grad_norm": 1.1810795068740845, "learning_rate": 1.5565039240199768e-05, "loss": 0.0695, "step": 101750 }, { "epoch": 1.4988733597443336, "grad_norm": 1.5120656490325928, "learning_rate": 1.5563402868232786e-05, "loss": 0.0692, "step": 101775 }, { "epoch": 1.4992415428344206, "grad_norm": 1.4127033948898315, "learning_rate": 1.55617664962658e-05, "loss": 0.0709, "step": 101800 }, { "epoch": 1.4996097259245078, "grad_norm": 1.239804744720459, "learning_rate": 1.5560130124298814e-05, "loss": 0.0706, "step": 101825 }, { "epoch": 1.4999779090145948, "grad_norm": 1.615360140800476, "learning_rate": 1.5558493752331832e-05, "loss": 0.0744, "step": 101850 }, { "epoch": 1.5003460921046818, "grad_norm": 1.5092477798461914, "learning_rate": 1.5556857380364847e-05, "loss": 0.0734, "step": 101875 }, { "epoch": 1.5007142751947689, "grad_norm": 1.6290446519851685, "learning_rate": 1.555522100839786e-05, "loss": 0.0638, "step": 101900 }, { "epoch": 1.5010824582848559, "grad_norm": 1.972309947013855, "learning_rate": 1.555358463643088e-05, "loss": 0.0671, "step": 101925 }, { "epoch": 1.5014506413749429, "grad_norm": 1.2505921125411987, "learning_rate": 1.5551948264463894e-05, "loss": 0.0706, "step": 101950 }, { "epoch": 1.5018188244650301, "grad_norm": 1.4129692316055298, "learning_rate": 1.5550311892496908e-05, "loss": 0.0817, "step": 101975 }, { "epoch": 1.5021870075551171, "grad_norm": 1.4573155641555786, "learning_rate": 1.5548675520529922e-05, "loss": 0.0755, "step": 102000 }, { "epoch": 1.5025551906452042, "grad_norm": 1.9313448667526245, "learning_rate": 1.554703914856294e-05, "loss": 0.0696, "step": 102025 }, { "epoch": 1.5029233737352912, "grad_norm": 1.624696969985962, "learning_rate": 1.5545402776595955e-05, "loss": 0.0798, "step": 102050 }, { "epoch": 1.5032915568253782, "grad_norm": 1.3338876962661743, "learning_rate": 1.554376640462897e-05, "loss": 0.0748, "step": 102075 }, { "epoch": 1.5036597399154652, "grad_norm": 1.4414469003677368, "learning_rate": 1.5542130032661987e-05, "loss": 0.07, "step": 102100 }, { "epoch": 1.5040279230055522, "grad_norm": 1.9996525049209595, "learning_rate": 1.5540493660695e-05, "loss": 0.0758, "step": 102125 }, { "epoch": 1.5043961060956392, "grad_norm": 1.3696807622909546, "learning_rate": 1.5538857288728016e-05, "loss": 0.071, "step": 102150 }, { "epoch": 1.5047642891857262, "grad_norm": 1.4292805194854736, "learning_rate": 1.553722091676103e-05, "loss": 0.0688, "step": 102175 }, { "epoch": 1.5051324722758133, "grad_norm": 1.372391939163208, "learning_rate": 1.5535584544794048e-05, "loss": 0.0711, "step": 102200 }, { "epoch": 1.5055006553659003, "grad_norm": 1.611112356185913, "learning_rate": 1.5533948172827063e-05, "loss": 0.0711, "step": 102225 }, { "epoch": 1.5058688384559873, "grad_norm": 1.2051068544387817, "learning_rate": 1.5532311800860077e-05, "loss": 0.0723, "step": 102250 }, { "epoch": 1.5062370215460743, "grad_norm": 1.5507503747940063, "learning_rate": 1.5530675428893095e-05, "loss": 0.0714, "step": 102275 }, { "epoch": 1.5066052046361613, "grad_norm": 1.495367407798767, "learning_rate": 1.552903905692611e-05, "loss": 0.0691, "step": 102300 }, { "epoch": 1.5069733877262486, "grad_norm": 1.495174765586853, "learning_rate": 1.5527402684959124e-05, "loss": 0.0744, "step": 102325 }, { "epoch": 1.5073415708163356, "grad_norm": 1.657590389251709, "learning_rate": 1.5525766312992142e-05, "loss": 0.07, "step": 102350 }, { "epoch": 1.5077097539064226, "grad_norm": 1.5004724264144897, "learning_rate": 1.5524129941025153e-05, "loss": 0.0714, "step": 102375 }, { "epoch": 1.5080779369965096, "grad_norm": 1.940828800201416, "learning_rate": 1.552249356905817e-05, "loss": 0.0703, "step": 102400 }, { "epoch": 1.5084461200865966, "grad_norm": 1.0247461795806885, "learning_rate": 1.5520857197091185e-05, "loss": 0.0778, "step": 102425 }, { "epoch": 1.5088143031766839, "grad_norm": 1.2785711288452148, "learning_rate": 1.5519220825124203e-05, "loss": 0.0703, "step": 102450 }, { "epoch": 1.5091824862667709, "grad_norm": 1.5319916009902954, "learning_rate": 1.5517584453157217e-05, "loss": 0.0749, "step": 102475 }, { "epoch": 1.5095506693568579, "grad_norm": 1.5376935005187988, "learning_rate": 1.5515948081190232e-05, "loss": 0.0714, "step": 102500 }, { "epoch": 1.509918852446945, "grad_norm": 1.0213650465011597, "learning_rate": 1.551431170922325e-05, "loss": 0.0616, "step": 102525 }, { "epoch": 1.510287035537032, "grad_norm": 1.327736258506775, "learning_rate": 1.5512675337256264e-05, "loss": 0.0739, "step": 102550 }, { "epoch": 1.510655218627119, "grad_norm": 1.6903321743011475, "learning_rate": 1.551103896528928e-05, "loss": 0.0681, "step": 102575 }, { "epoch": 1.511023401717206, "grad_norm": 1.4521526098251343, "learning_rate": 1.5509402593322293e-05, "loss": 0.0686, "step": 102600 }, { "epoch": 1.511391584807293, "grad_norm": 1.2098474502563477, "learning_rate": 1.5507766221355308e-05, "loss": 0.0688, "step": 102625 }, { "epoch": 1.51175976789738, "grad_norm": 1.935627818107605, "learning_rate": 1.5506129849388325e-05, "loss": 0.0706, "step": 102650 }, { "epoch": 1.512127950987467, "grad_norm": 1.023953914642334, "learning_rate": 1.550449347742134e-05, "loss": 0.0678, "step": 102675 }, { "epoch": 1.512496134077554, "grad_norm": 1.8038815259933472, "learning_rate": 1.5502857105454358e-05, "loss": 0.0838, "step": 102700 }, { "epoch": 1.512864317167641, "grad_norm": 1.3764445781707764, "learning_rate": 1.5501220733487372e-05, "loss": 0.0657, "step": 102725 }, { "epoch": 1.513232500257728, "grad_norm": 1.6889528036117554, "learning_rate": 1.5499584361520387e-05, "loss": 0.0764, "step": 102750 }, { "epoch": 1.513600683347815, "grad_norm": 1.3472950458526611, "learning_rate": 1.5497947989553405e-05, "loss": 0.0717, "step": 102775 }, { "epoch": 1.5139688664379023, "grad_norm": 1.6028352975845337, "learning_rate": 1.5496311617586416e-05, "loss": 0.0812, "step": 102800 }, { "epoch": 1.5143370495279893, "grad_norm": 0.8961876630783081, "learning_rate": 1.5494675245619433e-05, "loss": 0.0797, "step": 102825 }, { "epoch": 1.5147052326180763, "grad_norm": 1.5843199491500854, "learning_rate": 1.5493038873652448e-05, "loss": 0.0757, "step": 102850 }, { "epoch": 1.5150734157081633, "grad_norm": 1.6867176294326782, "learning_rate": 1.5491402501685466e-05, "loss": 0.0752, "step": 102875 }, { "epoch": 1.5154415987982504, "grad_norm": 1.3222062587738037, "learning_rate": 1.548976612971848e-05, "loss": 0.0702, "step": 102900 }, { "epoch": 1.5158097818883376, "grad_norm": 1.1368613243103027, "learning_rate": 1.5488129757751495e-05, "loss": 0.075, "step": 102925 }, { "epoch": 1.5161779649784246, "grad_norm": 1.2368009090423584, "learning_rate": 1.5486493385784513e-05, "loss": 0.0755, "step": 102950 }, { "epoch": 1.5165461480685116, "grad_norm": Infinity, "learning_rate": 1.5484922468696206e-05, "loss": 0.0694, "step": 102975 }, { "epoch": 1.5169143311585986, "grad_norm": 1.538528323173523, "learning_rate": 1.548328609672922e-05, "loss": 0.0783, "step": 103000 }, { "epoch": 1.5172825142486857, "grad_norm": 1.6415791511535645, "learning_rate": 1.5481649724762235e-05, "loss": 0.0702, "step": 103025 }, { "epoch": 1.5176506973387727, "grad_norm": 1.1010034084320068, "learning_rate": 1.548001335279525e-05, "loss": 0.0813, "step": 103050 }, { "epoch": 1.5180188804288597, "grad_norm": 1.3240681886672974, "learning_rate": 1.5478376980828268e-05, "loss": 0.0798, "step": 103075 }, { "epoch": 1.5183870635189467, "grad_norm": 1.6091612577438354, "learning_rate": 1.5476740608861282e-05, "loss": 0.0683, "step": 103100 }, { "epoch": 1.5187552466090337, "grad_norm": 1.1695464849472046, "learning_rate": 1.54751042368943e-05, "loss": 0.0706, "step": 103125 }, { "epoch": 1.5191234296991207, "grad_norm": 1.1147621870040894, "learning_rate": 1.5473467864927314e-05, "loss": 0.0741, "step": 103150 }, { "epoch": 1.5194916127892077, "grad_norm": 1.2407925128936768, "learning_rate": 1.547183149296033e-05, "loss": 0.0728, "step": 103175 }, { "epoch": 1.5198597958792948, "grad_norm": 1.8448214530944824, "learning_rate": 1.5470195120993347e-05, "loss": 0.0774, "step": 103200 }, { "epoch": 1.5202279789693818, "grad_norm": 1.5810340642929077, "learning_rate": 1.5468558749026358e-05, "loss": 0.0701, "step": 103225 }, { "epoch": 1.5205961620594688, "grad_norm": 1.7474024295806885, "learning_rate": 1.5466922377059376e-05, "loss": 0.0703, "step": 103250 }, { "epoch": 1.5209643451495558, "grad_norm": 1.7364290952682495, "learning_rate": 1.546528600509239e-05, "loss": 0.0718, "step": 103275 }, { "epoch": 1.521332528239643, "grad_norm": 0.8367420434951782, "learning_rate": 1.5463649633125404e-05, "loss": 0.0726, "step": 103300 }, { "epoch": 1.52170071132973, "grad_norm": 1.2589001655578613, "learning_rate": 1.5462013261158422e-05, "loss": 0.077, "step": 103325 }, { "epoch": 1.522068894419817, "grad_norm": 1.3828003406524658, "learning_rate": 1.5460376889191437e-05, "loss": 0.0717, "step": 103350 }, { "epoch": 1.522437077509904, "grad_norm": 1.029020071029663, "learning_rate": 1.5458740517224455e-05, "loss": 0.0789, "step": 103375 }, { "epoch": 1.5228052605999913, "grad_norm": 1.3277159929275513, "learning_rate": 1.545710414525747e-05, "loss": 0.0778, "step": 103400 }, { "epoch": 1.5231734436900783, "grad_norm": 1.0542221069335938, "learning_rate": 1.5455467773290484e-05, "loss": 0.0661, "step": 103425 }, { "epoch": 1.5235416267801654, "grad_norm": 1.4062111377716064, "learning_rate": 1.5453831401323498e-05, "loss": 0.0711, "step": 103450 }, { "epoch": 1.5239098098702524, "grad_norm": 1.2907397747039795, "learning_rate": 1.5452195029356512e-05, "loss": 0.0726, "step": 103475 }, { "epoch": 1.5242779929603394, "grad_norm": 1.5134557485580444, "learning_rate": 1.545055865738953e-05, "loss": 0.0652, "step": 103500 }, { "epoch": 1.5246461760504264, "grad_norm": 1.8192025423049927, "learning_rate": 1.5448922285422545e-05, "loss": 0.0725, "step": 103525 }, { "epoch": 1.5250143591405134, "grad_norm": 1.3823870420455933, "learning_rate": 1.544728591345556e-05, "loss": 0.0734, "step": 103550 }, { "epoch": 1.5253825422306004, "grad_norm": 1.3769203424453735, "learning_rate": 1.5445649541488577e-05, "loss": 0.0682, "step": 103575 }, { "epoch": 1.5257507253206875, "grad_norm": 1.5179260969161987, "learning_rate": 1.544401316952159e-05, "loss": 0.0712, "step": 103600 }, { "epoch": 1.5261189084107745, "grad_norm": 1.086018443107605, "learning_rate": 1.544237679755461e-05, "loss": 0.08, "step": 103625 }, { "epoch": 1.5264870915008615, "grad_norm": 1.3227345943450928, "learning_rate": 1.544074042558762e-05, "loss": 0.0731, "step": 103650 }, { "epoch": 1.5268552745909485, "grad_norm": 1.4783951044082642, "learning_rate": 1.5439104053620638e-05, "loss": 0.0678, "step": 103675 }, { "epoch": 1.5272234576810355, "grad_norm": 1.495962142944336, "learning_rate": 1.5437467681653653e-05, "loss": 0.0746, "step": 103700 }, { "epoch": 1.5275916407711225, "grad_norm": 1.5806061029434204, "learning_rate": 1.5435831309686667e-05, "loss": 0.0707, "step": 103725 }, { "epoch": 1.5279598238612095, "grad_norm": 1.5442841053009033, "learning_rate": 1.5434194937719685e-05, "loss": 0.0655, "step": 103750 }, { "epoch": 1.5283280069512968, "grad_norm": 1.8976647853851318, "learning_rate": 1.54325585657527e-05, "loss": 0.0774, "step": 103775 }, { "epoch": 1.5286961900413838, "grad_norm": 1.5315282344818115, "learning_rate": 1.5430922193785714e-05, "loss": 0.0714, "step": 103800 }, { "epoch": 1.5290643731314708, "grad_norm": 1.432922124862671, "learning_rate": 1.5429285821818732e-05, "loss": 0.0708, "step": 103825 }, { "epoch": 1.5294325562215578, "grad_norm": 1.3605883121490479, "learning_rate": 1.5427649449851746e-05, "loss": 0.0687, "step": 103850 }, { "epoch": 1.5298007393116448, "grad_norm": 1.4034861326217651, "learning_rate": 1.542601307788476e-05, "loss": 0.0741, "step": 103875 }, { "epoch": 1.530168922401732, "grad_norm": 1.5694468021392822, "learning_rate": 1.5424376705917775e-05, "loss": 0.0644, "step": 103900 }, { "epoch": 1.530537105491819, "grad_norm": 1.4174963235855103, "learning_rate": 1.5422740333950793e-05, "loss": 0.0774, "step": 103925 }, { "epoch": 1.530905288581906, "grad_norm": 1.296543002128601, "learning_rate": 1.5421103961983807e-05, "loss": 0.0802, "step": 103950 }, { "epoch": 1.5312734716719931, "grad_norm": 1.7001346349716187, "learning_rate": 1.5419467590016822e-05, "loss": 0.0771, "step": 103975 }, { "epoch": 1.5316416547620801, "grad_norm": 1.6084377765655518, "learning_rate": 1.541783121804984e-05, "loss": 0.0656, "step": 104000 }, { "epoch": 1.5320098378521672, "grad_norm": 1.2867377996444702, "learning_rate": 1.5416194846082854e-05, "loss": 0.0644, "step": 104025 }, { "epoch": 1.5323780209422542, "grad_norm": 1.7494432926177979, "learning_rate": 1.5414558474115872e-05, "loss": 0.0746, "step": 104050 }, { "epoch": 1.5327462040323412, "grad_norm": 1.155861258506775, "learning_rate": 1.5412922102148883e-05, "loss": 0.0587, "step": 104075 }, { "epoch": 1.5331143871224282, "grad_norm": 1.509307861328125, "learning_rate": 1.54112857301819e-05, "loss": 0.067, "step": 104100 }, { "epoch": 1.5334825702125152, "grad_norm": 1.2659516334533691, "learning_rate": 1.5409649358214915e-05, "loss": 0.0674, "step": 104125 }, { "epoch": 1.5338507533026022, "grad_norm": 1.506854772567749, "learning_rate": 1.540801298624793e-05, "loss": 0.0728, "step": 104150 }, { "epoch": 1.5342189363926892, "grad_norm": 1.6616320610046387, "learning_rate": 1.5406376614280948e-05, "loss": 0.068, "step": 104175 }, { "epoch": 1.5345871194827763, "grad_norm": 1.2628109455108643, "learning_rate": 1.5404740242313962e-05, "loss": 0.0654, "step": 104200 }, { "epoch": 1.5349553025728633, "grad_norm": 1.2735340595245361, "learning_rate": 1.5403103870346977e-05, "loss": 0.0692, "step": 104225 }, { "epoch": 1.5353234856629505, "grad_norm": 1.400356650352478, "learning_rate": 1.5401467498379995e-05, "loss": 0.068, "step": 104250 }, { "epoch": 1.5356916687530375, "grad_norm": 1.6703741550445557, "learning_rate": 1.539983112641301e-05, "loss": 0.0732, "step": 104275 }, { "epoch": 1.5360598518431245, "grad_norm": 1.196559190750122, "learning_rate": 1.5398194754446023e-05, "loss": 0.0671, "step": 104300 }, { "epoch": 1.5364280349332116, "grad_norm": 2.0214624404907227, "learning_rate": 1.5396558382479038e-05, "loss": 0.0709, "step": 104325 }, { "epoch": 1.5367962180232986, "grad_norm": 1.3912500143051147, "learning_rate": 1.5394922010512056e-05, "loss": 0.071, "step": 104350 }, { "epoch": 1.5371644011133858, "grad_norm": 1.0920579433441162, "learning_rate": 1.539328563854507e-05, "loss": 0.0702, "step": 104375 }, { "epoch": 1.5375325842034728, "grad_norm": 1.3705909252166748, "learning_rate": 1.5391649266578085e-05, "loss": 0.0768, "step": 104400 }, { "epoch": 1.5379007672935598, "grad_norm": 1.253305435180664, "learning_rate": 1.5390012894611103e-05, "loss": 0.0703, "step": 104425 }, { "epoch": 1.5382689503836469, "grad_norm": 2.2879838943481445, "learning_rate": 1.5388376522644117e-05, "loss": 0.0715, "step": 104450 }, { "epoch": 1.5386371334737339, "grad_norm": 1.4344496726989746, "learning_rate": 1.538674015067713e-05, "loss": 0.0652, "step": 104475 }, { "epoch": 1.539005316563821, "grad_norm": 0.7615309357643127, "learning_rate": 1.5385103778710146e-05, "loss": 0.0656, "step": 104500 }, { "epoch": 1.539373499653908, "grad_norm": 1.0681302547454834, "learning_rate": 1.5383467406743164e-05, "loss": 0.0816, "step": 104525 }, { "epoch": 1.539741682743995, "grad_norm": 1.8376717567443848, "learning_rate": 1.5381831034776178e-05, "loss": 0.0732, "step": 104550 }, { "epoch": 1.540109865834082, "grad_norm": 1.7124261856079102, "learning_rate": 1.5380194662809193e-05, "loss": 0.0749, "step": 104575 }, { "epoch": 1.540478048924169, "grad_norm": 1.6960835456848145, "learning_rate": 1.537855829084221e-05, "loss": 0.0794, "step": 104600 }, { "epoch": 1.540846232014256, "grad_norm": 1.06011962890625, "learning_rate": 1.5376921918875225e-05, "loss": 0.066, "step": 104625 }, { "epoch": 1.541214415104343, "grad_norm": 1.4902781248092651, "learning_rate": 1.537528554690824e-05, "loss": 0.0737, "step": 104650 }, { "epoch": 1.54158259819443, "grad_norm": 1.2500584125518799, "learning_rate": 1.5373649174941257e-05, "loss": 0.072, "step": 104675 }, { "epoch": 1.541950781284517, "grad_norm": 0.9652636647224426, "learning_rate": 1.537201280297427e-05, "loss": 0.0735, "step": 104700 }, { "epoch": 1.5423189643746043, "grad_norm": 1.2820698022842407, "learning_rate": 1.5370376431007286e-05, "loss": 0.0658, "step": 104725 }, { "epoch": 1.5426871474646913, "grad_norm": 1.5940159559249878, "learning_rate": 1.53687400590403e-05, "loss": 0.0707, "step": 104750 }, { "epoch": 1.5430553305547783, "grad_norm": 0.8580747246742249, "learning_rate": 1.536710368707332e-05, "loss": 0.0647, "step": 104775 }, { "epoch": 1.5434235136448653, "grad_norm": 1.8936817646026611, "learning_rate": 1.5365467315106333e-05, "loss": 0.0777, "step": 104800 }, { "epoch": 1.5437916967349523, "grad_norm": 1.3605011701583862, "learning_rate": 1.5363830943139347e-05, "loss": 0.064, "step": 104825 }, { "epoch": 1.5441598798250395, "grad_norm": 1.314716100692749, "learning_rate": 1.5362194571172365e-05, "loss": 0.0707, "step": 104850 }, { "epoch": 1.5445280629151266, "grad_norm": 1.5477386713027954, "learning_rate": 1.536055819920538e-05, "loss": 0.0759, "step": 104875 }, { "epoch": 1.5448962460052136, "grad_norm": 2.173539876937866, "learning_rate": 1.5358921827238394e-05, "loss": 0.0776, "step": 104900 }, { "epoch": 1.5452644290953006, "grad_norm": 1.4477789402008057, "learning_rate": 1.535728545527141e-05, "loss": 0.0775, "step": 104925 }, { "epoch": 1.5456326121853876, "grad_norm": 1.4179946184158325, "learning_rate": 1.5355649083304423e-05, "loss": 0.0763, "step": 104950 }, { "epoch": 1.5460007952754746, "grad_norm": 1.4506452083587646, "learning_rate": 1.535401271133744e-05, "loss": 0.0693, "step": 104975 }, { "epoch": 1.5463689783655616, "grad_norm": 1.3918483257293701, "learning_rate": 1.5352376339370455e-05, "loss": 0.0777, "step": 105000 }, { "epoch": 1.5467371614556487, "grad_norm": 1.166577696800232, "learning_rate": 1.5350739967403473e-05, "loss": 0.0692, "step": 105025 }, { "epoch": 1.5471053445457357, "grad_norm": 1.01295804977417, "learning_rate": 1.5349103595436488e-05, "loss": 0.0706, "step": 105050 }, { "epoch": 1.5474735276358227, "grad_norm": 1.2988080978393555, "learning_rate": 1.5347467223469502e-05, "loss": 0.0691, "step": 105075 }, { "epoch": 1.5478417107259097, "grad_norm": 1.5271539688110352, "learning_rate": 1.534583085150252e-05, "loss": 0.0727, "step": 105100 }, { "epoch": 1.5482098938159967, "grad_norm": 1.363669753074646, "learning_rate": 1.534419447953553e-05, "loss": 0.0677, "step": 105125 }, { "epoch": 1.5485780769060837, "grad_norm": 1.1423157453536987, "learning_rate": 1.534255810756855e-05, "loss": 0.0695, "step": 105150 }, { "epoch": 1.5489462599961707, "grad_norm": 1.3760061264038086, "learning_rate": 1.5340921735601563e-05, "loss": 0.0717, "step": 105175 }, { "epoch": 1.549314443086258, "grad_norm": 1.4595600366592407, "learning_rate": 1.533928536363458e-05, "loss": 0.0696, "step": 105200 }, { "epoch": 1.549682626176345, "grad_norm": 1.7446681261062622, "learning_rate": 1.5337648991667596e-05, "loss": 0.0736, "step": 105225 }, { "epoch": 1.550050809266432, "grad_norm": 1.1961472034454346, "learning_rate": 1.533601261970061e-05, "loss": 0.0731, "step": 105250 }, { "epoch": 1.550418992356519, "grad_norm": 1.3641124963760376, "learning_rate": 1.5334376247733628e-05, "loss": 0.0709, "step": 105275 }, { "epoch": 1.550787175446606, "grad_norm": 1.3327598571777344, "learning_rate": 1.5332739875766642e-05, "loss": 0.0733, "step": 105300 }, { "epoch": 1.5511553585366933, "grad_norm": 1.0816978216171265, "learning_rate": 1.5331103503799657e-05, "loss": 0.0714, "step": 105325 }, { "epoch": 1.5515235416267803, "grad_norm": 1.0243499279022217, "learning_rate": 1.532946713183267e-05, "loss": 0.0728, "step": 105350 }, { "epoch": 1.5518917247168673, "grad_norm": 1.4931097030639648, "learning_rate": 1.5327830759865686e-05, "loss": 0.0734, "step": 105375 }, { "epoch": 1.5522599078069543, "grad_norm": 0.8803391456604004, "learning_rate": 1.5326194387898704e-05, "loss": 0.0774, "step": 105400 }, { "epoch": 1.5526280908970413, "grad_norm": 1.4650176763534546, "learning_rate": 1.5324558015931718e-05, "loss": 0.0758, "step": 105425 }, { "epoch": 1.5529962739871284, "grad_norm": 1.7435822486877441, "learning_rate": 1.5322921643964736e-05, "loss": 0.07, "step": 105450 }, { "epoch": 1.5533644570772154, "grad_norm": 1.2899059057235718, "learning_rate": 1.532128527199775e-05, "loss": 0.0662, "step": 105475 }, { "epoch": 1.5537326401673024, "grad_norm": 0.8504881858825684, "learning_rate": 1.5319648900030765e-05, "loss": 0.0703, "step": 105500 }, { "epoch": 1.5541008232573894, "grad_norm": 1.1455090045928955, "learning_rate": 1.5318012528063783e-05, "loss": 0.0745, "step": 105525 }, { "epoch": 1.5544690063474764, "grad_norm": 1.7868210077285767, "learning_rate": 1.5316376156096794e-05, "loss": 0.0792, "step": 105550 }, { "epoch": 1.5548371894375634, "grad_norm": 1.8337889909744263, "learning_rate": 1.531473978412981e-05, "loss": 0.0726, "step": 105575 }, { "epoch": 1.5552053725276505, "grad_norm": 1.2802447080612183, "learning_rate": 1.5313103412162826e-05, "loss": 0.0664, "step": 105600 }, { "epoch": 1.5555735556177375, "grad_norm": Infinity, "learning_rate": 1.531153249507452e-05, "loss": 0.0812, "step": 105625 }, { "epoch": 1.5559417387078245, "grad_norm": 1.2857905626296997, "learning_rate": 1.5309896123107538e-05, "loss": 0.0747, "step": 105650 }, { "epoch": 1.5563099217979115, "grad_norm": 1.1183959245681763, "learning_rate": 1.5308259751140552e-05, "loss": 0.0756, "step": 105675 }, { "epoch": 1.5566781048879987, "grad_norm": 1.501734972000122, "learning_rate": 1.530662337917357e-05, "loss": 0.0772, "step": 105700 }, { "epoch": 1.5570462879780858, "grad_norm": 1.4179561138153076, "learning_rate": 1.5304987007206585e-05, "loss": 0.0697, "step": 105725 }, { "epoch": 1.5574144710681728, "grad_norm": 1.4853519201278687, "learning_rate": 1.53033506352396e-05, "loss": 0.0777, "step": 105750 }, { "epoch": 1.5577826541582598, "grad_norm": 1.5254907608032227, "learning_rate": 1.5301714263272613e-05, "loss": 0.0685, "step": 105775 }, { "epoch": 1.558150837248347, "grad_norm": 1.8476866483688354, "learning_rate": 1.5300077891305628e-05, "loss": 0.0726, "step": 105800 }, { "epoch": 1.558519020338434, "grad_norm": 1.456032395362854, "learning_rate": 1.5298441519338646e-05, "loss": 0.0719, "step": 105825 }, { "epoch": 1.558887203428521, "grad_norm": 1.4664596319198608, "learning_rate": 1.529680514737166e-05, "loss": 0.0735, "step": 105850 }, { "epoch": 1.559255386518608, "grad_norm": 1.293497085571289, "learning_rate": 1.5295168775404675e-05, "loss": 0.0677, "step": 105875 }, { "epoch": 1.559623569608695, "grad_norm": 1.593989372253418, "learning_rate": 1.5293532403437693e-05, "loss": 0.0673, "step": 105900 }, { "epoch": 1.559991752698782, "grad_norm": 1.4005393981933594, "learning_rate": 1.5291896031470707e-05, "loss": 0.0742, "step": 105925 }, { "epoch": 1.5603599357888691, "grad_norm": 1.608427882194519, "learning_rate": 1.5290259659503725e-05, "loss": 0.071, "step": 105950 }, { "epoch": 1.5607281188789561, "grad_norm": 1.4881457090377808, "learning_rate": 1.5288623287536736e-05, "loss": 0.0714, "step": 105975 }, { "epoch": 1.5610963019690431, "grad_norm": 1.2351033687591553, "learning_rate": 1.5286986915569754e-05, "loss": 0.0616, "step": 106000 }, { "epoch": 1.5614644850591302, "grad_norm": 1.4147685766220093, "learning_rate": 1.5285350543602768e-05, "loss": 0.0719, "step": 106025 }, { "epoch": 1.5618326681492172, "grad_norm": 1.6583187580108643, "learning_rate": 1.5283714171635783e-05, "loss": 0.0713, "step": 106050 }, { "epoch": 1.5622008512393042, "grad_norm": 1.3932957649230957, "learning_rate": 1.52820777996688e-05, "loss": 0.0751, "step": 106075 }, { "epoch": 1.5625690343293912, "grad_norm": 1.1920204162597656, "learning_rate": 1.5280441427701815e-05, "loss": 0.0713, "step": 106100 }, { "epoch": 1.5629372174194782, "grad_norm": 1.090222716331482, "learning_rate": 1.527880505573483e-05, "loss": 0.0682, "step": 106125 }, { "epoch": 1.5633054005095652, "grad_norm": 1.216540813446045, "learning_rate": 1.5277168683767847e-05, "loss": 0.071, "step": 106150 }, { "epoch": 1.5636735835996525, "grad_norm": 1.424748182296753, "learning_rate": 1.5275532311800862e-05, "loss": 0.0681, "step": 106175 }, { "epoch": 1.5640417666897395, "grad_norm": 1.4271202087402344, "learning_rate": 1.5273895939833876e-05, "loss": 0.075, "step": 106200 }, { "epoch": 1.5644099497798265, "grad_norm": 1.2879153490066528, "learning_rate": 1.527225956786689e-05, "loss": 0.0643, "step": 106225 }, { "epoch": 1.5647781328699135, "grad_norm": 1.4442555904388428, "learning_rate": 1.527062319589991e-05, "loss": 0.0718, "step": 106250 }, { "epoch": 1.5651463159600005, "grad_norm": 0.9468669891357422, "learning_rate": 1.5268986823932923e-05, "loss": 0.0675, "step": 106275 }, { "epoch": 1.5655144990500878, "grad_norm": 1.0760447978973389, "learning_rate": 1.5267350451965937e-05, "loss": 0.0694, "step": 106300 }, { "epoch": 1.5658826821401748, "grad_norm": 1.3462986946105957, "learning_rate": 1.5265714079998955e-05, "loss": 0.0757, "step": 106325 }, { "epoch": 1.5662508652302618, "grad_norm": 1.2779897451400757, "learning_rate": 1.526407770803197e-05, "loss": 0.0668, "step": 106350 }, { "epoch": 1.5666190483203488, "grad_norm": 1.6397979259490967, "learning_rate": 1.5262441336064988e-05, "loss": 0.0739, "step": 106375 }, { "epoch": 1.5669872314104358, "grad_norm": 1.062831163406372, "learning_rate": 1.5260804964098e-05, "loss": 0.0707, "step": 106400 }, { "epoch": 1.5673554145005228, "grad_norm": 1.33720064163208, "learning_rate": 1.5259168592131016e-05, "loss": 0.0698, "step": 106425 }, { "epoch": 1.5677235975906099, "grad_norm": 1.417860984802246, "learning_rate": 1.5257532220164031e-05, "loss": 0.0711, "step": 106450 }, { "epoch": 1.5680917806806969, "grad_norm": 1.456868052482605, "learning_rate": 1.5255895848197047e-05, "loss": 0.0635, "step": 106475 }, { "epoch": 1.568459963770784, "grad_norm": 1.456259846687317, "learning_rate": 1.5254259476230062e-05, "loss": 0.0686, "step": 106500 }, { "epoch": 1.568828146860871, "grad_norm": 1.5034135580062866, "learning_rate": 1.5252623104263078e-05, "loss": 0.0721, "step": 106525 }, { "epoch": 1.569196329950958, "grad_norm": 1.4312362670898438, "learning_rate": 1.5250986732296094e-05, "loss": 0.0765, "step": 106550 }, { "epoch": 1.569564513041045, "grad_norm": 1.6333671808242798, "learning_rate": 1.524935036032911e-05, "loss": 0.0645, "step": 106575 }, { "epoch": 1.569932696131132, "grad_norm": 1.6294881105422974, "learning_rate": 1.5247713988362123e-05, "loss": 0.0775, "step": 106600 }, { "epoch": 1.570300879221219, "grad_norm": 1.624904751777649, "learning_rate": 1.5246077616395139e-05, "loss": 0.0659, "step": 106625 }, { "epoch": 1.5706690623113062, "grad_norm": 1.3214925527572632, "learning_rate": 1.5244441244428155e-05, "loss": 0.0736, "step": 106650 }, { "epoch": 1.5710372454013932, "grad_norm": 1.553481101989746, "learning_rate": 1.524280487246117e-05, "loss": 0.0785, "step": 106675 }, { "epoch": 1.5714054284914802, "grad_norm": 1.1896533966064453, "learning_rate": 1.5241168500494186e-05, "loss": 0.0732, "step": 106700 }, { "epoch": 1.5717736115815673, "grad_norm": 1.3297849893569946, "learning_rate": 1.5239532128527202e-05, "loss": 0.071, "step": 106725 }, { "epoch": 1.5721417946716543, "grad_norm": 1.256745457649231, "learning_rate": 1.5237895756560216e-05, "loss": 0.0767, "step": 106750 }, { "epoch": 1.5725099777617415, "grad_norm": 1.4505422115325928, "learning_rate": 1.5236259384593232e-05, "loss": 0.0707, "step": 106775 }, { "epoch": 1.5728781608518285, "grad_norm": 1.368282437324524, "learning_rate": 1.5234623012626249e-05, "loss": 0.0762, "step": 106800 }, { "epoch": 1.5732463439419155, "grad_norm": 1.8916006088256836, "learning_rate": 1.5232986640659261e-05, "loss": 0.0712, "step": 106825 }, { "epoch": 1.5736145270320026, "grad_norm": 1.4318642616271973, "learning_rate": 1.5231350268692277e-05, "loss": 0.0712, "step": 106850 }, { "epoch": 1.5739827101220896, "grad_norm": 1.4258930683135986, "learning_rate": 1.5229713896725294e-05, "loss": 0.0779, "step": 106875 }, { "epoch": 1.5743508932121766, "grad_norm": 1.4721462726593018, "learning_rate": 1.522807752475831e-05, "loss": 0.0731, "step": 106900 }, { "epoch": 1.5747190763022636, "grad_norm": 1.6253427267074585, "learning_rate": 1.5226441152791324e-05, "loss": 0.0749, "step": 106925 }, { "epoch": 1.5750872593923506, "grad_norm": 1.1349114179611206, "learning_rate": 1.522480478082434e-05, "loss": 0.0731, "step": 106950 }, { "epoch": 1.5754554424824376, "grad_norm": 1.2018711566925049, "learning_rate": 1.5223168408857357e-05, "loss": 0.0724, "step": 106975 }, { "epoch": 1.5758236255725246, "grad_norm": 1.2875616550445557, "learning_rate": 1.5221532036890371e-05, "loss": 0.0691, "step": 107000 }, { "epoch": 1.5761918086626117, "grad_norm": 1.9457770586013794, "learning_rate": 1.5219895664923385e-05, "loss": 0.0686, "step": 107025 }, { "epoch": 1.5765599917526987, "grad_norm": 1.1090826988220215, "learning_rate": 1.5218259292956402e-05, "loss": 0.075, "step": 107050 }, { "epoch": 1.5769281748427857, "grad_norm": 1.6381428241729736, "learning_rate": 1.5216622920989416e-05, "loss": 0.0748, "step": 107075 }, { "epoch": 1.5772963579328727, "grad_norm": 1.4712249040603638, "learning_rate": 1.5214986549022432e-05, "loss": 0.073, "step": 107100 }, { "epoch": 1.57766454102296, "grad_norm": 1.4448765516281128, "learning_rate": 1.5213350177055448e-05, "loss": 0.0722, "step": 107125 }, { "epoch": 1.578032724113047, "grad_norm": 1.2978557348251343, "learning_rate": 1.5211713805088465e-05, "loss": 0.0692, "step": 107150 }, { "epoch": 1.578400907203134, "grad_norm": 1.130519986152649, "learning_rate": 1.5210077433121479e-05, "loss": 0.0659, "step": 107175 }, { "epoch": 1.578769090293221, "grad_norm": 1.5109269618988037, "learning_rate": 1.5208441061154495e-05, "loss": 0.0761, "step": 107200 }, { "epoch": 1.579137273383308, "grad_norm": 1.2745883464813232, "learning_rate": 1.5206804689187508e-05, "loss": 0.0742, "step": 107225 }, { "epoch": 1.5795054564733952, "grad_norm": 1.2200329303741455, "learning_rate": 1.5205168317220524e-05, "loss": 0.0658, "step": 107250 }, { "epoch": 1.5798736395634823, "grad_norm": 1.1669632196426392, "learning_rate": 1.520353194525354e-05, "loss": 0.0667, "step": 107275 }, { "epoch": 1.5802418226535693, "grad_norm": 1.8420226573944092, "learning_rate": 1.5201895573286556e-05, "loss": 0.0744, "step": 107300 }, { "epoch": 1.5806100057436563, "grad_norm": 1.2124704122543335, "learning_rate": 1.520025920131957e-05, "loss": 0.0745, "step": 107325 }, { "epoch": 1.5809781888337433, "grad_norm": 1.3906047344207764, "learning_rate": 1.5198622829352587e-05, "loss": 0.0681, "step": 107350 }, { "epoch": 1.5813463719238303, "grad_norm": 1.08652663230896, "learning_rate": 1.5196986457385603e-05, "loss": 0.0734, "step": 107375 }, { "epoch": 1.5817145550139173, "grad_norm": 1.7926405668258667, "learning_rate": 1.519535008541862e-05, "loss": 0.073, "step": 107400 }, { "epoch": 1.5820827381040043, "grad_norm": 1.416136622428894, "learning_rate": 1.5193713713451634e-05, "loss": 0.0782, "step": 107425 }, { "epoch": 1.5824509211940914, "grad_norm": 1.4756971597671509, "learning_rate": 1.5192077341484648e-05, "loss": 0.0688, "step": 107450 }, { "epoch": 1.5828191042841784, "grad_norm": 1.0753921270370483, "learning_rate": 1.5190440969517664e-05, "loss": 0.0726, "step": 107475 }, { "epoch": 1.5831872873742654, "grad_norm": 1.4241995811462402, "learning_rate": 1.5188804597550679e-05, "loss": 0.0726, "step": 107500 }, { "epoch": 1.5835554704643524, "grad_norm": 1.2430777549743652, "learning_rate": 1.5187168225583695e-05, "loss": 0.0675, "step": 107525 }, { "epoch": 1.5839236535544394, "grad_norm": 1.2054349184036255, "learning_rate": 1.5185531853616711e-05, "loss": 0.0726, "step": 107550 }, { "epoch": 1.5842918366445264, "grad_norm": 1.1671655178070068, "learning_rate": 1.5183895481649726e-05, "loss": 0.0686, "step": 107575 }, { "epoch": 1.5846600197346137, "grad_norm": 1.4217971563339233, "learning_rate": 1.5182259109682742e-05, "loss": 0.0754, "step": 107600 }, { "epoch": 1.5850282028247007, "grad_norm": 1.5682121515274048, "learning_rate": 1.5180622737715758e-05, "loss": 0.0747, "step": 107625 }, { "epoch": 1.5853963859147877, "grad_norm": 2.056432008743286, "learning_rate": 1.517898636574877e-05, "loss": 0.0741, "step": 107650 }, { "epoch": 1.5857645690048747, "grad_norm": 1.61311674118042, "learning_rate": 1.5177415448660466e-05, "loss": 0.072, "step": 107675 }, { "epoch": 1.5861327520949617, "grad_norm": 1.5249961614608765, "learning_rate": 1.5175779076693482e-05, "loss": 0.0691, "step": 107700 }, { "epoch": 1.586500935185049, "grad_norm": 1.4391385316848755, "learning_rate": 1.5174142704726498e-05, "loss": 0.0726, "step": 107725 }, { "epoch": 1.586869118275136, "grad_norm": 1.7815829515457153, "learning_rate": 1.5172506332759513e-05, "loss": 0.0776, "step": 107750 }, { "epoch": 1.587237301365223, "grad_norm": 1.675018548965454, "learning_rate": 1.5170869960792529e-05, "loss": 0.0692, "step": 107775 }, { "epoch": 1.58760548445531, "grad_norm": 1.5182093381881714, "learning_rate": 1.5169233588825545e-05, "loss": 0.0686, "step": 107800 }, { "epoch": 1.587973667545397, "grad_norm": 1.2581740617752075, "learning_rate": 1.5167597216858561e-05, "loss": 0.0721, "step": 107825 }, { "epoch": 1.588341850635484, "grad_norm": 1.1967384815216064, "learning_rate": 1.5165960844891576e-05, "loss": 0.0658, "step": 107850 }, { "epoch": 1.588710033725571, "grad_norm": 1.770078420639038, "learning_rate": 1.516432447292459e-05, "loss": 0.0715, "step": 107875 }, { "epoch": 1.589078216815658, "grad_norm": 1.4590777158737183, "learning_rate": 1.5162688100957605e-05, "loss": 0.0668, "step": 107900 }, { "epoch": 1.589446399905745, "grad_norm": 1.1806178092956543, "learning_rate": 1.5161051728990621e-05, "loss": 0.0643, "step": 107925 }, { "epoch": 1.5898145829958321, "grad_norm": 1.8124110698699951, "learning_rate": 1.5159415357023637e-05, "loss": 0.0696, "step": 107950 }, { "epoch": 1.5901827660859191, "grad_norm": 1.9094833135604858, "learning_rate": 1.5157778985056653e-05, "loss": 0.0713, "step": 107975 }, { "epoch": 1.5905509491760061, "grad_norm": 1.5736355781555176, "learning_rate": 1.5156142613089668e-05, "loss": 0.0767, "step": 108000 }, { "epoch": 1.5909191322660932, "grad_norm": 1.275492548942566, "learning_rate": 1.5154506241122684e-05, "loss": 0.0723, "step": 108025 }, { "epoch": 1.5912873153561802, "grad_norm": 1.7701120376586914, "learning_rate": 1.51528698691557e-05, "loss": 0.072, "step": 108050 }, { "epoch": 1.5916554984462674, "grad_norm": 1.4056119918823242, "learning_rate": 1.5151233497188713e-05, "loss": 0.0681, "step": 108075 }, { "epoch": 1.5920236815363544, "grad_norm": 1.3238954544067383, "learning_rate": 1.5149597125221729e-05, "loss": 0.0624, "step": 108100 }, { "epoch": 1.5923918646264414, "grad_norm": 1.4717309474945068, "learning_rate": 1.5147960753254745e-05, "loss": 0.0719, "step": 108125 }, { "epoch": 1.5927600477165285, "grad_norm": 1.234840989112854, "learning_rate": 1.514632438128776e-05, "loss": 0.0759, "step": 108150 }, { "epoch": 1.5931282308066155, "grad_norm": 1.1387003660202026, "learning_rate": 1.5144688009320776e-05, "loss": 0.0654, "step": 108175 }, { "epoch": 1.5934964138967027, "grad_norm": 0.9490651488304138, "learning_rate": 1.5143051637353792e-05, "loss": 0.0694, "step": 108200 }, { "epoch": 1.5938645969867897, "grad_norm": 1.4842017889022827, "learning_rate": 1.5141415265386808e-05, "loss": 0.0782, "step": 108225 }, { "epoch": 1.5942327800768767, "grad_norm": 1.481844186782837, "learning_rate": 1.5139778893419822e-05, "loss": 0.071, "step": 108250 }, { "epoch": 1.5946009631669638, "grad_norm": 1.2599743604660034, "learning_rate": 1.5138142521452839e-05, "loss": 0.066, "step": 108275 }, { "epoch": 1.5949691462570508, "grad_norm": 1.3949706554412842, "learning_rate": 1.5136506149485853e-05, "loss": 0.069, "step": 108300 }, { "epoch": 1.5953373293471378, "grad_norm": 1.1331928968429565, "learning_rate": 1.5134869777518867e-05, "loss": 0.0717, "step": 108325 }, { "epoch": 1.5957055124372248, "grad_norm": 1.2667357921600342, "learning_rate": 1.5133233405551884e-05, "loss": 0.0714, "step": 108350 }, { "epoch": 1.5960736955273118, "grad_norm": 1.2307201623916626, "learning_rate": 1.51315970335849e-05, "loss": 0.0699, "step": 108375 }, { "epoch": 1.5964418786173988, "grad_norm": 1.3318933248519897, "learning_rate": 1.5129960661617916e-05, "loss": 0.0753, "step": 108400 }, { "epoch": 1.5968100617074859, "grad_norm": 1.3171318769454956, "learning_rate": 1.512832428965093e-05, "loss": 0.069, "step": 108425 }, { "epoch": 1.5971782447975729, "grad_norm": 1.5384852886199951, "learning_rate": 1.5126687917683947e-05, "loss": 0.0714, "step": 108450 }, { "epoch": 1.5975464278876599, "grad_norm": 1.031415581703186, "learning_rate": 1.5125051545716963e-05, "loss": 0.0671, "step": 108475 }, { "epoch": 1.597914610977747, "grad_norm": 1.7239437103271484, "learning_rate": 1.5123415173749975e-05, "loss": 0.072, "step": 108500 }, { "epoch": 1.598282794067834, "grad_norm": 1.6831711530685425, "learning_rate": 1.5121778801782992e-05, "loss": 0.0729, "step": 108525 }, { "epoch": 1.598650977157921, "grad_norm": 1.3334431648254395, "learning_rate": 1.5120142429816008e-05, "loss": 0.0686, "step": 108550 }, { "epoch": 1.5990191602480082, "grad_norm": 1.326115608215332, "learning_rate": 1.5118506057849022e-05, "loss": 0.0707, "step": 108575 }, { "epoch": 1.5993873433380952, "grad_norm": 1.1547571420669556, "learning_rate": 1.5116869685882038e-05, "loss": 0.0718, "step": 108600 }, { "epoch": 1.5997555264281822, "grad_norm": 1.4270943403244019, "learning_rate": 1.5115233313915055e-05, "loss": 0.0731, "step": 108625 }, { "epoch": 1.6001237095182692, "grad_norm": 1.5217137336730957, "learning_rate": 1.511359694194807e-05, "loss": 0.0717, "step": 108650 }, { "epoch": 1.6004918926083564, "grad_norm": 1.583696961402893, "learning_rate": 1.5111960569981085e-05, "loss": 0.0734, "step": 108675 }, { "epoch": 1.6008600756984435, "grad_norm": 1.1852489709854126, "learning_rate": 1.51103241980141e-05, "loss": 0.0682, "step": 108700 }, { "epoch": 1.6012282587885305, "grad_norm": 1.4153369665145874, "learning_rate": 1.5108687826047114e-05, "loss": 0.0791, "step": 108725 }, { "epoch": 1.6015964418786175, "grad_norm": 1.500317096710205, "learning_rate": 1.510705145408013e-05, "loss": 0.0775, "step": 108750 }, { "epoch": 1.6019646249687045, "grad_norm": 1.048874855041504, "learning_rate": 1.5105415082113146e-05, "loss": 0.0667, "step": 108775 }, { "epoch": 1.6023328080587915, "grad_norm": 1.6746413707733154, "learning_rate": 1.5103778710146163e-05, "loss": 0.0761, "step": 108800 }, { "epoch": 1.6027009911488785, "grad_norm": 1.6953356266021729, "learning_rate": 1.5102142338179177e-05, "loss": 0.0697, "step": 108825 }, { "epoch": 1.6030691742389656, "grad_norm": 1.3492467403411865, "learning_rate": 1.5100505966212193e-05, "loss": 0.0758, "step": 108850 }, { "epoch": 1.6034373573290526, "grad_norm": 1.6429636478424072, "learning_rate": 1.509886959424521e-05, "loss": 0.0749, "step": 108875 }, { "epoch": 1.6038055404191396, "grad_norm": 1.4930524826049805, "learning_rate": 1.5097233222278225e-05, "loss": 0.0759, "step": 108900 }, { "epoch": 1.6041737235092266, "grad_norm": 1.3129405975341797, "learning_rate": 1.5095596850311238e-05, "loss": 0.0707, "step": 108925 }, { "epoch": 1.6045419065993136, "grad_norm": 1.0810818672180176, "learning_rate": 1.5093960478344254e-05, "loss": 0.0775, "step": 108950 }, { "epoch": 1.6049100896894006, "grad_norm": 1.2722159624099731, "learning_rate": 1.5092324106377269e-05, "loss": 0.072, "step": 108975 }, { "epoch": 1.6052782727794876, "grad_norm": 1.2757669687271118, "learning_rate": 1.5090687734410285e-05, "loss": 0.0721, "step": 109000 }, { "epoch": 1.6056464558695747, "grad_norm": 1.9006820917129517, "learning_rate": 1.5089051362443301e-05, "loss": 0.0761, "step": 109025 }, { "epoch": 1.606014638959662, "grad_norm": 1.2178272008895874, "learning_rate": 1.5087414990476317e-05, "loss": 0.0693, "step": 109050 }, { "epoch": 1.606382822049749, "grad_norm": 1.3632571697235107, "learning_rate": 1.5085778618509332e-05, "loss": 0.0716, "step": 109075 }, { "epoch": 1.606751005139836, "grad_norm": 2.103853702545166, "learning_rate": 1.5084142246542348e-05, "loss": 0.081, "step": 109100 }, { "epoch": 1.607119188229923, "grad_norm": 1.2864320278167725, "learning_rate": 1.5082505874575362e-05, "loss": 0.0701, "step": 109125 }, { "epoch": 1.60748737132001, "grad_norm": 1.1540031433105469, "learning_rate": 1.5080869502608377e-05, "loss": 0.0706, "step": 109150 }, { "epoch": 1.6078555544100972, "grad_norm": 1.7386056184768677, "learning_rate": 1.5079233130641393e-05, "loss": 0.0746, "step": 109175 }, { "epoch": 1.6082237375001842, "grad_norm": 1.4164113998413086, "learning_rate": 1.5077596758674409e-05, "loss": 0.0678, "step": 109200 }, { "epoch": 1.6085919205902712, "grad_norm": 1.5290884971618652, "learning_rate": 1.5075960386707425e-05, "loss": 0.0725, "step": 109225 }, { "epoch": 1.6089601036803582, "grad_norm": 1.5598176717758179, "learning_rate": 1.507432401474044e-05, "loss": 0.0781, "step": 109250 }, { "epoch": 1.6093282867704453, "grad_norm": 1.4344979524612427, "learning_rate": 1.5072687642773456e-05, "loss": 0.0709, "step": 109275 }, { "epoch": 1.6096964698605323, "grad_norm": 0.9226469397544861, "learning_rate": 1.5071051270806472e-05, "loss": 0.0766, "step": 109300 }, { "epoch": 1.6100646529506193, "grad_norm": 1.2100237607955933, "learning_rate": 1.5069414898839486e-05, "loss": 0.0685, "step": 109325 }, { "epoch": 1.6104328360407063, "grad_norm": 1.4307045936584473, "learning_rate": 1.5067778526872501e-05, "loss": 0.0669, "step": 109350 }, { "epoch": 1.6108010191307933, "grad_norm": 1.2018272876739502, "learning_rate": 1.5066142154905517e-05, "loss": 0.0692, "step": 109375 }, { "epoch": 1.6111692022208803, "grad_norm": 1.6964788436889648, "learning_rate": 1.5064505782938532e-05, "loss": 0.077, "step": 109400 }, { "epoch": 1.6115373853109674, "grad_norm": 1.5950158834457397, "learning_rate": 1.5062869410971548e-05, "loss": 0.0775, "step": 109425 }, { "epoch": 1.6119055684010544, "grad_norm": 1.8836623430252075, "learning_rate": 1.5061233039004564e-05, "loss": 0.0745, "step": 109450 }, { "epoch": 1.6122737514911414, "grad_norm": 1.6316585540771484, "learning_rate": 1.505959666703758e-05, "loss": 0.0673, "step": 109475 }, { "epoch": 1.6126419345812284, "grad_norm": 1.4389818906784058, "learning_rate": 1.5057960295070594e-05, "loss": 0.0716, "step": 109500 }, { "epoch": 1.6130101176713156, "grad_norm": 1.037874698638916, "learning_rate": 1.505632392310361e-05, "loss": 0.0654, "step": 109525 }, { "epoch": 1.6133783007614027, "grad_norm": 1.2963496446609497, "learning_rate": 1.5054687551136623e-05, "loss": 0.0677, "step": 109550 }, { "epoch": 1.6137464838514897, "grad_norm": 1.8723078966140747, "learning_rate": 1.505305117916964e-05, "loss": 0.0784, "step": 109575 }, { "epoch": 1.6141146669415767, "grad_norm": 1.5051474571228027, "learning_rate": 1.5051414807202656e-05, "loss": 0.072, "step": 109600 }, { "epoch": 1.6144828500316637, "grad_norm": 1.5857611894607544, "learning_rate": 1.5049778435235672e-05, "loss": 0.0703, "step": 109625 }, { "epoch": 1.614851033121751, "grad_norm": 1.2172809839248657, "learning_rate": 1.5048142063268686e-05, "loss": 0.0713, "step": 109650 }, { "epoch": 1.615219216211838, "grad_norm": 2.0073728561401367, "learning_rate": 1.5046505691301702e-05, "loss": 0.0638, "step": 109675 }, { "epoch": 1.615587399301925, "grad_norm": 2.3504676818847656, "learning_rate": 1.5044869319334719e-05, "loss": 0.0751, "step": 109700 }, { "epoch": 1.615955582392012, "grad_norm": 1.41877019405365, "learning_rate": 1.5043232947367735e-05, "loss": 0.0759, "step": 109725 }, { "epoch": 1.616323765482099, "grad_norm": 1.5118829011917114, "learning_rate": 1.504159657540075e-05, "loss": 0.0721, "step": 109750 }, { "epoch": 1.616691948572186, "grad_norm": 1.2893067598342896, "learning_rate": 1.5039960203433764e-05, "loss": 0.0675, "step": 109775 }, { "epoch": 1.617060131662273, "grad_norm": 1.3109837770462036, "learning_rate": 1.503838928634546e-05, "loss": 0.0715, "step": 109800 }, { "epoch": 1.61742831475236, "grad_norm": 1.4243651628494263, "learning_rate": 1.5036752914378474e-05, "loss": 0.0661, "step": 109825 }, { "epoch": 1.617796497842447, "grad_norm": 1.4010547399520874, "learning_rate": 1.503511654241149e-05, "loss": 0.068, "step": 109850 }, { "epoch": 1.618164680932534, "grad_norm": 1.4534114599227905, "learning_rate": 1.5033480170444506e-05, "loss": 0.065, "step": 109875 }, { "epoch": 1.618532864022621, "grad_norm": 1.0850321054458618, "learning_rate": 1.503184379847752e-05, "loss": 0.0695, "step": 109900 }, { "epoch": 1.618901047112708, "grad_norm": 1.2090368270874023, "learning_rate": 1.5030207426510537e-05, "loss": 0.0671, "step": 109925 }, { "epoch": 1.6192692302027951, "grad_norm": 1.1374214887619019, "learning_rate": 1.5028571054543553e-05, "loss": 0.0651, "step": 109950 }, { "epoch": 1.6196374132928821, "grad_norm": 1.546884536743164, "learning_rate": 1.5026934682576565e-05, "loss": 0.0752, "step": 109975 }, { "epoch": 1.6200055963829694, "grad_norm": 1.5034337043762207, "learning_rate": 1.5025298310609582e-05, "loss": 0.0699, "step": 110000 }, { "epoch": 1.6203737794730564, "grad_norm": 0.9288470149040222, "learning_rate": 1.5023661938642598e-05, "loss": 0.0746, "step": 110025 }, { "epoch": 1.6207419625631434, "grad_norm": 1.1490124464035034, "learning_rate": 1.5022025566675614e-05, "loss": 0.0727, "step": 110050 }, { "epoch": 1.6211101456532304, "grad_norm": 1.4370003938674927, "learning_rate": 1.5020389194708628e-05, "loss": 0.0735, "step": 110075 }, { "epoch": 1.6214783287433174, "grad_norm": 1.039117693901062, "learning_rate": 1.5018752822741645e-05, "loss": 0.0667, "step": 110100 }, { "epoch": 1.6218465118334047, "grad_norm": 1.1348016262054443, "learning_rate": 1.501711645077466e-05, "loss": 0.0683, "step": 110125 }, { "epoch": 1.6222146949234917, "grad_norm": 1.0824862718582153, "learning_rate": 1.5015480078807677e-05, "loss": 0.0616, "step": 110150 }, { "epoch": 1.6225828780135787, "grad_norm": 0.8653793931007385, "learning_rate": 1.5013843706840691e-05, "loss": 0.0666, "step": 110175 }, { "epoch": 1.6229510611036657, "grad_norm": 1.882455825805664, "learning_rate": 1.5012207334873706e-05, "loss": 0.0734, "step": 110200 }, { "epoch": 1.6233192441937527, "grad_norm": 1.9056713581085205, "learning_rate": 1.501057096290672e-05, "loss": 0.0734, "step": 110225 }, { "epoch": 1.6236874272838397, "grad_norm": 1.5713926553726196, "learning_rate": 1.5008934590939736e-05, "loss": 0.0824, "step": 110250 }, { "epoch": 1.6240556103739268, "grad_norm": 1.3881809711456299, "learning_rate": 1.5007298218972753e-05, "loss": 0.0658, "step": 110275 }, { "epoch": 1.6244237934640138, "grad_norm": 1.973669409751892, "learning_rate": 1.5005661847005769e-05, "loss": 0.074, "step": 110300 }, { "epoch": 1.6247919765541008, "grad_norm": 2.0806632041931152, "learning_rate": 1.5004025475038783e-05, "loss": 0.0715, "step": 110325 }, { "epoch": 1.6251601596441878, "grad_norm": 1.4756126403808594, "learning_rate": 1.50023891030718e-05, "loss": 0.0722, "step": 110350 }, { "epoch": 1.6255283427342748, "grad_norm": 1.3490571975708008, "learning_rate": 1.5000752731104815e-05, "loss": 0.0707, "step": 110375 }, { "epoch": 1.6258965258243618, "grad_norm": 1.4615627527236938, "learning_rate": 1.4999116359137828e-05, "loss": 0.0719, "step": 110400 }, { "epoch": 1.6262647089144489, "grad_norm": 0.8929535150527954, "learning_rate": 1.4997479987170844e-05, "loss": 0.0717, "step": 110425 }, { "epoch": 1.6266328920045359, "grad_norm": 0.678657054901123, "learning_rate": 1.499584361520386e-05, "loss": 0.0647, "step": 110450 }, { "epoch": 1.627001075094623, "grad_norm": 1.6889570951461792, "learning_rate": 1.4994207243236875e-05, "loss": 0.0728, "step": 110475 }, { "epoch": 1.6273692581847101, "grad_norm": 1.0937069654464722, "learning_rate": 1.4992570871269891e-05, "loss": 0.065, "step": 110500 }, { "epoch": 1.6277374412747971, "grad_norm": 1.5447055101394653, "learning_rate": 1.4990934499302907e-05, "loss": 0.0741, "step": 110525 }, { "epoch": 1.6281056243648842, "grad_norm": 1.5269920825958252, "learning_rate": 1.4989298127335923e-05, "loss": 0.0702, "step": 110550 }, { "epoch": 1.6284738074549712, "grad_norm": 1.013918161392212, "learning_rate": 1.4987661755368938e-05, "loss": 0.0763, "step": 110575 }, { "epoch": 1.6288419905450584, "grad_norm": 1.2948002815246582, "learning_rate": 1.4986025383401952e-05, "loss": 0.0726, "step": 110600 }, { "epoch": 1.6292101736351454, "grad_norm": 1.2564746141433716, "learning_rate": 1.4984389011434968e-05, "loss": 0.0712, "step": 110625 }, { "epoch": 1.6295783567252324, "grad_norm": 1.756926417350769, "learning_rate": 1.4982752639467983e-05, "loss": 0.0679, "step": 110650 }, { "epoch": 1.6299465398153195, "grad_norm": 1.5547542572021484, "learning_rate": 1.4981116267500999e-05, "loss": 0.0697, "step": 110675 }, { "epoch": 1.6303147229054065, "grad_norm": 1.1720080375671387, "learning_rate": 1.4979479895534015e-05, "loss": 0.0677, "step": 110700 }, { "epoch": 1.6306829059954935, "grad_norm": 1.7022597789764404, "learning_rate": 1.497784352356703e-05, "loss": 0.0728, "step": 110725 }, { "epoch": 1.6310510890855805, "grad_norm": 1.089576005935669, "learning_rate": 1.4976207151600046e-05, "loss": 0.0664, "step": 110750 }, { "epoch": 1.6314192721756675, "grad_norm": 1.1406174898147583, "learning_rate": 1.4974570779633062e-05, "loss": 0.0664, "step": 110775 }, { "epoch": 1.6317874552657545, "grad_norm": 1.0567905902862549, "learning_rate": 1.4972934407666078e-05, "loss": 0.071, "step": 110800 }, { "epoch": 1.6321556383558415, "grad_norm": 1.025338888168335, "learning_rate": 1.4971298035699091e-05, "loss": 0.0697, "step": 110825 }, { "epoch": 1.6325238214459286, "grad_norm": 1.0947364568710327, "learning_rate": 1.4969661663732107e-05, "loss": 0.0695, "step": 110850 }, { "epoch": 1.6328920045360156, "grad_norm": 1.6678235530853271, "learning_rate": 1.4968025291765123e-05, "loss": 0.0754, "step": 110875 }, { "epoch": 1.6332601876261026, "grad_norm": 1.4429137706756592, "learning_rate": 1.4966388919798138e-05, "loss": 0.0684, "step": 110900 }, { "epoch": 1.6336283707161896, "grad_norm": 1.7626643180847168, "learning_rate": 1.4964752547831154e-05, "loss": 0.0791, "step": 110925 }, { "epoch": 1.6339965538062766, "grad_norm": 1.1349862813949585, "learning_rate": 1.496311617586417e-05, "loss": 0.0635, "step": 110950 }, { "epoch": 1.6343647368963639, "grad_norm": 1.5567644834518433, "learning_rate": 1.4961479803897186e-05, "loss": 0.0663, "step": 110975 }, { "epoch": 1.6347329199864509, "grad_norm": 1.0278818607330322, "learning_rate": 1.49598434319302e-05, "loss": 0.0702, "step": 111000 }, { "epoch": 1.6351011030765379, "grad_norm": 1.507691740989685, "learning_rate": 1.4958207059963215e-05, "loss": 0.067, "step": 111025 }, { "epoch": 1.635469286166625, "grad_norm": 1.0916681289672852, "learning_rate": 1.495657068799623e-05, "loss": 0.0656, "step": 111050 }, { "epoch": 1.6358374692567121, "grad_norm": 1.798743724822998, "learning_rate": 1.4954934316029246e-05, "loss": 0.0694, "step": 111075 }, { "epoch": 1.6362056523467992, "grad_norm": 1.8959791660308838, "learning_rate": 1.4953297944062262e-05, "loss": 0.071, "step": 111100 }, { "epoch": 1.6365738354368862, "grad_norm": 1.5384111404418945, "learning_rate": 1.4951661572095278e-05, "loss": 0.0675, "step": 111125 }, { "epoch": 1.6369420185269732, "grad_norm": 1.4105653762817383, "learning_rate": 1.4950025200128292e-05, "loss": 0.0722, "step": 111150 }, { "epoch": 1.6373102016170602, "grad_norm": 1.0455262660980225, "learning_rate": 1.4948388828161309e-05, "loss": 0.0748, "step": 111175 }, { "epoch": 1.6376783847071472, "grad_norm": 1.0242788791656494, "learning_rate": 1.4946752456194325e-05, "loss": 0.0683, "step": 111200 }, { "epoch": 1.6380465677972342, "grad_norm": 1.522147297859192, "learning_rate": 1.4945116084227341e-05, "loss": 0.0638, "step": 111225 }, { "epoch": 1.6384147508873212, "grad_norm": 1.4644957780838013, "learning_rate": 1.4943479712260354e-05, "loss": 0.0659, "step": 111250 }, { "epoch": 1.6387829339774083, "grad_norm": 1.6227401494979858, "learning_rate": 1.494184334029337e-05, "loss": 0.0767, "step": 111275 }, { "epoch": 1.6391511170674953, "grad_norm": 1.3954418897628784, "learning_rate": 1.4940206968326384e-05, "loss": 0.0717, "step": 111300 }, { "epoch": 1.6395193001575823, "grad_norm": 1.330049753189087, "learning_rate": 1.49385705963594e-05, "loss": 0.0695, "step": 111325 }, { "epoch": 1.6398874832476693, "grad_norm": 1.3499977588653564, "learning_rate": 1.4936934224392417e-05, "loss": 0.0689, "step": 111350 }, { "epoch": 1.6402556663377563, "grad_norm": 1.503440260887146, "learning_rate": 1.4935297852425433e-05, "loss": 0.0756, "step": 111375 }, { "epoch": 1.6406238494278433, "grad_norm": 1.3987048864364624, "learning_rate": 1.4933661480458447e-05, "loss": 0.0715, "step": 111400 }, { "epoch": 1.6409920325179304, "grad_norm": 2.3647842407226562, "learning_rate": 1.4932025108491463e-05, "loss": 0.0762, "step": 111425 }, { "epoch": 1.6413602156080176, "grad_norm": 1.2387139797210693, "learning_rate": 1.4930388736524478e-05, "loss": 0.0656, "step": 111450 }, { "epoch": 1.6417283986981046, "grad_norm": 1.6368547677993774, "learning_rate": 1.4928752364557492e-05, "loss": 0.0671, "step": 111475 }, { "epoch": 1.6420965817881916, "grad_norm": 1.33992600440979, "learning_rate": 1.4927115992590508e-05, "loss": 0.0756, "step": 111500 }, { "epoch": 1.6424647648782786, "grad_norm": 1.510890245437622, "learning_rate": 1.4925479620623525e-05, "loss": 0.0811, "step": 111525 }, { "epoch": 1.6428329479683659, "grad_norm": 1.5336076021194458, "learning_rate": 1.492384324865654e-05, "loss": 0.0699, "step": 111550 }, { "epoch": 1.643201131058453, "grad_norm": 1.6068021059036255, "learning_rate": 1.4922206876689555e-05, "loss": 0.0748, "step": 111575 }, { "epoch": 1.64356931414854, "grad_norm": 1.225089430809021, "learning_rate": 1.4920570504722571e-05, "loss": 0.0672, "step": 111600 }, { "epoch": 1.643937497238627, "grad_norm": 1.656948208808899, "learning_rate": 1.4918934132755587e-05, "loss": 0.0679, "step": 111625 }, { "epoch": 1.644305680328714, "grad_norm": 1.2077441215515137, "learning_rate": 1.4917297760788602e-05, "loss": 0.0767, "step": 111650 }, { "epoch": 1.644673863418801, "grad_norm": 0.8018178939819336, "learning_rate": 1.4915661388821616e-05, "loss": 0.0682, "step": 111675 }, { "epoch": 1.645042046508888, "grad_norm": 1.284868597984314, "learning_rate": 1.4914025016854632e-05, "loss": 0.0709, "step": 111700 }, { "epoch": 1.645410229598975, "grad_norm": 1.558538556098938, "learning_rate": 1.4912388644887647e-05, "loss": 0.0676, "step": 111725 }, { "epoch": 1.645778412689062, "grad_norm": 1.5586585998535156, "learning_rate": 1.4910752272920663e-05, "loss": 0.0632, "step": 111750 }, { "epoch": 1.646146595779149, "grad_norm": 1.346023440361023, "learning_rate": 1.490911590095368e-05, "loss": 0.0723, "step": 111775 }, { "epoch": 1.646514778869236, "grad_norm": 1.0624632835388184, "learning_rate": 1.4907479528986695e-05, "loss": 0.0725, "step": 111800 }, { "epoch": 1.646882961959323, "grad_norm": 1.1744948625564575, "learning_rate": 1.490584315701971e-05, "loss": 0.0738, "step": 111825 }, { "epoch": 1.64725114504941, "grad_norm": 1.0973681211471558, "learning_rate": 1.4904206785052726e-05, "loss": 0.0686, "step": 111850 }, { "epoch": 1.647619328139497, "grad_norm": 1.0810407400131226, "learning_rate": 1.4902570413085739e-05, "loss": 0.0631, "step": 111875 }, { "epoch": 1.647987511229584, "grad_norm": 1.1640533208847046, "learning_rate": 1.4900934041118755e-05, "loss": 0.0696, "step": 111900 }, { "epoch": 1.6483556943196713, "grad_norm": 1.622637391090393, "learning_rate": 1.4899297669151771e-05, "loss": 0.0743, "step": 111925 }, { "epoch": 1.6487238774097583, "grad_norm": 0.9899869561195374, "learning_rate": 1.4897661297184787e-05, "loss": 0.0665, "step": 111950 }, { "epoch": 1.6490920604998454, "grad_norm": 1.7448487281799316, "learning_rate": 1.4896024925217802e-05, "loss": 0.0711, "step": 111975 }, { "epoch": 1.6494602435899324, "grad_norm": 1.130418062210083, "learning_rate": 1.4894388553250818e-05, "loss": 0.0662, "step": 112000 }, { "epoch": 1.6498284266800194, "grad_norm": 1.063306212425232, "learning_rate": 1.4892752181283834e-05, "loss": 0.0693, "step": 112025 }, { "epoch": 1.6501966097701066, "grad_norm": 1.5438015460968018, "learning_rate": 1.489111580931685e-05, "loss": 0.0643, "step": 112050 }, { "epoch": 1.6505647928601936, "grad_norm": 1.6509699821472168, "learning_rate": 1.4889479437349863e-05, "loss": 0.0723, "step": 112075 }, { "epoch": 1.6509329759502807, "grad_norm": 1.7110092639923096, "learning_rate": 1.4887843065382879e-05, "loss": 0.0761, "step": 112100 }, { "epoch": 1.6513011590403677, "grad_norm": 1.7650585174560547, "learning_rate": 1.4886272148294575e-05, "loss": 0.0758, "step": 112125 }, { "epoch": 1.6516693421304547, "grad_norm": 1.121733546257019, "learning_rate": 1.4884635776327589e-05, "loss": 0.0812, "step": 112150 }, { "epoch": 1.6520375252205417, "grad_norm": 1.4370988607406616, "learning_rate": 1.4882999404360605e-05, "loss": 0.0686, "step": 112175 }, { "epoch": 1.6524057083106287, "grad_norm": 1.1857341527938843, "learning_rate": 1.4881363032393621e-05, "loss": 0.0708, "step": 112200 }, { "epoch": 1.6527738914007157, "grad_norm": 1.7822493314743042, "learning_rate": 1.4879726660426636e-05, "loss": 0.0746, "step": 112225 }, { "epoch": 1.6531420744908027, "grad_norm": 1.1039087772369385, "learning_rate": 1.4878090288459652e-05, "loss": 0.0728, "step": 112250 }, { "epoch": 1.6535102575808898, "grad_norm": 1.7532680034637451, "learning_rate": 1.4876453916492668e-05, "loss": 0.0731, "step": 112275 }, { "epoch": 1.6538784406709768, "grad_norm": 1.311147928237915, "learning_rate": 1.4874817544525681e-05, "loss": 0.0797, "step": 112300 }, { "epoch": 1.6542466237610638, "grad_norm": 1.24696683883667, "learning_rate": 1.4873181172558697e-05, "loss": 0.075, "step": 112325 }, { "epoch": 1.6546148068511508, "grad_norm": 1.130251169204712, "learning_rate": 1.4871544800591713e-05, "loss": 0.066, "step": 112350 }, { "epoch": 1.6549829899412378, "grad_norm": 1.2997519969940186, "learning_rate": 1.486990842862473e-05, "loss": 0.0693, "step": 112375 }, { "epoch": 1.655351173031325, "grad_norm": 1.2289658784866333, "learning_rate": 1.4868272056657744e-05, "loss": 0.0732, "step": 112400 }, { "epoch": 1.655719356121412, "grad_norm": 1.9662799835205078, "learning_rate": 1.486663568469076e-05, "loss": 0.0778, "step": 112425 }, { "epoch": 1.656087539211499, "grad_norm": 1.1644535064697266, "learning_rate": 1.4864999312723776e-05, "loss": 0.0637, "step": 112450 }, { "epoch": 1.656455722301586, "grad_norm": 1.3311115503311157, "learning_rate": 1.486336294075679e-05, "loss": 0.0722, "step": 112475 }, { "epoch": 1.6568239053916731, "grad_norm": 1.3293944597244263, "learning_rate": 1.4861726568789805e-05, "loss": 0.0756, "step": 112500 }, { "epoch": 1.6571920884817604, "grad_norm": 1.6116830110549927, "learning_rate": 1.4860090196822821e-05, "loss": 0.0734, "step": 112525 }, { "epoch": 1.6575602715718474, "grad_norm": 1.4583842754364014, "learning_rate": 1.4858453824855836e-05, "loss": 0.0675, "step": 112550 }, { "epoch": 1.6579284546619344, "grad_norm": 1.2287795543670654, "learning_rate": 1.4856817452888852e-05, "loss": 0.0708, "step": 112575 }, { "epoch": 1.6582966377520214, "grad_norm": 2.1830837726593018, "learning_rate": 1.4855181080921868e-05, "loss": 0.0693, "step": 112600 }, { "epoch": 1.6586648208421084, "grad_norm": 1.717875599861145, "learning_rate": 1.4853544708954884e-05, "loss": 0.0669, "step": 112625 }, { "epoch": 1.6590330039321954, "grad_norm": 1.4657039642333984, "learning_rate": 1.4851908336987899e-05, "loss": 0.0661, "step": 112650 }, { "epoch": 1.6594011870222825, "grad_norm": 1.6358797550201416, "learning_rate": 1.4850271965020915e-05, "loss": 0.0728, "step": 112675 }, { "epoch": 1.6597693701123695, "grad_norm": 1.5901389122009277, "learning_rate": 1.4848635593053931e-05, "loss": 0.0708, "step": 112700 }, { "epoch": 1.6601375532024565, "grad_norm": 0.8792926073074341, "learning_rate": 1.4846999221086944e-05, "loss": 0.072, "step": 112725 }, { "epoch": 1.6605057362925435, "grad_norm": 1.4907500743865967, "learning_rate": 1.484536284911996e-05, "loss": 0.0625, "step": 112750 }, { "epoch": 1.6608739193826305, "grad_norm": 1.5523349046707153, "learning_rate": 1.4843726477152976e-05, "loss": 0.076, "step": 112775 }, { "epoch": 1.6612421024727175, "grad_norm": 0.7739635705947876, "learning_rate": 1.484209010518599e-05, "loss": 0.0704, "step": 112800 }, { "epoch": 1.6616102855628045, "grad_norm": 1.430060625076294, "learning_rate": 1.4840453733219007e-05, "loss": 0.0669, "step": 112825 }, { "epoch": 1.6619784686528916, "grad_norm": 1.6013901233673096, "learning_rate": 1.4838817361252023e-05, "loss": 0.0757, "step": 112850 }, { "epoch": 1.6623466517429788, "grad_norm": 1.3758858442306519, "learning_rate": 1.4837180989285039e-05, "loss": 0.0681, "step": 112875 }, { "epoch": 1.6627148348330658, "grad_norm": 1.733469009399414, "learning_rate": 1.4835544617318053e-05, "loss": 0.0683, "step": 112900 }, { "epoch": 1.6630830179231528, "grad_norm": 1.7212387323379517, "learning_rate": 1.4833908245351068e-05, "loss": 0.0678, "step": 112925 }, { "epoch": 1.6634512010132398, "grad_norm": 1.648301362991333, "learning_rate": 1.4832271873384084e-05, "loss": 0.0709, "step": 112950 }, { "epoch": 1.6638193841033269, "grad_norm": 1.7456063032150269, "learning_rate": 1.4830635501417098e-05, "loss": 0.0702, "step": 112975 }, { "epoch": 1.664187567193414, "grad_norm": 0.9833461046218872, "learning_rate": 1.4828999129450115e-05, "loss": 0.0722, "step": 113000 }, { "epoch": 1.6645557502835011, "grad_norm": 1.1886141300201416, "learning_rate": 1.482736275748313e-05, "loss": 0.0699, "step": 113025 }, { "epoch": 1.6649239333735881, "grad_norm": 1.4260448217391968, "learning_rate": 1.4825726385516145e-05, "loss": 0.0681, "step": 113050 }, { "epoch": 1.6652921164636751, "grad_norm": 1.5015608072280884, "learning_rate": 1.4824090013549161e-05, "loss": 0.0729, "step": 113075 }, { "epoch": 1.6656602995537622, "grad_norm": 1.091711163520813, "learning_rate": 1.4822453641582177e-05, "loss": 0.0641, "step": 113100 }, { "epoch": 1.6660284826438492, "grad_norm": 0.8716529607772827, "learning_rate": 1.4820817269615194e-05, "loss": 0.0645, "step": 113125 }, { "epoch": 1.6663966657339362, "grad_norm": 1.4132845401763916, "learning_rate": 1.4819180897648206e-05, "loss": 0.0755, "step": 113150 }, { "epoch": 1.6667648488240232, "grad_norm": 1.70515775680542, "learning_rate": 1.4817544525681222e-05, "loss": 0.0722, "step": 113175 }, { "epoch": 1.6671330319141102, "grad_norm": 1.4492024183273315, "learning_rate": 1.4815908153714239e-05, "loss": 0.0834, "step": 113200 }, { "epoch": 1.6675012150041972, "grad_norm": 1.0968266725540161, "learning_rate": 1.4814271781747253e-05, "loss": 0.0671, "step": 113225 }, { "epoch": 1.6678693980942843, "grad_norm": 1.6026828289031982, "learning_rate": 1.481263540978027e-05, "loss": 0.0753, "step": 113250 }, { "epoch": 1.6682375811843713, "grad_norm": 1.0314505100250244, "learning_rate": 1.4810999037813285e-05, "loss": 0.0758, "step": 113275 }, { "epoch": 1.6686057642744583, "grad_norm": 1.3181610107421875, "learning_rate": 1.4809362665846302e-05, "loss": 0.0714, "step": 113300 }, { "epoch": 1.6689739473645453, "grad_norm": 1.8939374685287476, "learning_rate": 1.4807726293879316e-05, "loss": 0.0778, "step": 113325 }, { "epoch": 1.6693421304546325, "grad_norm": 1.4272085428237915, "learning_rate": 1.480608992191233e-05, "loss": 0.0719, "step": 113350 }, { "epoch": 1.6697103135447195, "grad_norm": 1.146855115890503, "learning_rate": 1.4804453549945345e-05, "loss": 0.0646, "step": 113375 }, { "epoch": 1.6700784966348066, "grad_norm": 1.147344946861267, "learning_rate": 1.4802817177978361e-05, "loss": 0.0672, "step": 113400 }, { "epoch": 1.6704466797248936, "grad_norm": 1.5350712537765503, "learning_rate": 1.4801180806011377e-05, "loss": 0.0679, "step": 113425 }, { "epoch": 1.6708148628149806, "grad_norm": 1.4562945365905762, "learning_rate": 1.4799544434044393e-05, "loss": 0.0729, "step": 113450 }, { "epoch": 1.6711830459050678, "grad_norm": 1.4934054613113403, "learning_rate": 1.4797908062077408e-05, "loss": 0.0725, "step": 113475 }, { "epoch": 1.6715512289951548, "grad_norm": 1.9927408695220947, "learning_rate": 1.4796271690110424e-05, "loss": 0.068, "step": 113500 }, { "epoch": 1.6719194120852419, "grad_norm": 1.384314775466919, "learning_rate": 1.479463531814344e-05, "loss": 0.0638, "step": 113525 }, { "epoch": 1.6722875951753289, "grad_norm": 1.1050392389297485, "learning_rate": 1.4792998946176453e-05, "loss": 0.0702, "step": 113550 }, { "epoch": 1.672655778265416, "grad_norm": 2.284367322921753, "learning_rate": 1.4791362574209469e-05, "loss": 0.0704, "step": 113575 }, { "epoch": 1.673023961355503, "grad_norm": 1.4867076873779297, "learning_rate": 1.4789726202242485e-05, "loss": 0.0771, "step": 113600 }, { "epoch": 1.67339214444559, "grad_norm": 1.328139066696167, "learning_rate": 1.47880898302755e-05, "loss": 0.0681, "step": 113625 }, { "epoch": 1.673760327535677, "grad_norm": 1.6640952825546265, "learning_rate": 1.4786453458308516e-05, "loss": 0.072, "step": 113650 }, { "epoch": 1.674128510625764, "grad_norm": 1.2903510332107544, "learning_rate": 1.4784817086341532e-05, "loss": 0.0758, "step": 113675 }, { "epoch": 1.674496693715851, "grad_norm": 1.137075424194336, "learning_rate": 1.4783180714374548e-05, "loss": 0.0724, "step": 113700 }, { "epoch": 1.674864876805938, "grad_norm": 1.641725778579712, "learning_rate": 1.4781544342407563e-05, "loss": 0.0606, "step": 113725 }, { "epoch": 1.675233059896025, "grad_norm": 1.3679289817810059, "learning_rate": 1.4779907970440579e-05, "loss": 0.0649, "step": 113750 }, { "epoch": 1.675601242986112, "grad_norm": 1.2159751653671265, "learning_rate": 1.4778271598473593e-05, "loss": 0.0721, "step": 113775 }, { "epoch": 1.675969426076199, "grad_norm": 2.194733142852783, "learning_rate": 1.4776635226506608e-05, "loss": 0.0752, "step": 113800 }, { "epoch": 1.676337609166286, "grad_norm": 1.3089299201965332, "learning_rate": 1.4774998854539624e-05, "loss": 0.0662, "step": 113825 }, { "epoch": 1.6767057922563733, "grad_norm": 1.672057867050171, "learning_rate": 1.477336248257264e-05, "loss": 0.0747, "step": 113850 }, { "epoch": 1.6770739753464603, "grad_norm": 1.4456645250320435, "learning_rate": 1.4771726110605656e-05, "loss": 0.0701, "step": 113875 }, { "epoch": 1.6774421584365473, "grad_norm": 1.05272376537323, "learning_rate": 1.477008973863867e-05, "loss": 0.0726, "step": 113900 }, { "epoch": 1.6778103415266343, "grad_norm": 1.6001595258712769, "learning_rate": 1.4768453366671687e-05, "loss": 0.0696, "step": 113925 }, { "epoch": 1.6781785246167216, "grad_norm": 1.5390887260437012, "learning_rate": 1.4766816994704703e-05, "loss": 0.0618, "step": 113950 }, { "epoch": 1.6785467077068086, "grad_norm": 1.4526466131210327, "learning_rate": 1.4765180622737716e-05, "loss": 0.0627, "step": 113975 }, { "epoch": 1.6789148907968956, "grad_norm": 1.9929816722869873, "learning_rate": 1.4763544250770732e-05, "loss": 0.0741, "step": 114000 }, { "epoch": 1.6792830738869826, "grad_norm": 1.51315176486969, "learning_rate": 1.4761907878803748e-05, "loss": 0.0701, "step": 114025 }, { "epoch": 1.6796512569770696, "grad_norm": 1.6183960437774658, "learning_rate": 1.4760271506836762e-05, "loss": 0.0695, "step": 114050 }, { "epoch": 1.6800194400671566, "grad_norm": 1.849167823791504, "learning_rate": 1.4758635134869779e-05, "loss": 0.0739, "step": 114075 }, { "epoch": 1.6803876231572437, "grad_norm": 1.4521594047546387, "learning_rate": 1.4756998762902795e-05, "loss": 0.0761, "step": 114100 }, { "epoch": 1.6807558062473307, "grad_norm": 1.2524521350860596, "learning_rate": 1.475536239093581e-05, "loss": 0.0721, "step": 114125 }, { "epoch": 1.6811239893374177, "grad_norm": 1.1944258213043213, "learning_rate": 1.4753726018968825e-05, "loss": 0.0669, "step": 114150 }, { "epoch": 1.6814921724275047, "grad_norm": 1.4698984622955322, "learning_rate": 1.4752089647001841e-05, "loss": 0.0701, "step": 114175 }, { "epoch": 1.6818603555175917, "grad_norm": 1.1985230445861816, "learning_rate": 1.4750453275034854e-05, "loss": 0.0714, "step": 114200 }, { "epoch": 1.6822285386076787, "grad_norm": 1.5233608484268188, "learning_rate": 1.474881690306787e-05, "loss": 0.0722, "step": 114225 }, { "epoch": 1.6825967216977658, "grad_norm": 1.4962819814682007, "learning_rate": 1.4747180531100887e-05, "loss": 0.0659, "step": 114250 }, { "epoch": 1.6829649047878528, "grad_norm": 1.8392302989959717, "learning_rate": 1.4745544159133903e-05, "loss": 0.0691, "step": 114275 }, { "epoch": 1.6833330878779398, "grad_norm": 1.3911542892456055, "learning_rate": 1.4743907787166917e-05, "loss": 0.0736, "step": 114300 }, { "epoch": 1.683701270968027, "grad_norm": 1.573142409324646, "learning_rate": 1.4742271415199933e-05, "loss": 0.0755, "step": 114325 }, { "epoch": 1.684069454058114, "grad_norm": 1.6133112907409668, "learning_rate": 1.474063504323295e-05, "loss": 0.0691, "step": 114350 }, { "epoch": 1.684437637148201, "grad_norm": 1.4357415437698364, "learning_rate": 1.4738998671265966e-05, "loss": 0.0675, "step": 114375 }, { "epoch": 1.684805820238288, "grad_norm": 1.5966781377792358, "learning_rate": 1.4737362299298978e-05, "loss": 0.0736, "step": 114400 }, { "epoch": 1.685174003328375, "grad_norm": 1.2438653707504272, "learning_rate": 1.4735725927331994e-05, "loss": 0.0755, "step": 114425 }, { "epoch": 1.6855421864184623, "grad_norm": 1.1986197233200073, "learning_rate": 1.473408955536501e-05, "loss": 0.0697, "step": 114450 }, { "epoch": 1.6859103695085493, "grad_norm": 1.7107959985733032, "learning_rate": 1.4732453183398025e-05, "loss": 0.0787, "step": 114475 }, { "epoch": 1.6862785525986363, "grad_norm": 1.837692379951477, "learning_rate": 1.4730816811431041e-05, "loss": 0.0729, "step": 114500 }, { "epoch": 1.6866467356887234, "grad_norm": 1.9337834119796753, "learning_rate": 1.4729180439464057e-05, "loss": 0.0735, "step": 114525 }, { "epoch": 1.6870149187788104, "grad_norm": 1.1843305826187134, "learning_rate": 1.4727544067497072e-05, "loss": 0.0601, "step": 114550 }, { "epoch": 1.6873831018688974, "grad_norm": 1.3090814352035522, "learning_rate": 1.4725907695530088e-05, "loss": 0.0717, "step": 114575 }, { "epoch": 1.6877512849589844, "grad_norm": 1.0865365266799927, "learning_rate": 1.4724271323563104e-05, "loss": 0.0708, "step": 114600 }, { "epoch": 1.6881194680490714, "grad_norm": 1.207167148590088, "learning_rate": 1.4722700406474796e-05, "loss": 0.0719, "step": 114625 }, { "epoch": 1.6884876511391584, "grad_norm": 1.486923336982727, "learning_rate": 1.4721064034507813e-05, "loss": 0.0691, "step": 114650 }, { "epoch": 1.6888558342292455, "grad_norm": 1.125577449798584, "learning_rate": 1.4719427662540829e-05, "loss": 0.0745, "step": 114675 }, { "epoch": 1.6892240173193325, "grad_norm": 1.625648021697998, "learning_rate": 1.4717791290573845e-05, "loss": 0.0763, "step": 114700 }, { "epoch": 1.6895922004094195, "grad_norm": 1.4399183988571167, "learning_rate": 1.471615491860686e-05, "loss": 0.0666, "step": 114725 }, { "epoch": 1.6899603834995065, "grad_norm": 1.0632739067077637, "learning_rate": 1.4714518546639875e-05, "loss": 0.0692, "step": 114750 }, { "epoch": 1.6903285665895935, "grad_norm": 1.558274269104004, "learning_rate": 1.4712882174672892e-05, "loss": 0.0666, "step": 114775 }, { "epoch": 1.6906967496796808, "grad_norm": 1.118222951889038, "learning_rate": 1.4711245802705906e-05, "loss": 0.0644, "step": 114800 }, { "epoch": 1.6910649327697678, "grad_norm": 1.4945584535598755, "learning_rate": 1.470960943073892e-05, "loss": 0.0667, "step": 114825 }, { "epoch": 1.6914331158598548, "grad_norm": 1.81775963306427, "learning_rate": 1.4707973058771937e-05, "loss": 0.0754, "step": 114850 }, { "epoch": 1.6918012989499418, "grad_norm": 1.2940665483474731, "learning_rate": 1.4706336686804951e-05, "loss": 0.069, "step": 114875 }, { "epoch": 1.6921694820400288, "grad_norm": 1.1451246738433838, "learning_rate": 1.4704700314837967e-05, "loss": 0.07, "step": 114900 }, { "epoch": 1.692537665130116, "grad_norm": 1.3590253591537476, "learning_rate": 1.4703063942870983e-05, "loss": 0.0708, "step": 114925 }, { "epoch": 1.692905848220203, "grad_norm": 1.463120460510254, "learning_rate": 1.4701427570904e-05, "loss": 0.068, "step": 114950 }, { "epoch": 1.69327403131029, "grad_norm": 1.0935049057006836, "learning_rate": 1.4699791198937014e-05, "loss": 0.0754, "step": 114975 }, { "epoch": 1.693642214400377, "grad_norm": 1.171722412109375, "learning_rate": 1.469815482697003e-05, "loss": 0.0719, "step": 115000 }, { "epoch": 1.6940103974904641, "grad_norm": 2.0070700645446777, "learning_rate": 1.4696518455003043e-05, "loss": 0.0775, "step": 115025 }, { "epoch": 1.6943785805805511, "grad_norm": 1.0836679935455322, "learning_rate": 1.4694882083036059e-05, "loss": 0.0631, "step": 115050 }, { "epoch": 1.6947467636706381, "grad_norm": 1.061313509941101, "learning_rate": 1.4693245711069075e-05, "loss": 0.0709, "step": 115075 }, { "epoch": 1.6951149467607252, "grad_norm": 1.680694580078125, "learning_rate": 1.4691609339102091e-05, "loss": 0.0631, "step": 115100 }, { "epoch": 1.6954831298508122, "grad_norm": 1.333789348602295, "learning_rate": 1.4689972967135106e-05, "loss": 0.0697, "step": 115125 }, { "epoch": 1.6958513129408992, "grad_norm": 1.414815068244934, "learning_rate": 1.4688336595168122e-05, "loss": 0.066, "step": 115150 }, { "epoch": 1.6962194960309862, "grad_norm": 1.1994245052337646, "learning_rate": 1.4686700223201138e-05, "loss": 0.0718, "step": 115175 }, { "epoch": 1.6965876791210732, "grad_norm": 1.5072047710418701, "learning_rate": 1.4685063851234154e-05, "loss": 0.0746, "step": 115200 }, { "epoch": 1.6969558622111602, "grad_norm": 1.4426829814910889, "learning_rate": 1.4683427479267169e-05, "loss": 0.0732, "step": 115225 }, { "epoch": 1.6973240453012473, "grad_norm": 1.1986182928085327, "learning_rate": 1.4681791107300183e-05, "loss": 0.0651, "step": 115250 }, { "epoch": 1.6976922283913345, "grad_norm": 1.66432523727417, "learning_rate": 1.46801547353332e-05, "loss": 0.0768, "step": 115275 }, { "epoch": 1.6980604114814215, "grad_norm": 1.262149453163147, "learning_rate": 1.4678518363366214e-05, "loss": 0.0759, "step": 115300 }, { "epoch": 1.6984285945715085, "grad_norm": 1.1279561519622803, "learning_rate": 1.467688199139923e-05, "loss": 0.0751, "step": 115325 }, { "epoch": 1.6987967776615955, "grad_norm": 1.1017858982086182, "learning_rate": 1.4675245619432246e-05, "loss": 0.0631, "step": 115350 }, { "epoch": 1.6991649607516826, "grad_norm": 1.5601005554199219, "learning_rate": 1.467360924746526e-05, "loss": 0.0715, "step": 115375 }, { "epoch": 1.6995331438417698, "grad_norm": 1.4502965211868286, "learning_rate": 1.4671972875498277e-05, "loss": 0.0718, "step": 115400 }, { "epoch": 1.6999013269318568, "grad_norm": 1.2339082956314087, "learning_rate": 1.4670336503531293e-05, "loss": 0.0658, "step": 115425 }, { "epoch": 1.7002695100219438, "grad_norm": 1.4210634231567383, "learning_rate": 1.4668700131564306e-05, "loss": 0.071, "step": 115450 }, { "epoch": 1.7006376931120308, "grad_norm": 1.2773572206497192, "learning_rate": 1.4667063759597322e-05, "loss": 0.0694, "step": 115475 }, { "epoch": 1.7010058762021179, "grad_norm": 1.2864840030670166, "learning_rate": 1.4665427387630338e-05, "loss": 0.0742, "step": 115500 }, { "epoch": 1.7013740592922049, "grad_norm": 1.39313542842865, "learning_rate": 1.4663791015663354e-05, "loss": 0.0715, "step": 115525 }, { "epoch": 1.7017422423822919, "grad_norm": 1.2489782571792603, "learning_rate": 1.4662154643696369e-05, "loss": 0.0743, "step": 115550 }, { "epoch": 1.702110425472379, "grad_norm": 1.1297526359558105, "learning_rate": 1.4660518271729385e-05, "loss": 0.0738, "step": 115575 }, { "epoch": 1.702478608562466, "grad_norm": 1.6708920001983643, "learning_rate": 1.46588818997624e-05, "loss": 0.0774, "step": 115600 }, { "epoch": 1.702846791652553, "grad_norm": 1.5281175374984741, "learning_rate": 1.4657245527795417e-05, "loss": 0.0586, "step": 115625 }, { "epoch": 1.70321497474264, "grad_norm": 1.1840823888778687, "learning_rate": 1.4655609155828431e-05, "loss": 0.059, "step": 115650 }, { "epoch": 1.703583157832727, "grad_norm": 1.2361899614334106, "learning_rate": 1.4653972783861446e-05, "loss": 0.0681, "step": 115675 }, { "epoch": 1.703951340922814, "grad_norm": 1.485796570777893, "learning_rate": 1.465233641189446e-05, "loss": 0.0772, "step": 115700 }, { "epoch": 1.704319524012901, "grad_norm": 1.5074681043624878, "learning_rate": 1.4650700039927477e-05, "loss": 0.0713, "step": 115725 }, { "epoch": 1.7046877071029882, "grad_norm": 1.3141595125198364, "learning_rate": 1.4649063667960493e-05, "loss": 0.0773, "step": 115750 }, { "epoch": 1.7050558901930752, "grad_norm": 1.017120361328125, "learning_rate": 1.4647427295993509e-05, "loss": 0.0653, "step": 115775 }, { "epoch": 1.7054240732831623, "grad_norm": 2.0109574794769287, "learning_rate": 1.4645790924026523e-05, "loss": 0.0718, "step": 115800 }, { "epoch": 1.7057922563732493, "grad_norm": 1.1323426961898804, "learning_rate": 1.464415455205954e-05, "loss": 0.0702, "step": 115825 }, { "epoch": 1.7061604394633363, "grad_norm": 1.0215483903884888, "learning_rate": 1.4642518180092556e-05, "loss": 0.0668, "step": 115850 }, { "epoch": 1.7065286225534235, "grad_norm": 1.184708833694458, "learning_rate": 1.4640881808125568e-05, "loss": 0.0728, "step": 115875 }, { "epoch": 1.7068968056435105, "grad_norm": 1.4762532711029053, "learning_rate": 1.4639245436158584e-05, "loss": 0.0685, "step": 115900 }, { "epoch": 1.7072649887335976, "grad_norm": 0.849449098110199, "learning_rate": 1.46376090641916e-05, "loss": 0.07, "step": 115925 }, { "epoch": 1.7076331718236846, "grad_norm": 1.733501672744751, "learning_rate": 1.4635972692224615e-05, "loss": 0.0747, "step": 115950 }, { "epoch": 1.7080013549137716, "grad_norm": 1.7008428573608398, "learning_rate": 1.4634336320257631e-05, "loss": 0.0746, "step": 115975 }, { "epoch": 1.7083695380038586, "grad_norm": 1.203247308731079, "learning_rate": 1.4632699948290647e-05, "loss": 0.0759, "step": 116000 }, { "epoch": 1.7087377210939456, "grad_norm": 1.3310378789901733, "learning_rate": 1.4631063576323664e-05, "loss": 0.0689, "step": 116025 }, { "epoch": 1.7091059041840326, "grad_norm": 1.4498660564422607, "learning_rate": 1.4629427204356678e-05, "loss": 0.0683, "step": 116050 }, { "epoch": 1.7094740872741196, "grad_norm": 1.5515391826629639, "learning_rate": 1.4627790832389694e-05, "loss": 0.0703, "step": 116075 }, { "epoch": 1.7098422703642067, "grad_norm": 1.4486724138259888, "learning_rate": 1.4626154460422709e-05, "loss": 0.0663, "step": 116100 }, { "epoch": 1.7102104534542937, "grad_norm": 1.1945711374282837, "learning_rate": 1.4624518088455723e-05, "loss": 0.0691, "step": 116125 }, { "epoch": 1.7105786365443807, "grad_norm": 1.4841253757476807, "learning_rate": 1.462288171648874e-05, "loss": 0.0704, "step": 116150 }, { "epoch": 1.7109468196344677, "grad_norm": 1.2552186250686646, "learning_rate": 1.4621245344521755e-05, "loss": 0.0757, "step": 116175 }, { "epoch": 1.7113150027245547, "grad_norm": 1.0416038036346436, "learning_rate": 1.4619608972554772e-05, "loss": 0.0716, "step": 116200 }, { "epoch": 1.7116831858146417, "grad_norm": 1.1674039363861084, "learning_rate": 1.4617972600587786e-05, "loss": 0.0712, "step": 116225 }, { "epoch": 1.712051368904729, "grad_norm": 1.4191275835037231, "learning_rate": 1.4616336228620802e-05, "loss": 0.0711, "step": 116250 }, { "epoch": 1.712419551994816, "grad_norm": 1.4573978185653687, "learning_rate": 1.4614699856653818e-05, "loss": 0.0711, "step": 116275 }, { "epoch": 1.712787735084903, "grad_norm": 0.7501809597015381, "learning_rate": 1.4613063484686831e-05, "loss": 0.065, "step": 116300 }, { "epoch": 1.71315591817499, "grad_norm": 1.1468013525009155, "learning_rate": 1.4611427112719847e-05, "loss": 0.0708, "step": 116325 }, { "epoch": 1.7135241012650773, "grad_norm": 1.7565195560455322, "learning_rate": 1.4609790740752863e-05, "loss": 0.0685, "step": 116350 }, { "epoch": 1.7138922843551643, "grad_norm": 1.2514381408691406, "learning_rate": 1.4608154368785878e-05, "loss": 0.0656, "step": 116375 }, { "epoch": 1.7142604674452513, "grad_norm": 1.3440977334976196, "learning_rate": 1.4606517996818894e-05, "loss": 0.0675, "step": 116400 }, { "epoch": 1.7146286505353383, "grad_norm": 1.4262274503707886, "learning_rate": 1.460488162485191e-05, "loss": 0.0674, "step": 116425 }, { "epoch": 1.7149968336254253, "grad_norm": 1.433107614517212, "learning_rate": 1.4603245252884926e-05, "loss": 0.0706, "step": 116450 }, { "epoch": 1.7153650167155123, "grad_norm": 1.055009126663208, "learning_rate": 1.460160888091794e-05, "loss": 0.0737, "step": 116475 }, { "epoch": 1.7157331998055994, "grad_norm": 1.516435980796814, "learning_rate": 1.4599972508950955e-05, "loss": 0.0684, "step": 116500 }, { "epoch": 1.7161013828956864, "grad_norm": 1.2547600269317627, "learning_rate": 1.459833613698397e-05, "loss": 0.0715, "step": 116525 }, { "epoch": 1.7164695659857734, "grad_norm": 1.6059954166412354, "learning_rate": 1.4596699765016986e-05, "loss": 0.0666, "step": 116550 }, { "epoch": 1.7168377490758604, "grad_norm": 1.5357228517532349, "learning_rate": 1.4595063393050002e-05, "loss": 0.0764, "step": 116575 }, { "epoch": 1.7172059321659474, "grad_norm": 1.314332127571106, "learning_rate": 1.4593427021083018e-05, "loss": 0.0745, "step": 116600 }, { "epoch": 1.7175741152560344, "grad_norm": 1.4616749286651611, "learning_rate": 1.4591790649116033e-05, "loss": 0.0719, "step": 116625 }, { "epoch": 1.7179422983461214, "grad_norm": 1.48330557346344, "learning_rate": 1.4590154277149049e-05, "loss": 0.0741, "step": 116650 }, { "epoch": 1.7183104814362085, "grad_norm": 1.3130346536636353, "learning_rate": 1.4588517905182065e-05, "loss": 0.0674, "step": 116675 }, { "epoch": 1.7186786645262955, "grad_norm": 1.320476770401001, "learning_rate": 1.4586881533215081e-05, "loss": 0.0703, "step": 116700 }, { "epoch": 1.7190468476163827, "grad_norm": 1.2791556119918823, "learning_rate": 1.4585245161248094e-05, "loss": 0.0775, "step": 116725 }, { "epoch": 1.7194150307064697, "grad_norm": 1.0319461822509766, "learning_rate": 1.458360878928111e-05, "loss": 0.0685, "step": 116750 }, { "epoch": 1.7197832137965567, "grad_norm": 1.420870065689087, "learning_rate": 1.4581972417314126e-05, "loss": 0.0643, "step": 116775 }, { "epoch": 1.7201513968866438, "grad_norm": 0.9832098484039307, "learning_rate": 1.458033604534714e-05, "loss": 0.0701, "step": 116800 }, { "epoch": 1.720519579976731, "grad_norm": 1.3880716562271118, "learning_rate": 1.4578699673380157e-05, "loss": 0.0647, "step": 116825 }, { "epoch": 1.720887763066818, "grad_norm": 1.3137727975845337, "learning_rate": 1.4577063301413173e-05, "loss": 0.0664, "step": 116850 }, { "epoch": 1.721255946156905, "grad_norm": 1.3756775856018066, "learning_rate": 1.4575426929446187e-05, "loss": 0.0693, "step": 116875 }, { "epoch": 1.721624129246992, "grad_norm": 1.3043729066848755, "learning_rate": 1.4573790557479203e-05, "loss": 0.068, "step": 116900 }, { "epoch": 1.721992312337079, "grad_norm": 0.9949998259544373, "learning_rate": 1.4572154185512218e-05, "loss": 0.0726, "step": 116925 }, { "epoch": 1.722360495427166, "grad_norm": 1.6280750036239624, "learning_rate": 1.4570517813545232e-05, "loss": 0.0674, "step": 116950 }, { "epoch": 1.722728678517253, "grad_norm": 1.2649887800216675, "learning_rate": 1.4568881441578249e-05, "loss": 0.0676, "step": 116975 }, { "epoch": 1.72309686160734, "grad_norm": 1.6157060861587524, "learning_rate": 1.4567245069611265e-05, "loss": 0.0729, "step": 117000 }, { "epoch": 1.7234650446974271, "grad_norm": 1.3674312829971313, "learning_rate": 1.456560869764428e-05, "loss": 0.0677, "step": 117025 }, { "epoch": 1.7238332277875141, "grad_norm": 1.454086184501648, "learning_rate": 1.4563972325677295e-05, "loss": 0.0672, "step": 117050 }, { "epoch": 1.7242014108776011, "grad_norm": 1.434951663017273, "learning_rate": 1.4562335953710311e-05, "loss": 0.0694, "step": 117075 }, { "epoch": 1.7245695939676882, "grad_norm": 1.2908538579940796, "learning_rate": 1.4560699581743328e-05, "loss": 0.0654, "step": 117100 }, { "epoch": 1.7249377770577752, "grad_norm": 1.363983392715454, "learning_rate": 1.4559128664655021e-05, "loss": 0.0723, "step": 117125 }, { "epoch": 1.7253059601478622, "grad_norm": 1.0666635036468506, "learning_rate": 1.4557492292688036e-05, "loss": 0.0667, "step": 117150 }, { "epoch": 1.7256741432379492, "grad_norm": 1.223769187927246, "learning_rate": 1.4555855920721052e-05, "loss": 0.0632, "step": 117175 }, { "epoch": 1.7260423263280364, "grad_norm": 1.1432114839553833, "learning_rate": 1.4554219548754067e-05, "loss": 0.0716, "step": 117200 }, { "epoch": 1.7264105094181235, "grad_norm": 0.8071144819259644, "learning_rate": 1.4552583176787083e-05, "loss": 0.0751, "step": 117225 }, { "epoch": 1.7267786925082105, "grad_norm": 1.6455711126327515, "learning_rate": 1.4550946804820099e-05, "loss": 0.0653, "step": 117250 }, { "epoch": 1.7271468755982975, "grad_norm": 1.1311428546905518, "learning_rate": 1.4549310432853115e-05, "loss": 0.066, "step": 117275 }, { "epoch": 1.7275150586883845, "grad_norm": 1.388342261314392, "learning_rate": 1.454767406088613e-05, "loss": 0.0651, "step": 117300 }, { "epoch": 1.7278832417784717, "grad_norm": 1.2155027389526367, "learning_rate": 1.4546037688919146e-05, "loss": 0.0728, "step": 117325 }, { "epoch": 1.7282514248685588, "grad_norm": 1.494720697402954, "learning_rate": 1.4544401316952158e-05, "loss": 0.0783, "step": 117350 }, { "epoch": 1.7286196079586458, "grad_norm": 1.4385688304901123, "learning_rate": 1.4542764944985175e-05, "loss": 0.067, "step": 117375 }, { "epoch": 1.7289877910487328, "grad_norm": 1.5436705350875854, "learning_rate": 1.454112857301819e-05, "loss": 0.0753, "step": 117400 }, { "epoch": 1.7293559741388198, "grad_norm": 1.8978846073150635, "learning_rate": 1.4539492201051207e-05, "loss": 0.0686, "step": 117425 }, { "epoch": 1.7297241572289068, "grad_norm": 1.692796230316162, "learning_rate": 1.4537855829084221e-05, "loss": 0.071, "step": 117450 }, { "epoch": 1.7300923403189938, "grad_norm": 1.5229357481002808, "learning_rate": 1.4536219457117237e-05, "loss": 0.0704, "step": 117475 }, { "epoch": 1.7304605234090809, "grad_norm": 1.0635727643966675, "learning_rate": 1.4534583085150254e-05, "loss": 0.0653, "step": 117500 }, { "epoch": 1.7308287064991679, "grad_norm": 1.2229571342468262, "learning_rate": 1.453294671318327e-05, "loss": 0.0674, "step": 117525 }, { "epoch": 1.7311968895892549, "grad_norm": 1.3343393802642822, "learning_rate": 1.4531310341216284e-05, "loss": 0.0666, "step": 117550 }, { "epoch": 1.731565072679342, "grad_norm": 1.3549617528915405, "learning_rate": 1.4529673969249299e-05, "loss": 0.072, "step": 117575 }, { "epoch": 1.731933255769429, "grad_norm": 1.5496704578399658, "learning_rate": 1.4528037597282315e-05, "loss": 0.0718, "step": 117600 }, { "epoch": 1.732301438859516, "grad_norm": 1.4900168180465698, "learning_rate": 1.452640122531533e-05, "loss": 0.0713, "step": 117625 }, { "epoch": 1.732669621949603, "grad_norm": 1.6626604795455933, "learning_rate": 1.4524764853348345e-05, "loss": 0.0647, "step": 117650 }, { "epoch": 1.7330378050396902, "grad_norm": 1.471441626548767, "learning_rate": 1.4523128481381362e-05, "loss": 0.0656, "step": 117675 }, { "epoch": 1.7334059881297772, "grad_norm": 1.0678582191467285, "learning_rate": 1.4521492109414376e-05, "loss": 0.0719, "step": 117700 }, { "epoch": 1.7337741712198642, "grad_norm": 1.842948317527771, "learning_rate": 1.4519855737447392e-05, "loss": 0.0672, "step": 117725 }, { "epoch": 1.7341423543099512, "grad_norm": 1.5209269523620605, "learning_rate": 1.4518219365480408e-05, "loss": 0.0686, "step": 117750 }, { "epoch": 1.7345105374000382, "grad_norm": 1.6039365530014038, "learning_rate": 1.4516582993513421e-05, "loss": 0.0703, "step": 117775 }, { "epoch": 1.7348787204901255, "grad_norm": 1.14345383644104, "learning_rate": 1.4514946621546437e-05, "loss": 0.0722, "step": 117800 }, { "epoch": 1.7352469035802125, "grad_norm": 1.0637685060501099, "learning_rate": 1.4513310249579453e-05, "loss": 0.0661, "step": 117825 }, { "epoch": 1.7356150866702995, "grad_norm": 1.4392364025115967, "learning_rate": 1.451167387761247e-05, "loss": 0.0715, "step": 117850 }, { "epoch": 1.7359832697603865, "grad_norm": 1.082470417022705, "learning_rate": 1.4510037505645484e-05, "loss": 0.0691, "step": 117875 }, { "epoch": 1.7363514528504735, "grad_norm": 1.4789104461669922, "learning_rate": 1.45084011336785e-05, "loss": 0.0735, "step": 117900 }, { "epoch": 1.7367196359405606, "grad_norm": 1.1306986808776855, "learning_rate": 1.4506764761711516e-05, "loss": 0.0677, "step": 117925 }, { "epoch": 1.7370878190306476, "grad_norm": 1.4292935132980347, "learning_rate": 1.4505128389744532e-05, "loss": 0.0735, "step": 117950 }, { "epoch": 1.7374560021207346, "grad_norm": 1.753593921661377, "learning_rate": 1.4503492017777547e-05, "loss": 0.0683, "step": 117975 }, { "epoch": 1.7378241852108216, "grad_norm": 1.2248762845993042, "learning_rate": 1.4501855645810561e-05, "loss": 0.06, "step": 118000 }, { "epoch": 1.7381923683009086, "grad_norm": 0.9938359260559082, "learning_rate": 1.4500219273843576e-05, "loss": 0.0698, "step": 118025 }, { "epoch": 1.7385605513909956, "grad_norm": 1.7148356437683105, "learning_rate": 1.4498582901876592e-05, "loss": 0.077, "step": 118050 }, { "epoch": 1.7389287344810826, "grad_norm": 1.6144236326217651, "learning_rate": 1.4496946529909608e-05, "loss": 0.0653, "step": 118075 }, { "epoch": 1.7392969175711697, "grad_norm": 1.386655330657959, "learning_rate": 1.4495310157942624e-05, "loss": 0.0653, "step": 118100 }, { "epoch": 1.7396651006612567, "grad_norm": 1.6300657987594604, "learning_rate": 1.4493673785975639e-05, "loss": 0.0708, "step": 118125 }, { "epoch": 1.740033283751344, "grad_norm": 1.2578023672103882, "learning_rate": 1.4492037414008655e-05, "loss": 0.0687, "step": 118150 }, { "epoch": 1.740401466841431, "grad_norm": 1.3382031917572021, "learning_rate": 1.4490401042041671e-05, "loss": 0.0671, "step": 118175 }, { "epoch": 1.740769649931518, "grad_norm": 1.3886079788208008, "learning_rate": 1.4488764670074684e-05, "loss": 0.069, "step": 118200 }, { "epoch": 1.741137833021605, "grad_norm": 1.4775012731552124, "learning_rate": 1.44871282981077e-05, "loss": 0.076, "step": 118225 }, { "epoch": 1.741506016111692, "grad_norm": 1.1081944704055786, "learning_rate": 1.4485491926140716e-05, "loss": 0.0627, "step": 118250 }, { "epoch": 1.7418741992017792, "grad_norm": 1.001936435699463, "learning_rate": 1.448385555417373e-05, "loss": 0.0683, "step": 118275 }, { "epoch": 1.7422423822918662, "grad_norm": 1.6779181957244873, "learning_rate": 1.4482219182206747e-05, "loss": 0.0661, "step": 118300 }, { "epoch": 1.7426105653819532, "grad_norm": 1.1340508460998535, "learning_rate": 1.4480582810239763e-05, "loss": 0.0695, "step": 118325 }, { "epoch": 1.7429787484720403, "grad_norm": 1.366167426109314, "learning_rate": 1.4478946438272779e-05, "loss": 0.072, "step": 118350 }, { "epoch": 1.7433469315621273, "grad_norm": 1.2603939771652222, "learning_rate": 1.4477310066305793e-05, "loss": 0.0648, "step": 118375 }, { "epoch": 1.7437151146522143, "grad_norm": 1.5625187158584595, "learning_rate": 1.4475673694338808e-05, "loss": 0.0742, "step": 118400 }, { "epoch": 1.7440832977423013, "grad_norm": 2.004777431488037, "learning_rate": 1.4474037322371824e-05, "loss": 0.079, "step": 118425 }, { "epoch": 1.7444514808323883, "grad_norm": 1.3128420114517212, "learning_rate": 1.4472400950404839e-05, "loss": 0.0697, "step": 118450 }, { "epoch": 1.7448196639224753, "grad_norm": 1.628818392753601, "learning_rate": 1.4470764578437855e-05, "loss": 0.0746, "step": 118475 }, { "epoch": 1.7451878470125624, "grad_norm": 1.5265029668807983, "learning_rate": 1.446912820647087e-05, "loss": 0.0701, "step": 118500 }, { "epoch": 1.7455560301026494, "grad_norm": 0.6551544070243835, "learning_rate": 1.4467491834503887e-05, "loss": 0.0725, "step": 118525 }, { "epoch": 1.7459242131927364, "grad_norm": 1.2299360036849976, "learning_rate": 1.4465855462536901e-05, "loss": 0.0709, "step": 118550 }, { "epoch": 1.7462923962828234, "grad_norm": 1.4879930019378662, "learning_rate": 1.4464219090569918e-05, "loss": 0.0678, "step": 118575 }, { "epoch": 1.7466605793729104, "grad_norm": 1.5581705570220947, "learning_rate": 1.4462582718602934e-05, "loss": 0.065, "step": 118600 }, { "epoch": 1.7470287624629977, "grad_norm": 0.8866927623748779, "learning_rate": 1.4460946346635947e-05, "loss": 0.0649, "step": 118625 }, { "epoch": 1.7473969455530847, "grad_norm": 1.5363082885742188, "learning_rate": 1.4459309974668963e-05, "loss": 0.0638, "step": 118650 }, { "epoch": 1.7477651286431717, "grad_norm": 0.9323269128799438, "learning_rate": 1.4457673602701979e-05, "loss": 0.0738, "step": 118675 }, { "epoch": 1.7481333117332587, "grad_norm": 1.2126998901367188, "learning_rate": 1.4456037230734993e-05, "loss": 0.0699, "step": 118700 }, { "epoch": 1.7485014948233457, "grad_norm": 1.1231118440628052, "learning_rate": 1.445440085876801e-05, "loss": 0.0572, "step": 118725 }, { "epoch": 1.748869677913433, "grad_norm": 1.0999683141708374, "learning_rate": 1.4452764486801026e-05, "loss": 0.0751, "step": 118750 }, { "epoch": 1.74923786100352, "grad_norm": 1.5710248947143555, "learning_rate": 1.4451128114834042e-05, "loss": 0.0712, "step": 118775 }, { "epoch": 1.749606044093607, "grad_norm": 1.6222867965698242, "learning_rate": 1.4449491742867056e-05, "loss": 0.0709, "step": 118800 }, { "epoch": 1.749974227183694, "grad_norm": 1.2778717279434204, "learning_rate": 1.444785537090007e-05, "loss": 0.0717, "step": 118825 }, { "epoch": 1.750342410273781, "grad_norm": 0.9944702982902527, "learning_rate": 1.4446218998933085e-05, "loss": 0.0659, "step": 118850 }, { "epoch": 1.750710593363868, "grad_norm": 1.477279543876648, "learning_rate": 1.4444582626966101e-05, "loss": 0.0701, "step": 118875 }, { "epoch": 1.751078776453955, "grad_norm": 1.252055048942566, "learning_rate": 1.4442946254999117e-05, "loss": 0.0696, "step": 118900 }, { "epoch": 1.751446959544042, "grad_norm": 1.0778433084487915, "learning_rate": 1.4441309883032134e-05, "loss": 0.0674, "step": 118925 }, { "epoch": 1.751815142634129, "grad_norm": 0.9636226892471313, "learning_rate": 1.4439673511065148e-05, "loss": 0.073, "step": 118950 }, { "epoch": 1.752183325724216, "grad_norm": 1.8021533489227295, "learning_rate": 1.4438037139098164e-05, "loss": 0.07, "step": 118975 }, { "epoch": 1.752551508814303, "grad_norm": 1.2711902856826782, "learning_rate": 1.443640076713118e-05, "loss": 0.0678, "step": 119000 }, { "epoch": 1.7529196919043901, "grad_norm": 1.6635637283325195, "learning_rate": 1.4434764395164196e-05, "loss": 0.0715, "step": 119025 }, { "epoch": 1.7532878749944771, "grad_norm": 1.0828711986541748, "learning_rate": 1.443312802319721e-05, "loss": 0.0668, "step": 119050 }, { "epoch": 1.7536560580845642, "grad_norm": 0.9765989780426025, "learning_rate": 1.4431491651230225e-05, "loss": 0.0689, "step": 119075 }, { "epoch": 1.7540242411746512, "grad_norm": 1.7505444288253784, "learning_rate": 1.4429855279263242e-05, "loss": 0.0739, "step": 119100 }, { "epoch": 1.7543924242647384, "grad_norm": 1.2258647680282593, "learning_rate": 1.4428218907296256e-05, "loss": 0.0664, "step": 119125 }, { "epoch": 1.7547606073548254, "grad_norm": 1.4681923389434814, "learning_rate": 1.4426582535329272e-05, "loss": 0.0723, "step": 119150 }, { "epoch": 1.7551287904449124, "grad_norm": 1.11790931224823, "learning_rate": 1.4425011618240968e-05, "loss": 0.0716, "step": 119175 }, { "epoch": 1.7554969735349994, "grad_norm": 1.5166959762573242, "learning_rate": 1.4423375246273982e-05, "loss": 0.072, "step": 119200 }, { "epoch": 1.7558651566250867, "grad_norm": 1.3520179986953735, "learning_rate": 1.4421738874306998e-05, "loss": 0.073, "step": 119225 }, { "epoch": 1.7562333397151737, "grad_norm": 1.4081478118896484, "learning_rate": 1.4420102502340013e-05, "loss": 0.0683, "step": 119250 }, { "epoch": 1.7566015228052607, "grad_norm": 1.392702579498291, "learning_rate": 1.4418466130373027e-05, "loss": 0.0705, "step": 119275 }, { "epoch": 1.7569697058953477, "grad_norm": 0.953191339969635, "learning_rate": 1.4416829758406043e-05, "loss": 0.0694, "step": 119300 }, { "epoch": 1.7573378889854347, "grad_norm": 1.3729522228240967, "learning_rate": 1.441519338643906e-05, "loss": 0.0788, "step": 119325 }, { "epoch": 1.7577060720755218, "grad_norm": 2.266904592514038, "learning_rate": 1.4413557014472076e-05, "loss": 0.0714, "step": 119350 }, { "epoch": 1.7580742551656088, "grad_norm": 1.3887821435928345, "learning_rate": 1.441192064250509e-05, "loss": 0.0694, "step": 119375 }, { "epoch": 1.7584424382556958, "grad_norm": 1.1650502681732178, "learning_rate": 1.4410284270538106e-05, "loss": 0.0687, "step": 119400 }, { "epoch": 1.7588106213457828, "grad_norm": 1.1449217796325684, "learning_rate": 1.4408647898571122e-05, "loss": 0.0652, "step": 119425 }, { "epoch": 1.7591788044358698, "grad_norm": 1.301220417022705, "learning_rate": 1.4407011526604137e-05, "loss": 0.072, "step": 119450 }, { "epoch": 1.7595469875259568, "grad_norm": 1.4718124866485596, "learning_rate": 1.4405375154637151e-05, "loss": 0.0722, "step": 119475 }, { "epoch": 1.7599151706160439, "grad_norm": 1.2857835292816162, "learning_rate": 1.4403738782670168e-05, "loss": 0.0708, "step": 119500 }, { "epoch": 1.7602833537061309, "grad_norm": 1.3989564180374146, "learning_rate": 1.4402102410703182e-05, "loss": 0.0667, "step": 119525 }, { "epoch": 1.7606515367962179, "grad_norm": 1.7022169828414917, "learning_rate": 1.4400466038736198e-05, "loss": 0.07, "step": 119550 }, { "epoch": 1.761019719886305, "grad_norm": 1.1868810653686523, "learning_rate": 1.4398829666769214e-05, "loss": 0.0639, "step": 119575 }, { "epoch": 1.7613879029763921, "grad_norm": 1.8378244638442993, "learning_rate": 1.439719329480223e-05, "loss": 0.0797, "step": 119600 }, { "epoch": 1.7617560860664792, "grad_norm": 1.4175103902816772, "learning_rate": 1.4395556922835245e-05, "loss": 0.0663, "step": 119625 }, { "epoch": 1.7621242691565662, "grad_norm": 1.5685625076293945, "learning_rate": 1.4393920550868261e-05, "loss": 0.0708, "step": 119650 }, { "epoch": 1.7624924522466532, "grad_norm": 1.862176537513733, "learning_rate": 1.4392284178901274e-05, "loss": 0.0652, "step": 119675 }, { "epoch": 1.7628606353367402, "grad_norm": 1.4839879274368286, "learning_rate": 1.439064780693429e-05, "loss": 0.0619, "step": 119700 }, { "epoch": 1.7632288184268274, "grad_norm": 1.0859131813049316, "learning_rate": 1.4389011434967306e-05, "loss": 0.0723, "step": 119725 }, { "epoch": 1.7635970015169145, "grad_norm": 1.548510193824768, "learning_rate": 1.4387375063000322e-05, "loss": 0.0731, "step": 119750 }, { "epoch": 1.7639651846070015, "grad_norm": 1.6976302862167358, "learning_rate": 1.4385738691033337e-05, "loss": 0.0756, "step": 119775 }, { "epoch": 1.7643333676970885, "grad_norm": 1.256464958190918, "learning_rate": 1.4384102319066353e-05, "loss": 0.0765, "step": 119800 }, { "epoch": 1.7647015507871755, "grad_norm": 1.97874915599823, "learning_rate": 1.4382465947099369e-05, "loss": 0.0729, "step": 119825 }, { "epoch": 1.7650697338772625, "grad_norm": 1.1645824909210205, "learning_rate": 1.4380829575132385e-05, "loss": 0.0745, "step": 119850 }, { "epoch": 1.7654379169673495, "grad_norm": 1.3096339702606201, "learning_rate": 1.4379193203165398e-05, "loss": 0.0643, "step": 119875 }, { "epoch": 1.7658061000574365, "grad_norm": 1.7846611738204956, "learning_rate": 1.4377556831198414e-05, "loss": 0.0763, "step": 119900 }, { "epoch": 1.7661742831475236, "grad_norm": 1.5958901643753052, "learning_rate": 1.437592045923143e-05, "loss": 0.0704, "step": 119925 }, { "epoch": 1.7665424662376106, "grad_norm": 1.6560559272766113, "learning_rate": 1.4374284087264445e-05, "loss": 0.075, "step": 119950 }, { "epoch": 1.7669106493276976, "grad_norm": 1.3931108713150024, "learning_rate": 1.437264771529746e-05, "loss": 0.0731, "step": 119975 }, { "epoch": 1.7672788324177846, "grad_norm": 1.3844809532165527, "learning_rate": 1.4371011343330477e-05, "loss": 0.0691, "step": 120000 }, { "epoch": 1.7676470155078716, "grad_norm": 1.3142585754394531, "learning_rate": 1.4369374971363491e-05, "loss": 0.0747, "step": 120025 }, { "epoch": 1.7680151985979586, "grad_norm": 1.5986624956130981, "learning_rate": 1.4367738599396508e-05, "loss": 0.0675, "step": 120050 }, { "epoch": 1.7683833816880459, "grad_norm": 1.2626944780349731, "learning_rate": 1.4366102227429524e-05, "loss": 0.0668, "step": 120075 }, { "epoch": 1.768751564778133, "grad_norm": 1.6989572048187256, "learning_rate": 1.4364465855462537e-05, "loss": 0.0759, "step": 120100 }, { "epoch": 1.76911974786822, "grad_norm": 0.8778809905052185, "learning_rate": 1.4362829483495553e-05, "loss": 0.0671, "step": 120125 }, { "epoch": 1.769487930958307, "grad_norm": 1.777380347251892, "learning_rate": 1.4361193111528569e-05, "loss": 0.0738, "step": 120150 }, { "epoch": 1.769856114048394, "grad_norm": 1.3435474634170532, "learning_rate": 1.4359556739561585e-05, "loss": 0.0719, "step": 120175 }, { "epoch": 1.7702242971384812, "grad_norm": 1.1195390224456787, "learning_rate": 1.43579203675946e-05, "loss": 0.0624, "step": 120200 }, { "epoch": 1.7705924802285682, "grad_norm": 1.039779782295227, "learning_rate": 1.4356283995627616e-05, "loss": 0.0708, "step": 120225 }, { "epoch": 1.7709606633186552, "grad_norm": 1.2294856309890747, "learning_rate": 1.4354647623660632e-05, "loss": 0.0651, "step": 120250 }, { "epoch": 1.7713288464087422, "grad_norm": 1.1152254343032837, "learning_rate": 1.4353011251693648e-05, "loss": 0.0607, "step": 120275 }, { "epoch": 1.7716970294988292, "grad_norm": 1.0624229907989502, "learning_rate": 1.435137487972666e-05, "loss": 0.0706, "step": 120300 }, { "epoch": 1.7720652125889162, "grad_norm": 1.59226655960083, "learning_rate": 1.4349738507759677e-05, "loss": 0.079, "step": 120325 }, { "epoch": 1.7724333956790033, "grad_norm": 1.6993451118469238, "learning_rate": 1.4348102135792691e-05, "loss": 0.0749, "step": 120350 }, { "epoch": 1.7728015787690903, "grad_norm": 0.9813623428344727, "learning_rate": 1.4346465763825707e-05, "loss": 0.0642, "step": 120375 }, { "epoch": 1.7731697618591773, "grad_norm": 0.764958918094635, "learning_rate": 1.4344829391858724e-05, "loss": 0.069, "step": 120400 }, { "epoch": 1.7735379449492643, "grad_norm": 1.2145322561264038, "learning_rate": 1.434319301989174e-05, "loss": 0.0653, "step": 120425 }, { "epoch": 1.7739061280393513, "grad_norm": 1.635727047920227, "learning_rate": 1.4341556647924754e-05, "loss": 0.0751, "step": 120450 }, { "epoch": 1.7742743111294383, "grad_norm": 1.4147942066192627, "learning_rate": 1.433992027595777e-05, "loss": 0.0629, "step": 120475 }, { "epoch": 1.7746424942195254, "grad_norm": 1.3779675960540771, "learning_rate": 1.4338283903990786e-05, "loss": 0.073, "step": 120500 }, { "epoch": 1.7750106773096124, "grad_norm": 1.3990569114685059, "learning_rate": 1.43366475320238e-05, "loss": 0.0698, "step": 120525 }, { "epoch": 1.7753788603996996, "grad_norm": 1.5038468837738037, "learning_rate": 1.4335011160056815e-05, "loss": 0.0607, "step": 120550 }, { "epoch": 1.7757470434897866, "grad_norm": 1.332115650177002, "learning_rate": 1.4333374788089832e-05, "loss": 0.0636, "step": 120575 }, { "epoch": 1.7761152265798736, "grad_norm": 1.2773898839950562, "learning_rate": 1.4331738416122846e-05, "loss": 0.0738, "step": 120600 }, { "epoch": 1.7764834096699607, "grad_norm": 1.379075288772583, "learning_rate": 1.4330102044155862e-05, "loss": 0.068, "step": 120625 }, { "epoch": 1.7768515927600477, "grad_norm": 1.3863091468811035, "learning_rate": 1.4328465672188878e-05, "loss": 0.0692, "step": 120650 }, { "epoch": 1.777219775850135, "grad_norm": 1.0513231754302979, "learning_rate": 1.4326829300221894e-05, "loss": 0.0651, "step": 120675 }, { "epoch": 1.777587958940222, "grad_norm": 1.1663014888763428, "learning_rate": 1.4325192928254909e-05, "loss": 0.0747, "step": 120700 }, { "epoch": 1.777956142030309, "grad_norm": 1.2848808765411377, "learning_rate": 1.4323556556287923e-05, "loss": 0.0664, "step": 120725 }, { "epoch": 1.778324325120396, "grad_norm": 1.8034509420394897, "learning_rate": 1.432192018432094e-05, "loss": 0.0698, "step": 120750 }, { "epoch": 1.778692508210483, "grad_norm": 1.3875097036361694, "learning_rate": 1.4320283812353954e-05, "loss": 0.0751, "step": 120775 }, { "epoch": 1.77906069130057, "grad_norm": 1.251055121421814, "learning_rate": 1.431864744038697e-05, "loss": 0.0647, "step": 120800 }, { "epoch": 1.779428874390657, "grad_norm": 1.7127313613891602, "learning_rate": 1.4317011068419986e-05, "loss": 0.0664, "step": 120825 }, { "epoch": 1.779797057480744, "grad_norm": 1.4669241905212402, "learning_rate": 1.4315374696453002e-05, "loss": 0.0669, "step": 120850 }, { "epoch": 1.780165240570831, "grad_norm": 1.3870619535446167, "learning_rate": 1.4313738324486017e-05, "loss": 0.0677, "step": 120875 }, { "epoch": 1.780533423660918, "grad_norm": 1.8698972463607788, "learning_rate": 1.4312101952519033e-05, "loss": 0.0703, "step": 120900 }, { "epoch": 1.780901606751005, "grad_norm": 1.6329165697097778, "learning_rate": 1.431046558055205e-05, "loss": 0.0694, "step": 120925 }, { "epoch": 1.781269789841092, "grad_norm": 1.3736357688903809, "learning_rate": 1.4308829208585062e-05, "loss": 0.0725, "step": 120950 }, { "epoch": 1.781637972931179, "grad_norm": 1.3963435888290405, "learning_rate": 1.4307192836618078e-05, "loss": 0.0646, "step": 120975 }, { "epoch": 1.782006156021266, "grad_norm": 1.7232320308685303, "learning_rate": 1.4305556464651094e-05, "loss": 0.0795, "step": 121000 }, { "epoch": 1.7823743391113533, "grad_norm": 1.8137731552124023, "learning_rate": 1.4303920092684109e-05, "loss": 0.0737, "step": 121025 }, { "epoch": 1.7827425222014404, "grad_norm": 1.3642964363098145, "learning_rate": 1.4302283720717125e-05, "loss": 0.0682, "step": 121050 }, { "epoch": 1.7831107052915274, "grad_norm": 0.9713894724845886, "learning_rate": 1.4300647348750141e-05, "loss": 0.0714, "step": 121075 }, { "epoch": 1.7834788883816144, "grad_norm": 1.5277451276779175, "learning_rate": 1.4299010976783157e-05, "loss": 0.078, "step": 121100 }, { "epoch": 1.7838470714717014, "grad_norm": 1.3738890886306763, "learning_rate": 1.4297374604816172e-05, "loss": 0.0611, "step": 121125 }, { "epoch": 1.7842152545617886, "grad_norm": 1.4161064624786377, "learning_rate": 1.4295738232849186e-05, "loss": 0.0677, "step": 121150 }, { "epoch": 1.7845834376518757, "grad_norm": 1.171356439590454, "learning_rate": 1.42941018608822e-05, "loss": 0.0709, "step": 121175 }, { "epoch": 1.7849516207419627, "grad_norm": 1.4238909482955933, "learning_rate": 1.4292465488915217e-05, "loss": 0.0718, "step": 121200 }, { "epoch": 1.7853198038320497, "grad_norm": 1.3162243366241455, "learning_rate": 1.4290829116948233e-05, "loss": 0.0708, "step": 121225 }, { "epoch": 1.7856879869221367, "grad_norm": 1.1067510843276978, "learning_rate": 1.4289192744981249e-05, "loss": 0.0678, "step": 121250 }, { "epoch": 1.7860561700122237, "grad_norm": 1.2170255184173584, "learning_rate": 1.4287556373014263e-05, "loss": 0.061, "step": 121275 }, { "epoch": 1.7864243531023107, "grad_norm": 1.4518414735794067, "learning_rate": 1.4285985455925959e-05, "loss": 0.0624, "step": 121300 }, { "epoch": 1.7867925361923978, "grad_norm": 0.9676971435546875, "learning_rate": 1.4284349083958975e-05, "loss": 0.065, "step": 121325 }, { "epoch": 1.7871607192824848, "grad_norm": 1.3099478483200073, "learning_rate": 1.4282712711991988e-05, "loss": 0.0724, "step": 121350 }, { "epoch": 1.7875289023725718, "grad_norm": 1.7666794061660767, "learning_rate": 1.4281076340025004e-05, "loss": 0.0663, "step": 121375 }, { "epoch": 1.7878970854626588, "grad_norm": 1.442685842514038, "learning_rate": 1.427943996805802e-05, "loss": 0.0676, "step": 121400 }, { "epoch": 1.7882652685527458, "grad_norm": 1.247192144393921, "learning_rate": 1.4277803596091035e-05, "loss": 0.0732, "step": 121425 }, { "epoch": 1.7886334516428328, "grad_norm": 1.809297800064087, "learning_rate": 1.427616722412405e-05, "loss": 0.066, "step": 121450 }, { "epoch": 1.7890016347329198, "grad_norm": 1.4799872636795044, "learning_rate": 1.4274530852157067e-05, "loss": 0.0696, "step": 121475 }, { "epoch": 1.7893698178230069, "grad_norm": 1.9324668645858765, "learning_rate": 1.4272894480190083e-05, "loss": 0.0705, "step": 121500 }, { "epoch": 1.789738000913094, "grad_norm": 1.2843706607818604, "learning_rate": 1.4271258108223098e-05, "loss": 0.0634, "step": 121525 }, { "epoch": 1.7901061840031811, "grad_norm": 1.338454246520996, "learning_rate": 1.4269621736256114e-05, "loss": 0.0653, "step": 121550 }, { "epoch": 1.7904743670932681, "grad_norm": 1.2111071348190308, "learning_rate": 1.4267985364289128e-05, "loss": 0.0634, "step": 121575 }, { "epoch": 1.7908425501833551, "grad_norm": 1.413698673248291, "learning_rate": 1.4266348992322143e-05, "loss": 0.0671, "step": 121600 }, { "epoch": 1.7912107332734424, "grad_norm": 1.3297215700149536, "learning_rate": 1.4264712620355159e-05, "loss": 0.0709, "step": 121625 }, { "epoch": 1.7915789163635294, "grad_norm": 1.3247640132904053, "learning_rate": 1.4263076248388175e-05, "loss": 0.0727, "step": 121650 }, { "epoch": 1.7919470994536164, "grad_norm": 1.161634922027588, "learning_rate": 1.4261439876421191e-05, "loss": 0.0749, "step": 121675 }, { "epoch": 1.7923152825437034, "grad_norm": 1.5505574941635132, "learning_rate": 1.4259803504454206e-05, "loss": 0.0656, "step": 121700 }, { "epoch": 1.7926834656337904, "grad_norm": 1.5611209869384766, "learning_rate": 1.4258167132487222e-05, "loss": 0.07, "step": 121725 }, { "epoch": 1.7930516487238775, "grad_norm": 2.1446967124938965, "learning_rate": 1.4256530760520238e-05, "loss": 0.0685, "step": 121750 }, { "epoch": 1.7934198318139645, "grad_norm": 1.0909874439239502, "learning_rate": 1.425489438855325e-05, "loss": 0.0715, "step": 121775 }, { "epoch": 1.7937880149040515, "grad_norm": 1.20832097530365, "learning_rate": 1.4253258016586267e-05, "loss": 0.0648, "step": 121800 }, { "epoch": 1.7941561979941385, "grad_norm": 1.3289990425109863, "learning_rate": 1.4251621644619283e-05, "loss": 0.0722, "step": 121825 }, { "epoch": 1.7945243810842255, "grad_norm": 1.5881668329238892, "learning_rate": 1.4249985272652297e-05, "loss": 0.0658, "step": 121850 }, { "epoch": 1.7948925641743125, "grad_norm": 1.1237159967422485, "learning_rate": 1.4248348900685314e-05, "loss": 0.0608, "step": 121875 }, { "epoch": 1.7952607472643995, "grad_norm": 1.7560815811157227, "learning_rate": 1.424671252871833e-05, "loss": 0.0699, "step": 121900 }, { "epoch": 1.7956289303544866, "grad_norm": 1.1331946849822998, "learning_rate": 1.4245076156751346e-05, "loss": 0.066, "step": 121925 }, { "epoch": 1.7959971134445736, "grad_norm": 1.2591310739517212, "learning_rate": 1.424343978478436e-05, "loss": 0.0627, "step": 121950 }, { "epoch": 1.7963652965346606, "grad_norm": 1.442726969718933, "learning_rate": 1.4241803412817376e-05, "loss": 0.0666, "step": 121975 }, { "epoch": 1.7967334796247478, "grad_norm": 1.6420409679412842, "learning_rate": 1.424016704085039e-05, "loss": 0.0703, "step": 122000 }, { "epoch": 1.7971016627148348, "grad_norm": 1.8024204969406128, "learning_rate": 1.4238530668883405e-05, "loss": 0.0672, "step": 122025 }, { "epoch": 1.7974698458049219, "grad_norm": 1.4262099266052246, "learning_rate": 1.4236894296916422e-05, "loss": 0.0705, "step": 122050 }, { "epoch": 1.7978380288950089, "grad_norm": 1.2743639945983887, "learning_rate": 1.4235257924949438e-05, "loss": 0.0667, "step": 122075 }, { "epoch": 1.7982062119850961, "grad_norm": 1.9641082286834717, "learning_rate": 1.4233621552982452e-05, "loss": 0.074, "step": 122100 }, { "epoch": 1.7985743950751831, "grad_norm": 1.8725675344467163, "learning_rate": 1.4231985181015468e-05, "loss": 0.0637, "step": 122125 }, { "epoch": 1.7989425781652701, "grad_norm": 1.1382743120193481, "learning_rate": 1.4230348809048484e-05, "loss": 0.0645, "step": 122150 }, { "epoch": 1.7993107612553572, "grad_norm": 1.1396962404251099, "learning_rate": 1.42287124370815e-05, "loss": 0.0618, "step": 122175 }, { "epoch": 1.7996789443454442, "grad_norm": 1.1252176761627197, "learning_rate": 1.4227076065114513e-05, "loss": 0.0707, "step": 122200 }, { "epoch": 1.8000471274355312, "grad_norm": 1.9477506875991821, "learning_rate": 1.422543969314753e-05, "loss": 0.0711, "step": 122225 }, { "epoch": 1.8004153105256182, "grad_norm": 1.0825227499008179, "learning_rate": 1.4223803321180546e-05, "loss": 0.0657, "step": 122250 }, { "epoch": 1.8007834936157052, "grad_norm": 1.5377202033996582, "learning_rate": 1.422216694921356e-05, "loss": 0.0706, "step": 122275 }, { "epoch": 1.8011516767057922, "grad_norm": 1.994928240776062, "learning_rate": 1.4220530577246576e-05, "loss": 0.0716, "step": 122300 }, { "epoch": 1.8015198597958793, "grad_norm": 1.5081052780151367, "learning_rate": 1.4218894205279592e-05, "loss": 0.064, "step": 122325 }, { "epoch": 1.8018880428859663, "grad_norm": 1.1986535787582397, "learning_rate": 1.4217257833312607e-05, "loss": 0.0633, "step": 122350 }, { "epoch": 1.8022562259760533, "grad_norm": 1.4311184883117676, "learning_rate": 1.4215621461345623e-05, "loss": 0.0764, "step": 122375 }, { "epoch": 1.8026244090661403, "grad_norm": 1.7433348894119263, "learning_rate": 1.421398508937864e-05, "loss": 0.0659, "step": 122400 }, { "epoch": 1.8029925921562273, "grad_norm": 1.5358247756958008, "learning_rate": 1.4212348717411652e-05, "loss": 0.0674, "step": 122425 }, { "epoch": 1.8033607752463143, "grad_norm": 1.254683494567871, "learning_rate": 1.4210712345444668e-05, "loss": 0.0691, "step": 122450 }, { "epoch": 1.8037289583364016, "grad_norm": 1.3888503313064575, "learning_rate": 1.4209075973477684e-05, "loss": 0.0689, "step": 122475 }, { "epoch": 1.8040971414264886, "grad_norm": 1.1659692525863647, "learning_rate": 1.42074396015107e-05, "loss": 0.0709, "step": 122500 }, { "epoch": 1.8044653245165756, "grad_norm": 1.6324865818023682, "learning_rate": 1.4205803229543715e-05, "loss": 0.0679, "step": 122525 }, { "epoch": 1.8048335076066626, "grad_norm": 1.0429387092590332, "learning_rate": 1.4204166857576731e-05, "loss": 0.0698, "step": 122550 }, { "epoch": 1.8052016906967496, "grad_norm": 1.061618447303772, "learning_rate": 1.4202530485609747e-05, "loss": 0.0685, "step": 122575 }, { "epoch": 1.8055698737868369, "grad_norm": 1.6197209358215332, "learning_rate": 1.4200894113642763e-05, "loss": 0.0708, "step": 122600 }, { "epoch": 1.8059380568769239, "grad_norm": 1.4687283039093018, "learning_rate": 1.4199257741675776e-05, "loss": 0.0686, "step": 122625 }, { "epoch": 1.806306239967011, "grad_norm": 1.6074970960617065, "learning_rate": 1.4197621369708792e-05, "loss": 0.0662, "step": 122650 }, { "epoch": 1.806674423057098, "grad_norm": 0.8636961579322815, "learning_rate": 1.4195984997741807e-05, "loss": 0.0619, "step": 122675 }, { "epoch": 1.807042606147185, "grad_norm": 1.3781434297561646, "learning_rate": 1.4194348625774823e-05, "loss": 0.069, "step": 122700 }, { "epoch": 1.807410789237272, "grad_norm": 1.4002094268798828, "learning_rate": 1.4192712253807839e-05, "loss": 0.0731, "step": 122725 }, { "epoch": 1.807778972327359, "grad_norm": 1.388077974319458, "learning_rate": 1.4191075881840855e-05, "loss": 0.0762, "step": 122750 }, { "epoch": 1.808147155417446, "grad_norm": 1.3945006132125854, "learning_rate": 1.418943950987387e-05, "loss": 0.0688, "step": 122775 }, { "epoch": 1.808515338507533, "grad_norm": 1.5331770181655884, "learning_rate": 1.4187803137906886e-05, "loss": 0.0731, "step": 122800 }, { "epoch": 1.80888352159762, "grad_norm": 1.9857841730117798, "learning_rate": 1.41861667659399e-05, "loss": 0.0738, "step": 122825 }, { "epoch": 1.809251704687707, "grad_norm": 1.7090383768081665, "learning_rate": 1.4184530393972915e-05, "loss": 0.0695, "step": 122850 }, { "epoch": 1.809619887777794, "grad_norm": 1.6358251571655273, "learning_rate": 1.418289402200593e-05, "loss": 0.0678, "step": 122875 }, { "epoch": 1.809988070867881, "grad_norm": 1.7000668048858643, "learning_rate": 1.4181257650038947e-05, "loss": 0.0725, "step": 122900 }, { "epoch": 1.810356253957968, "grad_norm": 1.3275409936904907, "learning_rate": 1.4179621278071961e-05, "loss": 0.0646, "step": 122925 }, { "epoch": 1.8107244370480553, "grad_norm": 2.237173080444336, "learning_rate": 1.4177984906104978e-05, "loss": 0.0697, "step": 122950 }, { "epoch": 1.8110926201381423, "grad_norm": 1.398925542831421, "learning_rate": 1.4176348534137994e-05, "loss": 0.0609, "step": 122975 }, { "epoch": 1.8114608032282293, "grad_norm": 1.2925307750701904, "learning_rate": 1.417471216217101e-05, "loss": 0.071, "step": 123000 }, { "epoch": 1.8118289863183163, "grad_norm": 1.352026104927063, "learning_rate": 1.4173075790204024e-05, "loss": 0.0663, "step": 123025 }, { "epoch": 1.8121971694084034, "grad_norm": 1.6469991207122803, "learning_rate": 1.4171439418237039e-05, "loss": 0.0696, "step": 123050 }, { "epoch": 1.8125653524984906, "grad_norm": 1.35920250415802, "learning_rate": 1.4169803046270055e-05, "loss": 0.0667, "step": 123075 }, { "epoch": 1.8129335355885776, "grad_norm": 0.9462726712226868, "learning_rate": 1.416816667430307e-05, "loss": 0.0729, "step": 123100 }, { "epoch": 1.8133017186786646, "grad_norm": 0.9434695839881897, "learning_rate": 1.4166530302336086e-05, "loss": 0.0644, "step": 123125 }, { "epoch": 1.8136699017687516, "grad_norm": 1.3169819116592407, "learning_rate": 1.4164893930369102e-05, "loss": 0.0684, "step": 123150 }, { "epoch": 1.8140380848588387, "grad_norm": 1.188008427619934, "learning_rate": 1.4163257558402118e-05, "loss": 0.0611, "step": 123175 }, { "epoch": 1.8144062679489257, "grad_norm": 1.1293237209320068, "learning_rate": 1.4161621186435132e-05, "loss": 0.0784, "step": 123200 }, { "epoch": 1.8147744510390127, "grad_norm": 1.3106647729873657, "learning_rate": 1.4159984814468148e-05, "loss": 0.0684, "step": 123225 }, { "epoch": 1.8151426341290997, "grad_norm": 1.7924137115478516, "learning_rate": 1.4158348442501161e-05, "loss": 0.0656, "step": 123250 }, { "epoch": 1.8155108172191867, "grad_norm": 1.1363129615783691, "learning_rate": 1.4156712070534177e-05, "loss": 0.0598, "step": 123275 }, { "epoch": 1.8158790003092737, "grad_norm": 1.4987635612487793, "learning_rate": 1.4155075698567194e-05, "loss": 0.072, "step": 123300 }, { "epoch": 1.8162471833993608, "grad_norm": 1.4073323011398315, "learning_rate": 1.415343932660021e-05, "loss": 0.07, "step": 123325 }, { "epoch": 1.8166153664894478, "grad_norm": 1.0494939088821411, "learning_rate": 1.4151802954633224e-05, "loss": 0.0644, "step": 123350 }, { "epoch": 1.8169835495795348, "grad_norm": 1.3889259099960327, "learning_rate": 1.415016658266624e-05, "loss": 0.0752, "step": 123375 }, { "epoch": 1.8173517326696218, "grad_norm": 1.484281301498413, "learning_rate": 1.4148530210699256e-05, "loss": 0.0692, "step": 123400 }, { "epoch": 1.817719915759709, "grad_norm": 2.207677125930786, "learning_rate": 1.4146893838732273e-05, "loss": 0.0708, "step": 123425 }, { "epoch": 1.818088098849796, "grad_norm": 1.457485556602478, "learning_rate": 1.4145257466765287e-05, "loss": 0.0673, "step": 123450 }, { "epoch": 1.818456281939883, "grad_norm": 1.2040473222732544, "learning_rate": 1.4143621094798302e-05, "loss": 0.0661, "step": 123475 }, { "epoch": 1.81882446502997, "grad_norm": 1.5974911451339722, "learning_rate": 1.4141984722831316e-05, "loss": 0.0651, "step": 123500 }, { "epoch": 1.819192648120057, "grad_norm": 1.0894943475723267, "learning_rate": 1.4140348350864332e-05, "loss": 0.0646, "step": 123525 }, { "epoch": 1.8195608312101443, "grad_norm": 1.3080356121063232, "learning_rate": 1.4138711978897348e-05, "loss": 0.0678, "step": 123550 }, { "epoch": 1.8199290143002314, "grad_norm": 0.965703010559082, "learning_rate": 1.4137075606930364e-05, "loss": 0.0705, "step": 123575 }, { "epoch": 1.8202971973903184, "grad_norm": 2.154559373855591, "learning_rate": 1.4135439234963379e-05, "loss": 0.0693, "step": 123600 }, { "epoch": 1.8206653804804054, "grad_norm": 1.5285266637802124, "learning_rate": 1.4133802862996395e-05, "loss": 0.0654, "step": 123625 }, { "epoch": 1.8210335635704924, "grad_norm": 1.1775543689727783, "learning_rate": 1.4132166491029411e-05, "loss": 0.0665, "step": 123650 }, { "epoch": 1.8214017466605794, "grad_norm": 1.6471136808395386, "learning_rate": 1.4130530119062424e-05, "loss": 0.0704, "step": 123675 }, { "epoch": 1.8217699297506664, "grad_norm": 1.290858507156372, "learning_rate": 1.412889374709544e-05, "loss": 0.0748, "step": 123700 }, { "epoch": 1.8221381128407534, "grad_norm": 1.5764347314834595, "learning_rate": 1.4127257375128456e-05, "loss": 0.0652, "step": 123725 }, { "epoch": 1.8225062959308405, "grad_norm": 1.1291478872299194, "learning_rate": 1.4125621003161472e-05, "loss": 0.0592, "step": 123750 }, { "epoch": 1.8228744790209275, "grad_norm": 1.6106886863708496, "learning_rate": 1.4123984631194487e-05, "loss": 0.0659, "step": 123775 }, { "epoch": 1.8232426621110145, "grad_norm": 1.2678749561309814, "learning_rate": 1.4122348259227503e-05, "loss": 0.0706, "step": 123800 }, { "epoch": 1.8236108452011015, "grad_norm": 1.6881310939788818, "learning_rate": 1.412071188726052e-05, "loss": 0.0679, "step": 123825 }, { "epoch": 1.8239790282911885, "grad_norm": 1.330228567123413, "learning_rate": 1.4119075515293534e-05, "loss": 0.0684, "step": 123850 }, { "epoch": 1.8243472113812755, "grad_norm": 1.768144965171814, "learning_rate": 1.411743914332655e-05, "loss": 0.075, "step": 123875 }, { "epoch": 1.8247153944713628, "grad_norm": 1.3235609531402588, "learning_rate": 1.4115802771359564e-05, "loss": 0.0689, "step": 123900 }, { "epoch": 1.8250835775614498, "grad_norm": 1.2848559617996216, "learning_rate": 1.4114166399392579e-05, "loss": 0.0611, "step": 123925 }, { "epoch": 1.8254517606515368, "grad_norm": 1.4224627017974854, "learning_rate": 1.4112530027425595e-05, "loss": 0.0695, "step": 123950 }, { "epoch": 1.8258199437416238, "grad_norm": 1.7034728527069092, "learning_rate": 1.4110893655458611e-05, "loss": 0.0637, "step": 123975 }, { "epoch": 1.8261881268317108, "grad_norm": 1.4163336753845215, "learning_rate": 1.4109257283491627e-05, "loss": 0.0641, "step": 124000 }, { "epoch": 1.826556309921798, "grad_norm": 1.9863816499710083, "learning_rate": 1.4107686366403321e-05, "loss": 0.0701, "step": 124025 }, { "epoch": 1.826924493011885, "grad_norm": 1.6415189504623413, "learning_rate": 1.4106049994436337e-05, "loss": 0.0602, "step": 124050 }, { "epoch": 1.827292676101972, "grad_norm": 1.7405691146850586, "learning_rate": 1.4104413622469353e-05, "loss": 0.0651, "step": 124075 }, { "epoch": 1.8276608591920591, "grad_norm": 1.368970513343811, "learning_rate": 1.4102777250502366e-05, "loss": 0.0716, "step": 124100 }, { "epoch": 1.8280290422821461, "grad_norm": 1.4880112409591675, "learning_rate": 1.4101140878535382e-05, "loss": 0.0701, "step": 124125 }, { "epoch": 1.8283972253722331, "grad_norm": 1.0609428882598877, "learning_rate": 1.4099504506568398e-05, "loss": 0.0637, "step": 124150 }, { "epoch": 1.8287654084623202, "grad_norm": 1.6436680555343628, "learning_rate": 1.4097868134601413e-05, "loss": 0.0729, "step": 124175 }, { "epoch": 1.8291335915524072, "grad_norm": 1.143778681755066, "learning_rate": 1.4096231762634429e-05, "loss": 0.0672, "step": 124200 }, { "epoch": 1.8295017746424942, "grad_norm": 1.2172858715057373, "learning_rate": 1.4094595390667445e-05, "loss": 0.0722, "step": 124225 }, { "epoch": 1.8298699577325812, "grad_norm": 2.0340380668640137, "learning_rate": 1.4092959018700461e-05, "loss": 0.0754, "step": 124250 }, { "epoch": 1.8302381408226682, "grad_norm": 1.838102102279663, "learning_rate": 1.4091322646733476e-05, "loss": 0.0645, "step": 124275 }, { "epoch": 1.8306063239127552, "grad_norm": 1.410643219947815, "learning_rate": 1.4089686274766492e-05, "loss": 0.0702, "step": 124300 }, { "epoch": 1.8309745070028423, "grad_norm": 1.8687243461608887, "learning_rate": 1.4088049902799505e-05, "loss": 0.0691, "step": 124325 }, { "epoch": 1.8313426900929293, "grad_norm": 1.682615876197815, "learning_rate": 1.408641353083252e-05, "loss": 0.0671, "step": 124350 }, { "epoch": 1.8317108731830163, "grad_norm": 1.1685667037963867, "learning_rate": 1.4084777158865537e-05, "loss": 0.0669, "step": 124375 }, { "epoch": 1.8320790562731035, "grad_norm": 1.583894968032837, "learning_rate": 1.4083140786898553e-05, "loss": 0.0738, "step": 124400 }, { "epoch": 1.8324472393631905, "grad_norm": 1.4164410829544067, "learning_rate": 1.4081504414931568e-05, "loss": 0.0656, "step": 124425 }, { "epoch": 1.8328154224532776, "grad_norm": 1.970167636871338, "learning_rate": 1.4079868042964584e-05, "loss": 0.0705, "step": 124450 }, { "epoch": 1.8331836055433646, "grad_norm": 1.6298333406448364, "learning_rate": 1.40782316709976e-05, "loss": 0.0649, "step": 124475 }, { "epoch": 1.8335517886334518, "grad_norm": 1.2371671199798584, "learning_rate": 1.4076595299030616e-05, "loss": 0.0682, "step": 124500 }, { "epoch": 1.8339199717235388, "grad_norm": 1.6401854753494263, "learning_rate": 1.4074958927063629e-05, "loss": 0.0671, "step": 124525 }, { "epoch": 1.8342881548136258, "grad_norm": 1.0946791172027588, "learning_rate": 1.4073322555096645e-05, "loss": 0.0711, "step": 124550 }, { "epoch": 1.8346563379037129, "grad_norm": 1.3830103874206543, "learning_rate": 1.4071686183129661e-05, "loss": 0.0687, "step": 124575 }, { "epoch": 1.8350245209937999, "grad_norm": 1.2596478462219238, "learning_rate": 1.4070049811162676e-05, "loss": 0.0656, "step": 124600 }, { "epoch": 1.8353927040838869, "grad_norm": 1.4882218837738037, "learning_rate": 1.4068413439195692e-05, "loss": 0.0742, "step": 124625 }, { "epoch": 1.835760887173974, "grad_norm": 1.4395406246185303, "learning_rate": 1.4066777067228708e-05, "loss": 0.0704, "step": 124650 }, { "epoch": 1.836129070264061, "grad_norm": 1.1731866598129272, "learning_rate": 1.4065140695261722e-05, "loss": 0.0609, "step": 124675 }, { "epoch": 1.836497253354148, "grad_norm": 1.500097393989563, "learning_rate": 1.4063504323294738e-05, "loss": 0.0716, "step": 124700 }, { "epoch": 1.836865436444235, "grad_norm": 1.510772943496704, "learning_rate": 1.4061867951327753e-05, "loss": 0.0669, "step": 124725 }, { "epoch": 1.837233619534322, "grad_norm": 1.2818152904510498, "learning_rate": 1.4060231579360767e-05, "loss": 0.0739, "step": 124750 }, { "epoch": 1.837601802624409, "grad_norm": 1.9229601621627808, "learning_rate": 1.4058595207393784e-05, "loss": 0.0651, "step": 124775 }, { "epoch": 1.837969985714496, "grad_norm": 1.207069754600525, "learning_rate": 1.40569588354268e-05, "loss": 0.0704, "step": 124800 }, { "epoch": 1.838338168804583, "grad_norm": 1.3579490184783936, "learning_rate": 1.4055322463459816e-05, "loss": 0.0679, "step": 124825 }, { "epoch": 1.83870635189467, "grad_norm": 1.5743180513381958, "learning_rate": 1.405368609149283e-05, "loss": 0.0741, "step": 124850 }, { "epoch": 1.8390745349847573, "grad_norm": 1.741626501083374, "learning_rate": 1.4052049719525846e-05, "loss": 0.0694, "step": 124875 }, { "epoch": 1.8394427180748443, "grad_norm": 1.5830440521240234, "learning_rate": 1.4050413347558863e-05, "loss": 0.073, "step": 124900 }, { "epoch": 1.8398109011649313, "grad_norm": 0.8269080519676208, "learning_rate": 1.4048776975591879e-05, "loss": 0.0737, "step": 124925 }, { "epoch": 1.8401790842550183, "grad_norm": 1.302711844444275, "learning_rate": 1.4047140603624892e-05, "loss": 0.0662, "step": 124950 }, { "epoch": 1.8405472673451053, "grad_norm": 1.016312599182129, "learning_rate": 1.4045504231657908e-05, "loss": 0.0672, "step": 124975 }, { "epoch": 1.8409154504351926, "grad_norm": 1.247109293937683, "learning_rate": 1.4043867859690922e-05, "loss": 0.0636, "step": 125000 }, { "epoch": 1.8412836335252796, "grad_norm": 1.4615647792816162, "learning_rate": 1.4042231487723938e-05, "loss": 0.0717, "step": 125025 }, { "epoch": 1.8416518166153666, "grad_norm": 1.698584794998169, "learning_rate": 1.4040595115756954e-05, "loss": 0.0705, "step": 125050 }, { "epoch": 1.8420199997054536, "grad_norm": 1.2643131017684937, "learning_rate": 1.403895874378997e-05, "loss": 0.0576, "step": 125075 }, { "epoch": 1.8423881827955406, "grad_norm": 1.2368474006652832, "learning_rate": 1.4037322371822985e-05, "loss": 0.0643, "step": 125100 }, { "epoch": 1.8427563658856276, "grad_norm": 1.357359766960144, "learning_rate": 1.4035685999856001e-05, "loss": 0.0695, "step": 125125 }, { "epoch": 1.8431245489757146, "grad_norm": 1.5304204225540161, "learning_rate": 1.4034049627889016e-05, "loss": 0.071, "step": 125150 }, { "epoch": 1.8434927320658017, "grad_norm": 1.029677152633667, "learning_rate": 1.403241325592203e-05, "loss": 0.0735, "step": 125175 }, { "epoch": 1.8438609151558887, "grad_norm": 1.4008866548538208, "learning_rate": 1.4030776883955046e-05, "loss": 0.0657, "step": 125200 }, { "epoch": 1.8442290982459757, "grad_norm": 1.617188811302185, "learning_rate": 1.4029140511988062e-05, "loss": 0.0643, "step": 125225 }, { "epoch": 1.8445972813360627, "grad_norm": 1.5113645792007446, "learning_rate": 1.4027504140021077e-05, "loss": 0.0752, "step": 125250 }, { "epoch": 1.8449654644261497, "grad_norm": 1.2097511291503906, "learning_rate": 1.4025867768054093e-05, "loss": 0.0636, "step": 125275 }, { "epoch": 1.8453336475162367, "grad_norm": 1.6903287172317505, "learning_rate": 1.402423139608711e-05, "loss": 0.0666, "step": 125300 }, { "epoch": 1.8457018306063238, "grad_norm": 0.9059404134750366, "learning_rate": 1.4022595024120125e-05, "loss": 0.0679, "step": 125325 }, { "epoch": 1.846070013696411, "grad_norm": 1.970717430114746, "learning_rate": 1.402095865215314e-05, "loss": 0.0665, "step": 125350 }, { "epoch": 1.846438196786498, "grad_norm": 1.2964277267456055, "learning_rate": 1.4019322280186154e-05, "loss": 0.0618, "step": 125375 }, { "epoch": 1.846806379876585, "grad_norm": 0.9654756188392639, "learning_rate": 1.401768590821917e-05, "loss": 0.0654, "step": 125400 }, { "epoch": 1.847174562966672, "grad_norm": 1.418273687362671, "learning_rate": 1.4016049536252185e-05, "loss": 0.0694, "step": 125425 }, { "epoch": 1.847542746056759, "grad_norm": 1.314102292060852, "learning_rate": 1.4014413164285201e-05, "loss": 0.0762, "step": 125450 }, { "epoch": 1.8479109291468463, "grad_norm": 1.6058028936386108, "learning_rate": 1.4012776792318217e-05, "loss": 0.0711, "step": 125475 }, { "epoch": 1.8482791122369333, "grad_norm": 1.3302124738693237, "learning_rate": 1.4011140420351233e-05, "loss": 0.063, "step": 125500 }, { "epoch": 1.8486472953270203, "grad_norm": 1.6998109817504883, "learning_rate": 1.4009504048384248e-05, "loss": 0.0647, "step": 125525 }, { "epoch": 1.8490154784171073, "grad_norm": 1.5907782316207886, "learning_rate": 1.4007867676417264e-05, "loss": 0.0713, "step": 125550 }, { "epoch": 1.8493836615071944, "grad_norm": 1.1828668117523193, "learning_rate": 1.4006231304450277e-05, "loss": 0.0769, "step": 125575 }, { "epoch": 1.8497518445972814, "grad_norm": 1.1502078771591187, "learning_rate": 1.4004594932483293e-05, "loss": 0.0664, "step": 125600 }, { "epoch": 1.8501200276873684, "grad_norm": 1.2194290161132812, "learning_rate": 1.4002958560516309e-05, "loss": 0.0619, "step": 125625 }, { "epoch": 1.8504882107774554, "grad_norm": 1.4610698223114014, "learning_rate": 1.4001322188549325e-05, "loss": 0.0669, "step": 125650 }, { "epoch": 1.8508563938675424, "grad_norm": 1.318021297454834, "learning_rate": 1.399968581658234e-05, "loss": 0.0697, "step": 125675 }, { "epoch": 1.8512245769576294, "grad_norm": 1.4886027574539185, "learning_rate": 1.3998049444615356e-05, "loss": 0.0767, "step": 125700 }, { "epoch": 1.8515927600477164, "grad_norm": 1.699396014213562, "learning_rate": 1.3996413072648372e-05, "loss": 0.0692, "step": 125725 }, { "epoch": 1.8519609431378035, "grad_norm": 1.5960372686386108, "learning_rate": 1.3994776700681388e-05, "loss": 0.0653, "step": 125750 }, { "epoch": 1.8523291262278905, "grad_norm": 1.6690467596054077, "learning_rate": 1.3993140328714402e-05, "loss": 0.0748, "step": 125775 }, { "epoch": 1.8526973093179775, "grad_norm": 1.5556851625442505, "learning_rate": 1.3991503956747417e-05, "loss": 0.0666, "step": 125800 }, { "epoch": 1.8530654924080647, "grad_norm": 1.1716454029083252, "learning_rate": 1.3989867584780431e-05, "loss": 0.068, "step": 125825 }, { "epoch": 1.8534336754981517, "grad_norm": 1.5424468517303467, "learning_rate": 1.3988231212813448e-05, "loss": 0.0664, "step": 125850 }, { "epoch": 1.8538018585882388, "grad_norm": 1.8190423250198364, "learning_rate": 1.3986594840846464e-05, "loss": 0.0712, "step": 125875 }, { "epoch": 1.8541700416783258, "grad_norm": 1.3615065813064575, "learning_rate": 1.398495846887948e-05, "loss": 0.0686, "step": 125900 }, { "epoch": 1.8545382247684128, "grad_norm": 1.3635457754135132, "learning_rate": 1.3983322096912494e-05, "loss": 0.075, "step": 125925 }, { "epoch": 1.8549064078585, "grad_norm": 1.9268525838851929, "learning_rate": 1.398168572494551e-05, "loss": 0.0703, "step": 125950 }, { "epoch": 1.855274590948587, "grad_norm": 1.5137852430343628, "learning_rate": 1.3980049352978527e-05, "loss": 0.0651, "step": 125975 }, { "epoch": 1.855642774038674, "grad_norm": 1.2876266241073608, "learning_rate": 1.397841298101154e-05, "loss": 0.0672, "step": 126000 }, { "epoch": 1.856010957128761, "grad_norm": 1.370152473449707, "learning_rate": 1.3976776609044556e-05, "loss": 0.0648, "step": 126025 }, { "epoch": 1.856379140218848, "grad_norm": 1.3608373403549194, "learning_rate": 1.3975140237077572e-05, "loss": 0.0722, "step": 126050 }, { "epoch": 1.856747323308935, "grad_norm": 1.3997831344604492, "learning_rate": 1.3973503865110586e-05, "loss": 0.0683, "step": 126075 }, { "epoch": 1.8571155063990221, "grad_norm": 1.488750696182251, "learning_rate": 1.3971867493143602e-05, "loss": 0.0709, "step": 126100 }, { "epoch": 1.8574836894891091, "grad_norm": 1.4350106716156006, "learning_rate": 1.3970231121176618e-05, "loss": 0.0693, "step": 126125 }, { "epoch": 1.8578518725791962, "grad_norm": 1.9889986515045166, "learning_rate": 1.3968594749209635e-05, "loss": 0.0726, "step": 126150 }, { "epoch": 1.8582200556692832, "grad_norm": 0.9648709893226624, "learning_rate": 1.3967023832121328e-05, "loss": 0.0645, "step": 126175 }, { "epoch": 1.8585882387593702, "grad_norm": 1.1292203664779663, "learning_rate": 1.3965387460154343e-05, "loss": 0.0665, "step": 126200 }, { "epoch": 1.8589564218494572, "grad_norm": 1.1202218532562256, "learning_rate": 1.3963751088187359e-05, "loss": 0.0656, "step": 126225 }, { "epoch": 1.8593246049395442, "grad_norm": 1.5572593212127686, "learning_rate": 1.3962114716220374e-05, "loss": 0.0711, "step": 126250 }, { "epoch": 1.8596927880296312, "grad_norm": 1.3085103034973145, "learning_rate": 1.396047834425339e-05, "loss": 0.0727, "step": 126275 }, { "epoch": 1.8600609711197185, "grad_norm": 1.7910100221633911, "learning_rate": 1.3958841972286406e-05, "loss": 0.0713, "step": 126300 }, { "epoch": 1.8604291542098055, "grad_norm": 1.2467713356018066, "learning_rate": 1.3957205600319422e-05, "loss": 0.0687, "step": 126325 }, { "epoch": 1.8607973372998925, "grad_norm": 1.255759596824646, "learning_rate": 1.3955569228352436e-05, "loss": 0.0737, "step": 126350 }, { "epoch": 1.8611655203899795, "grad_norm": 1.3584084510803223, "learning_rate": 1.3953932856385453e-05, "loss": 0.0657, "step": 126375 }, { "epoch": 1.8615337034800665, "grad_norm": 1.4888774156570435, "learning_rate": 1.3952296484418469e-05, "loss": 0.0722, "step": 126400 }, { "epoch": 1.8619018865701538, "grad_norm": 1.4281727075576782, "learning_rate": 1.3950660112451482e-05, "loss": 0.0702, "step": 126425 }, { "epoch": 1.8622700696602408, "grad_norm": 1.7842527627944946, "learning_rate": 1.3949023740484498e-05, "loss": 0.0714, "step": 126450 }, { "epoch": 1.8626382527503278, "grad_norm": 1.6052942276000977, "learning_rate": 1.3947387368517514e-05, "loss": 0.0695, "step": 126475 }, { "epoch": 1.8630064358404148, "grad_norm": 1.844359040260315, "learning_rate": 1.3945750996550528e-05, "loss": 0.0763, "step": 126500 }, { "epoch": 1.8633746189305018, "grad_norm": 1.3661956787109375, "learning_rate": 1.3944114624583544e-05, "loss": 0.0747, "step": 126525 }, { "epoch": 1.8637428020205888, "grad_norm": 1.7540936470031738, "learning_rate": 1.394247825261656e-05, "loss": 0.0684, "step": 126550 }, { "epoch": 1.8641109851106759, "grad_norm": 1.5911238193511963, "learning_rate": 1.3940841880649577e-05, "loss": 0.0643, "step": 126575 }, { "epoch": 1.8644791682007629, "grad_norm": 1.2241568565368652, "learning_rate": 1.3939205508682591e-05, "loss": 0.0647, "step": 126600 }, { "epoch": 1.8648473512908499, "grad_norm": 1.1999590396881104, "learning_rate": 1.3937569136715606e-05, "loss": 0.0679, "step": 126625 }, { "epoch": 1.865215534380937, "grad_norm": 1.4949324131011963, "learning_rate": 1.393593276474862e-05, "loss": 0.0682, "step": 126650 }, { "epoch": 1.865583717471024, "grad_norm": 1.37181556224823, "learning_rate": 1.3934296392781636e-05, "loss": 0.0688, "step": 126675 }, { "epoch": 1.865951900561111, "grad_norm": 2.012047529220581, "learning_rate": 1.3932660020814652e-05, "loss": 0.0735, "step": 126700 }, { "epoch": 1.866320083651198, "grad_norm": 1.7181177139282227, "learning_rate": 1.3931023648847669e-05, "loss": 0.0653, "step": 126725 }, { "epoch": 1.866688266741285, "grad_norm": 1.4143445491790771, "learning_rate": 1.3929387276880683e-05, "loss": 0.0696, "step": 126750 }, { "epoch": 1.8670564498313722, "grad_norm": 1.4846135377883911, "learning_rate": 1.39277509049137e-05, "loss": 0.0752, "step": 126775 }, { "epoch": 1.8674246329214592, "grad_norm": 0.8891171813011169, "learning_rate": 1.3926114532946715e-05, "loss": 0.0706, "step": 126800 }, { "epoch": 1.8677928160115462, "grad_norm": 1.3381410837173462, "learning_rate": 1.3924478160979731e-05, "loss": 0.0659, "step": 126825 }, { "epoch": 1.8681609991016332, "grad_norm": 1.7535873651504517, "learning_rate": 1.3922841789012744e-05, "loss": 0.0697, "step": 126850 }, { "epoch": 1.8685291821917203, "grad_norm": 1.4343935251235962, "learning_rate": 1.392120541704576e-05, "loss": 0.0653, "step": 126875 }, { "epoch": 1.8688973652818075, "grad_norm": 1.4396963119506836, "learning_rate": 1.3919569045078777e-05, "loss": 0.0711, "step": 126900 }, { "epoch": 1.8692655483718945, "grad_norm": 1.6458162069320679, "learning_rate": 1.3917932673111791e-05, "loss": 0.0623, "step": 126925 }, { "epoch": 1.8696337314619815, "grad_norm": 1.4046906232833862, "learning_rate": 1.3916296301144807e-05, "loss": 0.0607, "step": 126950 }, { "epoch": 1.8700019145520685, "grad_norm": 1.6016461849212646, "learning_rate": 1.3914659929177823e-05, "loss": 0.0682, "step": 126975 }, { "epoch": 1.8703700976421556, "grad_norm": 1.6409575939178467, "learning_rate": 1.3913023557210838e-05, "loss": 0.0684, "step": 127000 }, { "epoch": 1.8707382807322426, "grad_norm": 1.0247600078582764, "learning_rate": 1.3911387185243854e-05, "loss": 0.067, "step": 127025 }, { "epoch": 1.8711064638223296, "grad_norm": 0.8390852808952332, "learning_rate": 1.3909750813276868e-05, "loss": 0.0667, "step": 127050 }, { "epoch": 1.8714746469124166, "grad_norm": 1.2949190139770508, "learning_rate": 1.3908114441309883e-05, "loss": 0.0671, "step": 127075 }, { "epoch": 1.8718428300025036, "grad_norm": 1.2676405906677246, "learning_rate": 1.3906478069342899e-05, "loss": 0.0697, "step": 127100 }, { "epoch": 1.8722110130925906, "grad_norm": 1.339078664779663, "learning_rate": 1.3904841697375915e-05, "loss": 0.0664, "step": 127125 }, { "epoch": 1.8725791961826777, "grad_norm": 1.5541309118270874, "learning_rate": 1.3903205325408931e-05, "loss": 0.0684, "step": 127150 }, { "epoch": 1.8729473792727647, "grad_norm": 1.1422303915023804, "learning_rate": 1.3901568953441946e-05, "loss": 0.0706, "step": 127175 }, { "epoch": 1.8733155623628517, "grad_norm": 1.151033639907837, "learning_rate": 1.3899932581474962e-05, "loss": 0.0639, "step": 127200 }, { "epoch": 1.8736837454529387, "grad_norm": 1.6416378021240234, "learning_rate": 1.3898296209507978e-05, "loss": 0.0704, "step": 127225 }, { "epoch": 1.8740519285430257, "grad_norm": 1.3154687881469727, "learning_rate": 1.3896659837540993e-05, "loss": 0.0624, "step": 127250 }, { "epoch": 1.874420111633113, "grad_norm": 1.4494855403900146, "learning_rate": 1.3895023465574007e-05, "loss": 0.0624, "step": 127275 }, { "epoch": 1.8747882947232, "grad_norm": 1.4901341199874878, "learning_rate": 1.3893387093607023e-05, "loss": 0.0688, "step": 127300 }, { "epoch": 1.875156477813287, "grad_norm": 1.6520740985870361, "learning_rate": 1.3891750721640038e-05, "loss": 0.0657, "step": 127325 }, { "epoch": 1.875524660903374, "grad_norm": 1.2405890226364136, "learning_rate": 1.3890114349673054e-05, "loss": 0.0714, "step": 127350 }, { "epoch": 1.8758928439934612, "grad_norm": 1.6109544038772583, "learning_rate": 1.388847797770607e-05, "loss": 0.0663, "step": 127375 }, { "epoch": 1.8762610270835482, "grad_norm": 1.5050334930419922, "learning_rate": 1.3886841605739086e-05, "loss": 0.0685, "step": 127400 }, { "epoch": 1.8766292101736353, "grad_norm": 1.5214776992797852, "learning_rate": 1.38852052337721e-05, "loss": 0.0731, "step": 127425 }, { "epoch": 1.8769973932637223, "grad_norm": 1.0754930973052979, "learning_rate": 1.3883568861805117e-05, "loss": 0.0618, "step": 127450 }, { "epoch": 1.8773655763538093, "grad_norm": 1.4267821311950684, "learning_rate": 1.3881932489838131e-05, "loss": 0.0662, "step": 127475 }, { "epoch": 1.8777337594438963, "grad_norm": 1.4020365476608276, "learning_rate": 1.3880296117871146e-05, "loss": 0.0767, "step": 127500 }, { "epoch": 1.8781019425339833, "grad_norm": 1.3116518259048462, "learning_rate": 1.3878659745904162e-05, "loss": 0.0662, "step": 127525 }, { "epoch": 1.8784701256240703, "grad_norm": 1.164931058883667, "learning_rate": 1.3877023373937178e-05, "loss": 0.0696, "step": 127550 }, { "epoch": 1.8788383087141574, "grad_norm": 1.18403160572052, "learning_rate": 1.3875387001970192e-05, "loss": 0.0663, "step": 127575 }, { "epoch": 1.8792064918042444, "grad_norm": 1.3779178857803345, "learning_rate": 1.3873750630003208e-05, "loss": 0.0704, "step": 127600 }, { "epoch": 1.8795746748943314, "grad_norm": 1.4132893085479736, "learning_rate": 1.3872114258036225e-05, "loss": 0.0677, "step": 127625 }, { "epoch": 1.8799428579844184, "grad_norm": 1.7565144300460815, "learning_rate": 1.387047788606924e-05, "loss": 0.0707, "step": 127650 }, { "epoch": 1.8803110410745054, "grad_norm": 1.2129707336425781, "learning_rate": 1.3868841514102254e-05, "loss": 0.0618, "step": 127675 }, { "epoch": 1.8806792241645924, "grad_norm": 1.627714991569519, "learning_rate": 1.386720514213527e-05, "loss": 0.0722, "step": 127700 }, { "epoch": 1.8810474072546794, "grad_norm": 1.5501409769058228, "learning_rate": 1.3865568770168286e-05, "loss": 0.0655, "step": 127725 }, { "epoch": 1.8814155903447667, "grad_norm": 1.7595138549804688, "learning_rate": 1.38639323982013e-05, "loss": 0.0745, "step": 127750 }, { "epoch": 1.8817837734348537, "grad_norm": 1.1655981540679932, "learning_rate": 1.3862296026234316e-05, "loss": 0.061, "step": 127775 }, { "epoch": 1.8821519565249407, "grad_norm": 1.4186490774154663, "learning_rate": 1.3860659654267333e-05, "loss": 0.0664, "step": 127800 }, { "epoch": 1.8825201396150277, "grad_norm": 1.4063063859939575, "learning_rate": 1.3859023282300347e-05, "loss": 0.0692, "step": 127825 }, { "epoch": 1.8828883227051147, "grad_norm": 2.1492862701416016, "learning_rate": 1.3857386910333363e-05, "loss": 0.0669, "step": 127850 }, { "epoch": 1.883256505795202, "grad_norm": 1.2822747230529785, "learning_rate": 1.385575053836638e-05, "loss": 0.0655, "step": 127875 }, { "epoch": 1.883624688885289, "grad_norm": 1.2677335739135742, "learning_rate": 1.3854114166399392e-05, "loss": 0.0781, "step": 127900 }, { "epoch": 1.883992871975376, "grad_norm": 2.0128173828125, "learning_rate": 1.3852477794432408e-05, "loss": 0.0698, "step": 127925 }, { "epoch": 1.884361055065463, "grad_norm": 1.3676215410232544, "learning_rate": 1.3850841422465424e-05, "loss": 0.06, "step": 127950 }, { "epoch": 1.88472923815555, "grad_norm": 2.009725332260132, "learning_rate": 1.384920505049844e-05, "loss": 0.0686, "step": 127975 }, { "epoch": 1.885097421245637, "grad_norm": 1.3375903367996216, "learning_rate": 1.3847568678531455e-05, "loss": 0.0631, "step": 128000 }, { "epoch": 1.885465604335724, "grad_norm": 1.6937209367752075, "learning_rate": 1.3845932306564471e-05, "loss": 0.067, "step": 128025 }, { "epoch": 1.885833787425811, "grad_norm": 1.276186227798462, "learning_rate": 1.3844295934597487e-05, "loss": 0.0735, "step": 128050 }, { "epoch": 1.886201970515898, "grad_norm": 1.1274360418319702, "learning_rate": 1.3842659562630503e-05, "loss": 0.0614, "step": 128075 }, { "epoch": 1.8865701536059851, "grad_norm": 1.258409023284912, "learning_rate": 1.3841023190663516e-05, "loss": 0.0692, "step": 128100 }, { "epoch": 1.8869383366960721, "grad_norm": 1.585135579109192, "learning_rate": 1.3839386818696532e-05, "loss": 0.0707, "step": 128125 }, { "epoch": 1.8873065197861592, "grad_norm": 1.4370063543319702, "learning_rate": 1.3837750446729547e-05, "loss": 0.0757, "step": 128150 }, { "epoch": 1.8876747028762462, "grad_norm": 1.4753754138946533, "learning_rate": 1.3836114074762563e-05, "loss": 0.073, "step": 128175 }, { "epoch": 1.8880428859663332, "grad_norm": 0.8586186766624451, "learning_rate": 1.3834477702795579e-05, "loss": 0.0692, "step": 128200 }, { "epoch": 1.8884110690564204, "grad_norm": 1.8866338729858398, "learning_rate": 1.3832841330828595e-05, "loss": 0.0754, "step": 128225 }, { "epoch": 1.8887792521465074, "grad_norm": 1.4799463748931885, "learning_rate": 1.383127041374029e-05, "loss": 0.072, "step": 128250 }, { "epoch": 1.8891474352365945, "grad_norm": 1.740657091140747, "learning_rate": 1.3829634041773305e-05, "loss": 0.0738, "step": 128275 }, { "epoch": 1.8895156183266815, "grad_norm": 1.2831006050109863, "learning_rate": 1.3827997669806321e-05, "loss": 0.0761, "step": 128300 }, { "epoch": 1.8898838014167685, "grad_norm": 1.0753891468048096, "learning_rate": 1.3826361297839334e-05, "loss": 0.0727, "step": 128325 }, { "epoch": 1.8902519845068557, "grad_norm": 1.3246605396270752, "learning_rate": 1.382472492587235e-05, "loss": 0.0738, "step": 128350 }, { "epoch": 1.8906201675969427, "grad_norm": 1.0399971008300781, "learning_rate": 1.3823088553905367e-05, "loss": 0.0668, "step": 128375 }, { "epoch": 1.8909883506870298, "grad_norm": 1.3426973819732666, "learning_rate": 1.3821452181938381e-05, "loss": 0.0675, "step": 128400 }, { "epoch": 1.8913565337771168, "grad_norm": 1.9310686588287354, "learning_rate": 1.3819815809971397e-05, "loss": 0.0693, "step": 128425 }, { "epoch": 1.8917247168672038, "grad_norm": 2.069502592086792, "learning_rate": 1.3818179438004413e-05, "loss": 0.0776, "step": 128450 }, { "epoch": 1.8920928999572908, "grad_norm": 1.4830806255340576, "learning_rate": 1.381654306603743e-05, "loss": 0.0668, "step": 128475 }, { "epoch": 1.8924610830473778, "grad_norm": 1.444730281829834, "learning_rate": 1.3814906694070444e-05, "loss": 0.0769, "step": 128500 }, { "epoch": 1.8928292661374648, "grad_norm": 1.0734021663665771, "learning_rate": 1.3813270322103458e-05, "loss": 0.0658, "step": 128525 }, { "epoch": 1.8931974492275518, "grad_norm": 1.1945226192474365, "learning_rate": 1.3811633950136475e-05, "loss": 0.0667, "step": 128550 }, { "epoch": 1.8935656323176389, "grad_norm": 1.3909984827041626, "learning_rate": 1.3809997578169489e-05, "loss": 0.0644, "step": 128575 }, { "epoch": 1.8939338154077259, "grad_norm": 1.0541220903396606, "learning_rate": 1.3808361206202505e-05, "loss": 0.0673, "step": 128600 }, { "epoch": 1.8943019984978129, "grad_norm": 1.4383426904678345, "learning_rate": 1.3806724834235521e-05, "loss": 0.0693, "step": 128625 }, { "epoch": 1.8946701815879, "grad_norm": 1.356213092803955, "learning_rate": 1.3805088462268537e-05, "loss": 0.067, "step": 128650 }, { "epoch": 1.895038364677987, "grad_norm": 0.860144853591919, "learning_rate": 1.3803452090301552e-05, "loss": 0.0695, "step": 128675 }, { "epoch": 1.8954065477680742, "grad_norm": 1.758234977722168, "learning_rate": 1.3801815718334568e-05, "loss": 0.0663, "step": 128700 }, { "epoch": 1.8957747308581612, "grad_norm": 1.6093204021453857, "learning_rate": 1.3800179346367584e-05, "loss": 0.0693, "step": 128725 }, { "epoch": 1.8961429139482482, "grad_norm": 0.980208694934845, "learning_rate": 1.3798542974400597e-05, "loss": 0.0694, "step": 128750 }, { "epoch": 1.8965110970383352, "grad_norm": 1.3171117305755615, "learning_rate": 1.3796906602433613e-05, "loss": 0.0665, "step": 128775 }, { "epoch": 1.8968792801284222, "grad_norm": 1.201322317123413, "learning_rate": 1.379527023046663e-05, "loss": 0.0659, "step": 128800 }, { "epoch": 1.8972474632185095, "grad_norm": 1.3161327838897705, "learning_rate": 1.3793633858499644e-05, "loss": 0.0615, "step": 128825 }, { "epoch": 1.8976156463085965, "grad_norm": 1.1581096649169922, "learning_rate": 1.379199748653266e-05, "loss": 0.0673, "step": 128850 }, { "epoch": 1.8979838293986835, "grad_norm": 1.4302693605422974, "learning_rate": 1.3790361114565676e-05, "loss": 0.0664, "step": 128875 }, { "epoch": 1.8983520124887705, "grad_norm": 0.9522002339363098, "learning_rate": 1.3788724742598692e-05, "loss": 0.0654, "step": 128900 }, { "epoch": 1.8987201955788575, "grad_norm": 1.4306349754333496, "learning_rate": 1.3787088370631707e-05, "loss": 0.0677, "step": 128925 }, { "epoch": 1.8990883786689445, "grad_norm": 1.559780240058899, "learning_rate": 1.3785451998664721e-05, "loss": 0.072, "step": 128950 }, { "epoch": 1.8994565617590315, "grad_norm": 1.4206688404083252, "learning_rate": 1.3783815626697736e-05, "loss": 0.0714, "step": 128975 }, { "epoch": 1.8998247448491186, "grad_norm": 2.223038911819458, "learning_rate": 1.3782179254730752e-05, "loss": 0.066, "step": 129000 }, { "epoch": 1.9001929279392056, "grad_norm": 1.3395779132843018, "learning_rate": 1.3780542882763768e-05, "loss": 0.0667, "step": 129025 }, { "epoch": 1.9005611110292926, "grad_norm": 1.8482961654663086, "learning_rate": 1.3778906510796784e-05, "loss": 0.0682, "step": 129050 }, { "epoch": 1.9009292941193796, "grad_norm": 1.177207589149475, "learning_rate": 1.3777270138829798e-05, "loss": 0.0626, "step": 129075 }, { "epoch": 1.9012974772094666, "grad_norm": 1.647931456565857, "learning_rate": 1.3775633766862815e-05, "loss": 0.0672, "step": 129100 }, { "epoch": 1.9016656602995536, "grad_norm": 1.225214958190918, "learning_rate": 1.377399739489583e-05, "loss": 0.0709, "step": 129125 }, { "epoch": 1.9020338433896407, "grad_norm": 1.8574533462524414, "learning_rate": 1.3772361022928844e-05, "loss": 0.074, "step": 129150 }, { "epoch": 1.902402026479728, "grad_norm": 1.0986663103103638, "learning_rate": 1.377072465096186e-05, "loss": 0.0735, "step": 129175 }, { "epoch": 1.902770209569815, "grad_norm": 1.047900676727295, "learning_rate": 1.3769088278994876e-05, "loss": 0.0634, "step": 129200 }, { "epoch": 1.903138392659902, "grad_norm": 1.5773135423660278, "learning_rate": 1.3767451907027892e-05, "loss": 0.0704, "step": 129225 }, { "epoch": 1.903506575749989, "grad_norm": 1.3682206869125366, "learning_rate": 1.3765815535060906e-05, "loss": 0.0643, "step": 129250 }, { "epoch": 1.903874758840076, "grad_norm": 1.2353607416152954, "learning_rate": 1.3764179163093923e-05, "loss": 0.0647, "step": 129275 }, { "epoch": 1.9042429419301632, "grad_norm": 1.4179573059082031, "learning_rate": 1.3762542791126939e-05, "loss": 0.0651, "step": 129300 }, { "epoch": 1.9046111250202502, "grad_norm": 1.4659348726272583, "learning_rate": 1.3760906419159953e-05, "loss": 0.0688, "step": 129325 }, { "epoch": 1.9049793081103372, "grad_norm": 1.4568201303482056, "learning_rate": 1.375927004719297e-05, "loss": 0.0729, "step": 129350 }, { "epoch": 1.9053474912004242, "grad_norm": 1.249602198600769, "learning_rate": 1.3757633675225984e-05, "loss": 0.0692, "step": 129375 }, { "epoch": 1.9057156742905113, "grad_norm": 0.8342622518539429, "learning_rate": 1.3755997303258998e-05, "loss": 0.0691, "step": 129400 }, { "epoch": 1.9060838573805983, "grad_norm": 1.3442509174346924, "learning_rate": 1.3754360931292014e-05, "loss": 0.0625, "step": 129425 }, { "epoch": 1.9064520404706853, "grad_norm": 1.4729362726211548, "learning_rate": 1.375272455932503e-05, "loss": 0.0718, "step": 129450 }, { "epoch": 1.9068202235607723, "grad_norm": 1.4642016887664795, "learning_rate": 1.3751088187358047e-05, "loss": 0.068, "step": 129475 }, { "epoch": 1.9071884066508593, "grad_norm": 1.5003490447998047, "learning_rate": 1.3749451815391061e-05, "loss": 0.0622, "step": 129500 }, { "epoch": 1.9075565897409463, "grad_norm": 1.138119101524353, "learning_rate": 1.3747815443424077e-05, "loss": 0.0742, "step": 129525 }, { "epoch": 1.9079247728310333, "grad_norm": 1.7713483572006226, "learning_rate": 1.3746179071457093e-05, "loss": 0.0701, "step": 129550 }, { "epoch": 1.9082929559211204, "grad_norm": 1.2612457275390625, "learning_rate": 1.3744542699490106e-05, "loss": 0.0674, "step": 129575 }, { "epoch": 1.9086611390112074, "grad_norm": 1.200155258178711, "learning_rate": 1.3742906327523122e-05, "loss": 0.0632, "step": 129600 }, { "epoch": 1.9090293221012944, "grad_norm": 1.6533445119857788, "learning_rate": 1.3741269955556139e-05, "loss": 0.0706, "step": 129625 }, { "epoch": 1.9093975051913814, "grad_norm": 2.0837175846099854, "learning_rate": 1.3739633583589153e-05, "loss": 0.0741, "step": 129650 }, { "epoch": 1.9097656882814686, "grad_norm": 1.1843743324279785, "learning_rate": 1.373799721162217e-05, "loss": 0.0627, "step": 129675 }, { "epoch": 1.9101338713715557, "grad_norm": 1.6287152767181396, "learning_rate": 1.3736360839655185e-05, "loss": 0.0688, "step": 129700 }, { "epoch": 1.9105020544616427, "grad_norm": 1.1749218702316284, "learning_rate": 1.3734724467688201e-05, "loss": 0.0699, "step": 129725 }, { "epoch": 1.9108702375517297, "grad_norm": 1.2098321914672852, "learning_rate": 1.3733088095721216e-05, "loss": 0.0638, "step": 129750 }, { "epoch": 1.911238420641817, "grad_norm": 1.307051658630371, "learning_rate": 1.3731451723754232e-05, "loss": 0.0692, "step": 129775 }, { "epoch": 1.911606603731904, "grad_norm": 1.4983049631118774, "learning_rate": 1.3729815351787245e-05, "loss": 0.0715, "step": 129800 }, { "epoch": 1.911974786821991, "grad_norm": 1.3979671001434326, "learning_rate": 1.3728178979820261e-05, "loss": 0.0715, "step": 129825 }, { "epoch": 1.912342969912078, "grad_norm": 1.799364686012268, "learning_rate": 1.3726542607853277e-05, "loss": 0.0704, "step": 129850 }, { "epoch": 1.912711153002165, "grad_norm": 1.204068660736084, "learning_rate": 1.3724906235886293e-05, "loss": 0.0697, "step": 129875 }, { "epoch": 1.913079336092252, "grad_norm": 1.6166411638259888, "learning_rate": 1.3723269863919308e-05, "loss": 0.0705, "step": 129900 }, { "epoch": 1.913447519182339, "grad_norm": 1.3805930614471436, "learning_rate": 1.3721633491952324e-05, "loss": 0.064, "step": 129925 }, { "epoch": 1.913815702272426, "grad_norm": 1.4439527988433838, "learning_rate": 1.371999711998534e-05, "loss": 0.068, "step": 129950 }, { "epoch": 1.914183885362513, "grad_norm": 1.1924982070922852, "learning_rate": 1.3718360748018356e-05, "loss": 0.0655, "step": 129975 }, { "epoch": 1.9145520684526, "grad_norm": 1.069240927696228, "learning_rate": 1.3716724376051369e-05, "loss": 0.073, "step": 130000 }, { "epoch": 1.914920251542687, "grad_norm": 1.7875123023986816, "learning_rate": 1.3715088004084385e-05, "loss": 0.0705, "step": 130025 }, { "epoch": 1.915288434632774, "grad_norm": 1.511775255203247, "learning_rate": 1.3713451632117401e-05, "loss": 0.0648, "step": 130050 }, { "epoch": 1.915656617722861, "grad_norm": 1.9179155826568604, "learning_rate": 1.3711815260150416e-05, "loss": 0.0628, "step": 130075 }, { "epoch": 1.9160248008129481, "grad_norm": 1.4427399635314941, "learning_rate": 1.3710178888183432e-05, "loss": 0.071, "step": 130100 }, { "epoch": 1.9163929839030351, "grad_norm": 1.3722892999649048, "learning_rate": 1.3708542516216448e-05, "loss": 0.069, "step": 130125 }, { "epoch": 1.9167611669931224, "grad_norm": 1.374695062637329, "learning_rate": 1.3706906144249462e-05, "loss": 0.0675, "step": 130150 }, { "epoch": 1.9171293500832094, "grad_norm": 1.3429064750671387, "learning_rate": 1.3705269772282479e-05, "loss": 0.0716, "step": 130175 }, { "epoch": 1.9174975331732964, "grad_norm": 1.6231729984283447, "learning_rate": 1.3703633400315495e-05, "loss": 0.0636, "step": 130200 }, { "epoch": 1.9178657162633834, "grad_norm": 1.3547338247299194, "learning_rate": 1.3701997028348508e-05, "loss": 0.0685, "step": 130225 }, { "epoch": 1.9182338993534704, "grad_norm": 0.9752511382102966, "learning_rate": 1.3700360656381524e-05, "loss": 0.0731, "step": 130250 }, { "epoch": 1.9186020824435577, "grad_norm": 1.4045292139053345, "learning_rate": 1.369872428441454e-05, "loss": 0.0644, "step": 130275 }, { "epoch": 1.9189702655336447, "grad_norm": 1.7658567428588867, "learning_rate": 1.3697087912447556e-05, "loss": 0.0664, "step": 130300 }, { "epoch": 1.9193384486237317, "grad_norm": 1.4007624387741089, "learning_rate": 1.369545154048057e-05, "loss": 0.0765, "step": 130325 }, { "epoch": 1.9197066317138187, "grad_norm": 1.3450874090194702, "learning_rate": 1.3693815168513587e-05, "loss": 0.0684, "step": 130350 }, { "epoch": 1.9200748148039057, "grad_norm": 1.3079489469528198, "learning_rate": 1.3692178796546603e-05, "loss": 0.0669, "step": 130375 }, { "epoch": 1.9204429978939928, "grad_norm": 1.468977928161621, "learning_rate": 1.3690542424579619e-05, "loss": 0.068, "step": 130400 }, { "epoch": 1.9208111809840798, "grad_norm": 1.5864746570587158, "learning_rate": 1.3688906052612632e-05, "loss": 0.0734, "step": 130425 }, { "epoch": 1.9211793640741668, "grad_norm": 1.0984007120132446, "learning_rate": 1.3687269680645648e-05, "loss": 0.0688, "step": 130450 }, { "epoch": 1.9215475471642538, "grad_norm": 2.844667911529541, "learning_rate": 1.3685633308678662e-05, "loss": 0.069, "step": 130475 }, { "epoch": 1.9219157302543408, "grad_norm": 1.331337332725525, "learning_rate": 1.3683996936711678e-05, "loss": 0.0589, "step": 130500 }, { "epoch": 1.9222839133444278, "grad_norm": 1.4961379766464233, "learning_rate": 1.3682360564744695e-05, "loss": 0.0739, "step": 130525 }, { "epoch": 1.9226520964345148, "grad_norm": 0.9069755673408508, "learning_rate": 1.368072419277771e-05, "loss": 0.0714, "step": 130550 }, { "epoch": 1.9230202795246019, "grad_norm": 1.4977551698684692, "learning_rate": 1.3679087820810725e-05, "loss": 0.068, "step": 130575 }, { "epoch": 1.9233884626146889, "grad_norm": 1.2419164180755615, "learning_rate": 1.3677451448843741e-05, "loss": 0.0629, "step": 130600 }, { "epoch": 1.9237566457047761, "grad_norm": 1.3516414165496826, "learning_rate": 1.3675815076876756e-05, "loss": 0.0602, "step": 130625 }, { "epoch": 1.9241248287948631, "grad_norm": 0.9260694980621338, "learning_rate": 1.367417870490977e-05, "loss": 0.072, "step": 130650 }, { "epoch": 1.9244930118849501, "grad_norm": 1.580206274986267, "learning_rate": 1.3672542332942786e-05, "loss": 0.0787, "step": 130675 }, { "epoch": 1.9248611949750372, "grad_norm": 1.5083047151565552, "learning_rate": 1.3670905960975803e-05, "loss": 0.0696, "step": 130700 }, { "epoch": 1.9252293780651242, "grad_norm": 1.5534709692001343, "learning_rate": 1.3669335043887496e-05, "loss": 0.073, "step": 130725 }, { "epoch": 1.9255975611552114, "grad_norm": 0.7059693932533264, "learning_rate": 1.3667698671920513e-05, "loss": 0.0654, "step": 130750 }, { "epoch": 1.9259657442452984, "grad_norm": 1.315678358078003, "learning_rate": 1.3666062299953529e-05, "loss": 0.0696, "step": 130775 }, { "epoch": 1.9263339273353854, "grad_norm": 1.7695732116699219, "learning_rate": 1.3664425927986545e-05, "loss": 0.0677, "step": 130800 }, { "epoch": 1.9267021104254725, "grad_norm": 1.2399245500564575, "learning_rate": 1.366278955601956e-05, "loss": 0.0616, "step": 130825 }, { "epoch": 1.9270702935155595, "grad_norm": 1.3680822849273682, "learning_rate": 1.3661153184052574e-05, "loss": 0.0677, "step": 130850 }, { "epoch": 1.9274384766056465, "grad_norm": 1.2249548435211182, "learning_rate": 1.365951681208559e-05, "loss": 0.0683, "step": 130875 }, { "epoch": 1.9278066596957335, "grad_norm": 1.0708106756210327, "learning_rate": 1.3657880440118604e-05, "loss": 0.0701, "step": 130900 }, { "epoch": 1.9281748427858205, "grad_norm": 1.7924996614456177, "learning_rate": 1.365624406815162e-05, "loss": 0.063, "step": 130925 }, { "epoch": 1.9285430258759075, "grad_norm": 1.2274212837219238, "learning_rate": 1.3654607696184637e-05, "loss": 0.0643, "step": 130950 }, { "epoch": 1.9289112089659946, "grad_norm": 1.825490117073059, "learning_rate": 1.3652971324217653e-05, "loss": 0.0665, "step": 130975 }, { "epoch": 1.9292793920560816, "grad_norm": 1.2124346494674683, "learning_rate": 1.3651334952250667e-05, "loss": 0.0684, "step": 131000 }, { "epoch": 1.9296475751461686, "grad_norm": 1.2271982431411743, "learning_rate": 1.3649698580283683e-05, "loss": 0.068, "step": 131025 }, { "epoch": 1.9300157582362556, "grad_norm": 1.4989469051361084, "learning_rate": 1.3648062208316696e-05, "loss": 0.0696, "step": 131050 }, { "epoch": 1.9303839413263426, "grad_norm": 1.6688058376312256, "learning_rate": 1.3646425836349712e-05, "loss": 0.0631, "step": 131075 }, { "epoch": 1.9307521244164298, "grad_norm": 1.2299247980117798, "learning_rate": 1.3644789464382729e-05, "loss": 0.0704, "step": 131100 }, { "epoch": 1.9311203075065169, "grad_norm": 1.480790376663208, "learning_rate": 1.3643153092415745e-05, "loss": 0.0672, "step": 131125 }, { "epoch": 1.9314884905966039, "grad_norm": 1.5254203081130981, "learning_rate": 1.364151672044876e-05, "loss": 0.0673, "step": 131150 }, { "epoch": 1.931856673686691, "grad_norm": 1.2582379579544067, "learning_rate": 1.3639880348481775e-05, "loss": 0.0711, "step": 131175 }, { "epoch": 1.932224856776778, "grad_norm": 1.3952453136444092, "learning_rate": 1.3638243976514791e-05, "loss": 0.0628, "step": 131200 }, { "epoch": 1.9325930398668651, "grad_norm": 1.3503388166427612, "learning_rate": 1.3636607604547808e-05, "loss": 0.0639, "step": 131225 }, { "epoch": 1.9329612229569522, "grad_norm": 1.311848521232605, "learning_rate": 1.3634971232580822e-05, "loss": 0.0705, "step": 131250 }, { "epoch": 1.9333294060470392, "grad_norm": 1.1125034093856812, "learning_rate": 1.3633334860613837e-05, "loss": 0.0634, "step": 131275 }, { "epoch": 1.9336975891371262, "grad_norm": 1.7929441928863525, "learning_rate": 1.3631698488646851e-05, "loss": 0.0651, "step": 131300 }, { "epoch": 1.9340657722272132, "grad_norm": 1.1150362491607666, "learning_rate": 1.3630062116679867e-05, "loss": 0.0652, "step": 131325 }, { "epoch": 1.9344339553173002, "grad_norm": 1.198957085609436, "learning_rate": 1.3628425744712883e-05, "loss": 0.0604, "step": 131350 }, { "epoch": 1.9348021384073872, "grad_norm": 1.4122329950332642, "learning_rate": 1.36267893727459e-05, "loss": 0.0686, "step": 131375 }, { "epoch": 1.9351703214974743, "grad_norm": 1.4253836870193481, "learning_rate": 1.3625153000778914e-05, "loss": 0.0673, "step": 131400 }, { "epoch": 1.9355385045875613, "grad_norm": 1.620025873184204, "learning_rate": 1.362351662881193e-05, "loss": 0.0697, "step": 131425 }, { "epoch": 1.9359066876776483, "grad_norm": 1.8214222192764282, "learning_rate": 1.3621880256844946e-05, "loss": 0.0668, "step": 131450 }, { "epoch": 1.9362748707677353, "grad_norm": 1.322893500328064, "learning_rate": 1.3620243884877959e-05, "loss": 0.0723, "step": 131475 }, { "epoch": 1.9366430538578223, "grad_norm": 0.9099249839782715, "learning_rate": 1.3618607512910975e-05, "loss": 0.0656, "step": 131500 }, { "epoch": 1.9370112369479093, "grad_norm": 1.1030004024505615, "learning_rate": 1.3616971140943991e-05, "loss": 0.0648, "step": 131525 }, { "epoch": 1.9373794200379963, "grad_norm": 1.1647626161575317, "learning_rate": 1.3615334768977006e-05, "loss": 0.0682, "step": 131550 }, { "epoch": 1.9377476031280836, "grad_norm": 1.7726775407791138, "learning_rate": 1.3613698397010022e-05, "loss": 0.0707, "step": 131575 }, { "epoch": 1.9381157862181706, "grad_norm": 1.5560965538024902, "learning_rate": 1.3612062025043038e-05, "loss": 0.0764, "step": 131600 }, { "epoch": 1.9384839693082576, "grad_norm": 1.5568289756774902, "learning_rate": 1.3610425653076054e-05, "loss": 0.0724, "step": 131625 }, { "epoch": 1.9388521523983446, "grad_norm": 1.0233157873153687, "learning_rate": 1.3608789281109069e-05, "loss": 0.0692, "step": 131650 }, { "epoch": 1.9392203354884316, "grad_norm": 1.8779631853103638, "learning_rate": 1.3607152909142085e-05, "loss": 0.0726, "step": 131675 }, { "epoch": 1.9395885185785189, "grad_norm": 1.4789386987686157, "learning_rate": 1.36055165371751e-05, "loss": 0.0652, "step": 131700 }, { "epoch": 1.939956701668606, "grad_norm": 1.684277057647705, "learning_rate": 1.3603880165208114e-05, "loss": 0.0596, "step": 131725 }, { "epoch": 1.940324884758693, "grad_norm": 1.172790288925171, "learning_rate": 1.360224379324113e-05, "loss": 0.0653, "step": 131750 }, { "epoch": 1.94069306784878, "grad_norm": 1.4107803106307983, "learning_rate": 1.3600607421274146e-05, "loss": 0.0652, "step": 131775 }, { "epoch": 1.941061250938867, "grad_norm": 1.1625288724899292, "learning_rate": 1.3598971049307162e-05, "loss": 0.0644, "step": 131800 }, { "epoch": 1.941429434028954, "grad_norm": 1.083372950553894, "learning_rate": 1.3597334677340177e-05, "loss": 0.0705, "step": 131825 }, { "epoch": 1.941797617119041, "grad_norm": 1.3767958879470825, "learning_rate": 1.3595698305373193e-05, "loss": 0.0673, "step": 131850 }, { "epoch": 1.942165800209128, "grad_norm": 1.253279209136963, "learning_rate": 1.3594061933406209e-05, "loss": 0.0674, "step": 131875 }, { "epoch": 1.942533983299215, "grad_norm": 1.4885085821151733, "learning_rate": 1.3592425561439222e-05, "loss": 0.066, "step": 131900 }, { "epoch": 1.942902166389302, "grad_norm": 1.308825969696045, "learning_rate": 1.3590789189472238e-05, "loss": 0.0617, "step": 131925 }, { "epoch": 1.943270349479389, "grad_norm": 1.5395394563674927, "learning_rate": 1.3589152817505254e-05, "loss": 0.072, "step": 131950 }, { "epoch": 1.943638532569476, "grad_norm": 1.3309717178344727, "learning_rate": 1.3587516445538268e-05, "loss": 0.0634, "step": 131975 }, { "epoch": 1.944006715659563, "grad_norm": 1.2955114841461182, "learning_rate": 1.3585880073571285e-05, "loss": 0.0609, "step": 132000 }, { "epoch": 1.94437489874965, "grad_norm": 1.6768534183502197, "learning_rate": 1.35842437016043e-05, "loss": 0.074, "step": 132025 }, { "epoch": 1.9447430818397373, "grad_norm": 1.7316994667053223, "learning_rate": 1.3582607329637317e-05, "loss": 0.0711, "step": 132050 }, { "epoch": 1.9451112649298243, "grad_norm": 1.072589635848999, "learning_rate": 1.3580970957670331e-05, "loss": 0.0719, "step": 132075 }, { "epoch": 1.9454794480199114, "grad_norm": 1.1211479902267456, "learning_rate": 1.3579334585703348e-05, "loss": 0.0722, "step": 132100 }, { "epoch": 1.9458476311099984, "grad_norm": 1.4665725231170654, "learning_rate": 1.357769821373636e-05, "loss": 0.0613, "step": 132125 }, { "epoch": 1.9462158142000854, "grad_norm": 1.207033634185791, "learning_rate": 1.3576061841769376e-05, "loss": 0.0597, "step": 132150 }, { "epoch": 1.9465839972901726, "grad_norm": 1.6699589490890503, "learning_rate": 1.3574425469802393e-05, "loss": 0.0634, "step": 132175 }, { "epoch": 1.9469521803802596, "grad_norm": 0.996124267578125, "learning_rate": 1.3572789097835409e-05, "loss": 0.0674, "step": 132200 }, { "epoch": 1.9473203634703466, "grad_norm": 1.4296915531158447, "learning_rate": 1.3571152725868423e-05, "loss": 0.0707, "step": 132225 }, { "epoch": 1.9476885465604337, "grad_norm": 1.1712855100631714, "learning_rate": 1.356951635390144e-05, "loss": 0.0721, "step": 132250 }, { "epoch": 1.9480567296505207, "grad_norm": 1.4523475170135498, "learning_rate": 1.3567879981934455e-05, "loss": 0.0717, "step": 132275 }, { "epoch": 1.9484249127406077, "grad_norm": 1.652687907218933, "learning_rate": 1.3566243609967472e-05, "loss": 0.077, "step": 132300 }, { "epoch": 1.9487930958306947, "grad_norm": 1.402045726776123, "learning_rate": 1.3564607238000484e-05, "loss": 0.0647, "step": 132325 }, { "epoch": 1.9491612789207817, "grad_norm": 1.2048203945159912, "learning_rate": 1.35629708660335e-05, "loss": 0.0722, "step": 132350 }, { "epoch": 1.9495294620108687, "grad_norm": 1.4486668109893799, "learning_rate": 1.3561334494066517e-05, "loss": 0.0595, "step": 132375 }, { "epoch": 1.9498976451009558, "grad_norm": 0.9311128854751587, "learning_rate": 1.3559698122099531e-05, "loss": 0.0684, "step": 132400 }, { "epoch": 1.9502658281910428, "grad_norm": 1.1343120336532593, "learning_rate": 1.3558061750132547e-05, "loss": 0.0608, "step": 132425 }, { "epoch": 1.9506340112811298, "grad_norm": 1.9259246587753296, "learning_rate": 1.3556425378165563e-05, "loss": 0.0619, "step": 132450 }, { "epoch": 1.9510021943712168, "grad_norm": 1.3511766195297241, "learning_rate": 1.3554789006198578e-05, "loss": 0.072, "step": 132475 }, { "epoch": 1.9513703774613038, "grad_norm": 1.5985580682754517, "learning_rate": 1.3553152634231594e-05, "loss": 0.0603, "step": 132500 }, { "epoch": 1.9517385605513908, "grad_norm": 2.1103451251983643, "learning_rate": 1.3551516262264609e-05, "loss": 0.0743, "step": 132525 }, { "epoch": 1.952106743641478, "grad_norm": 1.3271312713623047, "learning_rate": 1.3549879890297623e-05, "loss": 0.0611, "step": 132550 }, { "epoch": 1.952474926731565, "grad_norm": 1.4392950534820557, "learning_rate": 1.3548243518330639e-05, "loss": 0.0693, "step": 132575 }, { "epoch": 1.952843109821652, "grad_norm": 1.2489006519317627, "learning_rate": 1.3546607146363655e-05, "loss": 0.0575, "step": 132600 }, { "epoch": 1.9532112929117391, "grad_norm": 1.6113096475601196, "learning_rate": 1.3544970774396671e-05, "loss": 0.0686, "step": 132625 }, { "epoch": 1.9535794760018264, "grad_norm": 1.4142565727233887, "learning_rate": 1.3543334402429686e-05, "loss": 0.067, "step": 132650 }, { "epoch": 1.9539476590919134, "grad_norm": 1.5173475742340088, "learning_rate": 1.3541698030462702e-05, "loss": 0.0629, "step": 132675 }, { "epoch": 1.9543158421820004, "grad_norm": 1.070237398147583, "learning_rate": 1.3540061658495718e-05, "loss": 0.0642, "step": 132700 }, { "epoch": 1.9546840252720874, "grad_norm": 1.234835147857666, "learning_rate": 1.3538425286528734e-05, "loss": 0.0662, "step": 132725 }, { "epoch": 1.9550522083621744, "grad_norm": 1.0772147178649902, "learning_rate": 1.3536788914561747e-05, "loss": 0.0641, "step": 132750 }, { "epoch": 1.9554203914522614, "grad_norm": 1.4735125303268433, "learning_rate": 1.3535152542594763e-05, "loss": 0.0706, "step": 132775 }, { "epoch": 1.9557885745423484, "grad_norm": 1.2210239171981812, "learning_rate": 1.3533516170627778e-05, "loss": 0.0656, "step": 132800 }, { "epoch": 1.9561567576324355, "grad_norm": 1.1605870723724365, "learning_rate": 1.3531879798660794e-05, "loss": 0.0723, "step": 132825 }, { "epoch": 1.9565249407225225, "grad_norm": 1.618125319480896, "learning_rate": 1.353024342669381e-05, "loss": 0.0663, "step": 132850 }, { "epoch": 1.9568931238126095, "grad_norm": 1.441646695137024, "learning_rate": 1.3528607054726826e-05, "loss": 0.0651, "step": 132875 }, { "epoch": 1.9572613069026965, "grad_norm": 1.383882761001587, "learning_rate": 1.352697068275984e-05, "loss": 0.0689, "step": 132900 }, { "epoch": 1.9576294899927835, "grad_norm": 1.426594853401184, "learning_rate": 1.3525334310792857e-05, "loss": 0.0692, "step": 132925 }, { "epoch": 1.9579976730828705, "grad_norm": 1.6448640823364258, "learning_rate": 1.3523697938825871e-05, "loss": 0.0695, "step": 132950 }, { "epoch": 1.9583658561729576, "grad_norm": 1.2162638902664185, "learning_rate": 1.3522061566858886e-05, "loss": 0.0695, "step": 132975 }, { "epoch": 1.9587340392630446, "grad_norm": 0.8742014169692993, "learning_rate": 1.3520425194891902e-05, "loss": 0.0556, "step": 133000 }, { "epoch": 1.9591022223531318, "grad_norm": 1.3841526508331299, "learning_rate": 1.3518788822924918e-05, "loss": 0.0607, "step": 133025 }, { "epoch": 1.9594704054432188, "grad_norm": 1.3889743089675903, "learning_rate": 1.3517152450957932e-05, "loss": 0.0673, "step": 133050 }, { "epoch": 1.9598385885333058, "grad_norm": 1.669912576675415, "learning_rate": 1.3515516078990949e-05, "loss": 0.067, "step": 133075 }, { "epoch": 1.9602067716233929, "grad_norm": 1.5863838195800781, "learning_rate": 1.3513879707023965e-05, "loss": 0.0761, "step": 133100 }, { "epoch": 1.9605749547134799, "grad_norm": 1.5423662662506104, "learning_rate": 1.3512243335056981e-05, "loss": 0.0588, "step": 133125 }, { "epoch": 1.960943137803567, "grad_norm": 1.1435093879699707, "learning_rate": 1.3510606963089995e-05, "loss": 0.0643, "step": 133150 }, { "epoch": 1.9613113208936541, "grad_norm": 1.0480999946594238, "learning_rate": 1.350897059112301e-05, "loss": 0.0637, "step": 133175 }, { "epoch": 1.9616795039837411, "grad_norm": 1.140135407447815, "learning_rate": 1.3507334219156026e-05, "loss": 0.0695, "step": 133200 }, { "epoch": 1.9620476870738282, "grad_norm": 1.8027437925338745, "learning_rate": 1.350569784718904e-05, "loss": 0.068, "step": 133225 }, { "epoch": 1.9624158701639152, "grad_norm": 1.075918197631836, "learning_rate": 1.3504061475222057e-05, "loss": 0.0603, "step": 133250 }, { "epoch": 1.9627840532540022, "grad_norm": 1.0406928062438965, "learning_rate": 1.3502425103255073e-05, "loss": 0.0636, "step": 133275 }, { "epoch": 1.9631522363440892, "grad_norm": 1.8164253234863281, "learning_rate": 1.3500788731288089e-05, "loss": 0.0692, "step": 133300 }, { "epoch": 1.9635204194341762, "grad_norm": 2.3341472148895264, "learning_rate": 1.3499152359321103e-05, "loss": 0.0684, "step": 133325 }, { "epoch": 1.9638886025242632, "grad_norm": 1.5393311977386475, "learning_rate": 1.349751598735412e-05, "loss": 0.0695, "step": 133350 }, { "epoch": 1.9642567856143502, "grad_norm": 0.9977759718894958, "learning_rate": 1.3495879615387132e-05, "loss": 0.0675, "step": 133375 }, { "epoch": 1.9646249687044373, "grad_norm": 1.6607754230499268, "learning_rate": 1.3494243243420148e-05, "loss": 0.072, "step": 133400 }, { "epoch": 1.9649931517945243, "grad_norm": 1.4092211723327637, "learning_rate": 1.3492606871453165e-05, "loss": 0.0699, "step": 133425 }, { "epoch": 1.9653613348846113, "grad_norm": 1.3322073221206665, "learning_rate": 1.349097049948618e-05, "loss": 0.0642, "step": 133450 }, { "epoch": 1.9657295179746983, "grad_norm": 1.4318546056747437, "learning_rate": 1.3489334127519195e-05, "loss": 0.0626, "step": 133475 }, { "epoch": 1.9660977010647855, "grad_norm": 0.9655215740203857, "learning_rate": 1.3487697755552211e-05, "loss": 0.0642, "step": 133500 }, { "epoch": 1.9664658841548726, "grad_norm": 1.1807976961135864, "learning_rate": 1.3486061383585227e-05, "loss": 0.0637, "step": 133525 }, { "epoch": 1.9668340672449596, "grad_norm": 1.1131510734558105, "learning_rate": 1.3484425011618244e-05, "loss": 0.0683, "step": 133550 }, { "epoch": 1.9672022503350466, "grad_norm": 1.3223555088043213, "learning_rate": 1.3482854094529938e-05, "loss": 0.0731, "step": 133575 }, { "epoch": 1.9675704334251336, "grad_norm": 2.066756010055542, "learning_rate": 1.3481217722562952e-05, "loss": 0.0674, "step": 133600 }, { "epoch": 1.9679386165152208, "grad_norm": 1.721665382385254, "learning_rate": 1.3479581350595966e-05, "loss": 0.0683, "step": 133625 }, { "epoch": 1.9683067996053079, "grad_norm": 1.6085426807403564, "learning_rate": 1.3477944978628983e-05, "loss": 0.0637, "step": 133650 }, { "epoch": 1.9686749826953949, "grad_norm": 0.8268477916717529, "learning_rate": 1.3476308606661999e-05, "loss": 0.0673, "step": 133675 }, { "epoch": 1.9690431657854819, "grad_norm": 1.0188851356506348, "learning_rate": 1.3474672234695015e-05, "loss": 0.0703, "step": 133700 }, { "epoch": 1.969411348875569, "grad_norm": 1.8331669569015503, "learning_rate": 1.347303586272803e-05, "loss": 0.0572, "step": 133725 }, { "epoch": 1.969779531965656, "grad_norm": 1.1416645050048828, "learning_rate": 1.3471399490761045e-05, "loss": 0.0684, "step": 133750 }, { "epoch": 1.970147715055743, "grad_norm": 1.3836314678192139, "learning_rate": 1.3469763118794062e-05, "loss": 0.0656, "step": 133775 }, { "epoch": 1.97051589814583, "grad_norm": 1.3859492540359497, "learning_rate": 1.3468126746827074e-05, "loss": 0.0638, "step": 133800 }, { "epoch": 1.970884081235917, "grad_norm": 0.8829729557037354, "learning_rate": 1.346649037486009e-05, "loss": 0.0619, "step": 133825 }, { "epoch": 1.971252264326004, "grad_norm": 1.0988966226577759, "learning_rate": 1.3464854002893107e-05, "loss": 0.0652, "step": 133850 }, { "epoch": 1.971620447416091, "grad_norm": 1.6365810632705688, "learning_rate": 1.3463217630926121e-05, "loss": 0.07, "step": 133875 }, { "epoch": 1.971988630506178, "grad_norm": 1.2164117097854614, "learning_rate": 1.3461581258959137e-05, "loss": 0.0596, "step": 133900 }, { "epoch": 1.972356813596265, "grad_norm": 1.175513505935669, "learning_rate": 1.3459944886992153e-05, "loss": 0.0703, "step": 133925 }, { "epoch": 1.972724996686352, "grad_norm": 1.099058747291565, "learning_rate": 1.345830851502517e-05, "loss": 0.0622, "step": 133950 }, { "epoch": 1.9730931797764393, "grad_norm": 1.2795138359069824, "learning_rate": 1.3456672143058184e-05, "loss": 0.0726, "step": 133975 }, { "epoch": 1.9734613628665263, "grad_norm": 1.2164860963821411, "learning_rate": 1.3455035771091199e-05, "loss": 0.0682, "step": 134000 }, { "epoch": 1.9738295459566133, "grad_norm": 1.0763880014419556, "learning_rate": 1.3453399399124215e-05, "loss": 0.0703, "step": 134025 }, { "epoch": 1.9741977290467003, "grad_norm": 1.5583292245864868, "learning_rate": 1.3451763027157229e-05, "loss": 0.0619, "step": 134050 }, { "epoch": 1.9745659121367873, "grad_norm": 1.5683109760284424, "learning_rate": 1.3450126655190245e-05, "loss": 0.0606, "step": 134075 }, { "epoch": 1.9749340952268746, "grad_norm": 1.1754193305969238, "learning_rate": 1.3448490283223261e-05, "loss": 0.0697, "step": 134100 }, { "epoch": 1.9753022783169616, "grad_norm": 1.5604636669158936, "learning_rate": 1.3446853911256278e-05, "loss": 0.0663, "step": 134125 }, { "epoch": 1.9756704614070486, "grad_norm": 1.7430775165557861, "learning_rate": 1.3445217539289292e-05, "loss": 0.0678, "step": 134150 }, { "epoch": 1.9760386444971356, "grad_norm": 1.7419719696044922, "learning_rate": 1.3443581167322308e-05, "loss": 0.0718, "step": 134175 }, { "epoch": 1.9764068275872226, "grad_norm": 1.3962990045547485, "learning_rate": 1.3441944795355324e-05, "loss": 0.0619, "step": 134200 }, { "epoch": 1.9767750106773097, "grad_norm": 1.4117295742034912, "learning_rate": 1.3440308423388337e-05, "loss": 0.068, "step": 134225 }, { "epoch": 1.9771431937673967, "grad_norm": 1.3334444761276245, "learning_rate": 1.3438672051421353e-05, "loss": 0.0638, "step": 134250 }, { "epoch": 1.9775113768574837, "grad_norm": 1.459469199180603, "learning_rate": 1.343703567945437e-05, "loss": 0.0701, "step": 134275 }, { "epoch": 1.9778795599475707, "grad_norm": 1.6322544813156128, "learning_rate": 1.3435399307487384e-05, "loss": 0.0656, "step": 134300 }, { "epoch": 1.9782477430376577, "grad_norm": 1.5134178400039673, "learning_rate": 1.34337629355204e-05, "loss": 0.0608, "step": 134325 }, { "epoch": 1.9786159261277447, "grad_norm": 1.6400991678237915, "learning_rate": 1.3432126563553416e-05, "loss": 0.0737, "step": 134350 }, { "epoch": 1.9789841092178317, "grad_norm": 1.7818231582641602, "learning_rate": 1.3430490191586432e-05, "loss": 0.067, "step": 134375 }, { "epoch": 1.9793522923079188, "grad_norm": 1.3408386707305908, "learning_rate": 1.3428853819619447e-05, "loss": 0.0607, "step": 134400 }, { "epoch": 1.9797204753980058, "grad_norm": 1.1619755029678345, "learning_rate": 1.3427217447652461e-05, "loss": 0.0652, "step": 134425 }, { "epoch": 1.980088658488093, "grad_norm": 1.128059983253479, "learning_rate": 1.3425581075685476e-05, "loss": 0.0665, "step": 134450 }, { "epoch": 1.98045684157818, "grad_norm": 0.9418730735778809, "learning_rate": 1.3423944703718492e-05, "loss": 0.0591, "step": 134475 }, { "epoch": 1.980825024668267, "grad_norm": 1.3298439979553223, "learning_rate": 1.3422308331751508e-05, "loss": 0.0651, "step": 134500 }, { "epoch": 1.981193207758354, "grad_norm": 1.6387840509414673, "learning_rate": 1.3420671959784524e-05, "loss": 0.0718, "step": 134525 }, { "epoch": 1.981561390848441, "grad_norm": 1.352338433265686, "learning_rate": 1.3419035587817539e-05, "loss": 0.0651, "step": 134550 }, { "epoch": 1.9819295739385283, "grad_norm": 1.4517595767974854, "learning_rate": 1.3417399215850555e-05, "loss": 0.0682, "step": 134575 }, { "epoch": 1.9822977570286153, "grad_norm": 1.3293002843856812, "learning_rate": 1.3415762843883571e-05, "loss": 0.0654, "step": 134600 }, { "epoch": 1.9826659401187023, "grad_norm": 1.3806308507919312, "learning_rate": 1.3414126471916587e-05, "loss": 0.0651, "step": 134625 }, { "epoch": 1.9830341232087894, "grad_norm": 1.2611339092254639, "learning_rate": 1.34124900999496e-05, "loss": 0.0637, "step": 134650 }, { "epoch": 1.9834023062988764, "grad_norm": 1.5833430290222168, "learning_rate": 1.3410853727982616e-05, "loss": 0.0629, "step": 134675 }, { "epoch": 1.9837704893889634, "grad_norm": 1.1704708337783813, "learning_rate": 1.3409217356015632e-05, "loss": 0.0679, "step": 134700 }, { "epoch": 1.9841386724790504, "grad_norm": 1.4179396629333496, "learning_rate": 1.3407580984048647e-05, "loss": 0.0729, "step": 134725 }, { "epoch": 1.9845068555691374, "grad_norm": 1.4768248796463013, "learning_rate": 1.3405944612081663e-05, "loss": 0.0687, "step": 134750 }, { "epoch": 1.9848750386592244, "grad_norm": 1.534879446029663, "learning_rate": 1.3404308240114679e-05, "loss": 0.066, "step": 134775 }, { "epoch": 1.9852432217493114, "grad_norm": 1.4434044361114502, "learning_rate": 1.3402671868147693e-05, "loss": 0.0632, "step": 134800 }, { "epoch": 1.9856114048393985, "grad_norm": 1.1986098289489746, "learning_rate": 1.340103549618071e-05, "loss": 0.066, "step": 134825 }, { "epoch": 1.9859795879294855, "grad_norm": 1.2370998859405518, "learning_rate": 1.3399399124213724e-05, "loss": 0.0646, "step": 134850 }, { "epoch": 1.9863477710195725, "grad_norm": 2.0699992179870605, "learning_rate": 1.3397762752246738e-05, "loss": 0.0718, "step": 134875 }, { "epoch": 1.9867159541096595, "grad_norm": 1.4780324697494507, "learning_rate": 1.3396126380279755e-05, "loss": 0.0644, "step": 134900 }, { "epoch": 1.9870841371997465, "grad_norm": 1.4479182958602905, "learning_rate": 1.339449000831277e-05, "loss": 0.0723, "step": 134925 }, { "epoch": 1.9874523202898338, "grad_norm": 1.32070791721344, "learning_rate": 1.3392853636345787e-05, "loss": 0.0683, "step": 134950 }, { "epoch": 1.9878205033799208, "grad_norm": 1.3346669673919678, "learning_rate": 1.3391217264378801e-05, "loss": 0.0649, "step": 134975 }, { "epoch": 1.9881886864700078, "grad_norm": 1.6738038063049316, "learning_rate": 1.3389580892411817e-05, "loss": 0.0665, "step": 135000 }, { "epoch": 1.9885568695600948, "grad_norm": 1.0200092792510986, "learning_rate": 1.3387944520444834e-05, "loss": 0.0658, "step": 135025 }, { "epoch": 1.988925052650182, "grad_norm": 1.3845388889312744, "learning_rate": 1.338630814847785e-05, "loss": 0.0675, "step": 135050 }, { "epoch": 1.989293235740269, "grad_norm": 1.2986143827438354, "learning_rate": 1.3384671776510863e-05, "loss": 0.0723, "step": 135075 }, { "epoch": 1.989661418830356, "grad_norm": 1.4071464538574219, "learning_rate": 1.3383035404543879e-05, "loss": 0.0649, "step": 135100 }, { "epoch": 1.990029601920443, "grad_norm": 1.2547434568405151, "learning_rate": 1.3381399032576893e-05, "loss": 0.0648, "step": 135125 }, { "epoch": 1.99039778501053, "grad_norm": 1.0378402471542358, "learning_rate": 1.337976266060991e-05, "loss": 0.0652, "step": 135150 }, { "epoch": 1.9907659681006171, "grad_norm": 1.137075424194336, "learning_rate": 1.3378126288642925e-05, "loss": 0.0668, "step": 135175 }, { "epoch": 1.9911341511907041, "grad_norm": 1.057377815246582, "learning_rate": 1.3376489916675942e-05, "loss": 0.0607, "step": 135200 }, { "epoch": 1.9915023342807912, "grad_norm": 1.374295711517334, "learning_rate": 1.3374853544708956e-05, "loss": 0.0587, "step": 135225 }, { "epoch": 1.9918705173708782, "grad_norm": 1.5446298122406006, "learning_rate": 1.3373217172741972e-05, "loss": 0.0707, "step": 135250 }, { "epoch": 1.9922387004609652, "grad_norm": 1.2055821418762207, "learning_rate": 1.3371580800774987e-05, "loss": 0.0622, "step": 135275 }, { "epoch": 1.9926068835510522, "grad_norm": 1.558401107788086, "learning_rate": 1.3369944428808001e-05, "loss": 0.06, "step": 135300 }, { "epoch": 1.9929750666411392, "grad_norm": 0.8838926553726196, "learning_rate": 1.3368308056841017e-05, "loss": 0.0621, "step": 135325 }, { "epoch": 1.9933432497312262, "grad_norm": 1.9442501068115234, "learning_rate": 1.3366671684874033e-05, "loss": 0.064, "step": 135350 }, { "epoch": 1.9937114328213132, "grad_norm": 1.5155824422836304, "learning_rate": 1.3365035312907048e-05, "loss": 0.0697, "step": 135375 }, { "epoch": 1.9940796159114003, "grad_norm": 1.2601693868637085, "learning_rate": 1.3363398940940064e-05, "loss": 0.0696, "step": 135400 }, { "epoch": 1.9944477990014875, "grad_norm": 1.1521936655044556, "learning_rate": 1.336176256897308e-05, "loss": 0.0666, "step": 135425 }, { "epoch": 1.9948159820915745, "grad_norm": 1.2650203704833984, "learning_rate": 1.3360126197006096e-05, "loss": 0.0667, "step": 135450 }, { "epoch": 1.9951841651816615, "grad_norm": 1.6142756938934326, "learning_rate": 1.3358489825039109e-05, "loss": 0.0677, "step": 135475 }, { "epoch": 1.9955523482717485, "grad_norm": 1.4373137950897217, "learning_rate": 1.3356853453072125e-05, "loss": 0.0701, "step": 135500 }, { "epoch": 1.9959205313618356, "grad_norm": 1.6161986589431763, "learning_rate": 1.3355217081105141e-05, "loss": 0.0644, "step": 135525 }, { "epoch": 1.9962887144519228, "grad_norm": 1.5426843166351318, "learning_rate": 1.3353580709138156e-05, "loss": 0.0695, "step": 135550 }, { "epoch": 1.9966568975420098, "grad_norm": 1.4742108583450317, "learning_rate": 1.3351944337171172e-05, "loss": 0.0607, "step": 135575 }, { "epoch": 1.9970250806320968, "grad_norm": 1.3358180522918701, "learning_rate": 1.3350307965204188e-05, "loss": 0.0641, "step": 135600 }, { "epoch": 1.9973932637221838, "grad_norm": 0.905835747718811, "learning_rate": 1.3348671593237204e-05, "loss": 0.0661, "step": 135625 }, { "epoch": 1.9977614468122709, "grad_norm": 1.4315563440322876, "learning_rate": 1.3347035221270219e-05, "loss": 0.0655, "step": 135650 }, { "epoch": 1.9981296299023579, "grad_norm": 1.4081978797912598, "learning_rate": 1.3345398849303235e-05, "loss": 0.0681, "step": 135675 }, { "epoch": 1.9984978129924449, "grad_norm": 1.5095083713531494, "learning_rate": 1.3343762477336248e-05, "loss": 0.0688, "step": 135700 }, { "epoch": 1.998865996082532, "grad_norm": 1.2680156230926514, "learning_rate": 1.3342126105369264e-05, "loss": 0.0671, "step": 135725 }, { "epoch": 1.999234179172619, "grad_norm": 1.801788091659546, "learning_rate": 1.334048973340228e-05, "loss": 0.0649, "step": 135750 }, { "epoch": 1.999602362262706, "grad_norm": 1.603861689567566, "learning_rate": 1.3338853361435296e-05, "loss": 0.0626, "step": 135775 }, { "epoch": 1.999970545352793, "grad_norm": 1.4096496105194092, "learning_rate": 1.333721698946831e-05, "loss": 0.0693, "step": 135800 }, { "epoch": 2.0, "eval_loss": 0.06252778321504593, "eval_runtime": 115.2179, "eval_samples_per_second": 3078.93, "eval_steps_per_second": 6.015, "step": 135802 }, { "epoch": 2.00033872844288, "grad_norm": 1.7620490789413452, "learning_rate": 1.3335580617501327e-05, "loss": 0.0664, "step": 135825 }, { "epoch": 2.000706911532967, "grad_norm": 1.201253890991211, "learning_rate": 1.3333944245534343e-05, "loss": 0.0647, "step": 135850 }, { "epoch": 2.001075094623054, "grad_norm": 0.907674252986908, "learning_rate": 1.3332307873567359e-05, "loss": 0.0577, "step": 135875 }, { "epoch": 2.001443277713141, "grad_norm": 1.0521012544631958, "learning_rate": 1.3330671501600372e-05, "loss": 0.0687, "step": 135900 }, { "epoch": 2.001811460803228, "grad_norm": 1.1505175828933716, "learning_rate": 1.3329035129633388e-05, "loss": 0.0696, "step": 135925 }, { "epoch": 2.0021796438933155, "grad_norm": 1.0560606718063354, "learning_rate": 1.3327398757666402e-05, "loss": 0.0571, "step": 135950 }, { "epoch": 2.0025478269834025, "grad_norm": 1.2308223247528076, "learning_rate": 1.3325762385699419e-05, "loss": 0.0582, "step": 135975 }, { "epoch": 2.0029160100734895, "grad_norm": 1.277739405632019, "learning_rate": 1.3324191468611114e-05, "loss": 0.0604, "step": 136000 }, { "epoch": 2.0032841931635765, "grad_norm": 1.522047996520996, "learning_rate": 1.332255509664413e-05, "loss": 0.0607, "step": 136025 }, { "epoch": 2.0036523762536635, "grad_norm": 1.4001848697662354, "learning_rate": 1.3320918724677145e-05, "loss": 0.0602, "step": 136050 }, { "epoch": 2.0040205593437506, "grad_norm": 1.267114520072937, "learning_rate": 1.3319282352710161e-05, "loss": 0.0632, "step": 136075 }, { "epoch": 2.0043887424338376, "grad_norm": 1.0386160612106323, "learning_rate": 1.3317645980743177e-05, "loss": 0.068, "step": 136100 }, { "epoch": 2.0047569255239246, "grad_norm": 1.0514968633651733, "learning_rate": 1.331600960877619e-05, "loss": 0.0633, "step": 136125 }, { "epoch": 2.0051251086140116, "grad_norm": 1.0674878358840942, "learning_rate": 1.3314373236809206e-05, "loss": 0.0601, "step": 136150 }, { "epoch": 2.0054932917040986, "grad_norm": 1.092009425163269, "learning_rate": 1.3312736864842222e-05, "loss": 0.0635, "step": 136175 }, { "epoch": 2.0058614747941856, "grad_norm": 1.6346251964569092, "learning_rate": 1.3311100492875237e-05, "loss": 0.0595, "step": 136200 }, { "epoch": 2.0062296578842727, "grad_norm": 1.1024657487869263, "learning_rate": 1.3309464120908253e-05, "loss": 0.0667, "step": 136225 }, { "epoch": 2.0065978409743597, "grad_norm": 1.558005928993225, "learning_rate": 1.3307827748941269e-05, "loss": 0.0653, "step": 136250 }, { "epoch": 2.0069660240644467, "grad_norm": 1.4044561386108398, "learning_rate": 1.3306191376974285e-05, "loss": 0.0664, "step": 136275 }, { "epoch": 2.0073342071545337, "grad_norm": 1.5821421146392822, "learning_rate": 1.33045550050073e-05, "loss": 0.0521, "step": 136300 }, { "epoch": 2.0077023902446207, "grad_norm": 1.2789149284362793, "learning_rate": 1.3302918633040314e-05, "loss": 0.0565, "step": 136325 }, { "epoch": 2.0080705733347077, "grad_norm": 1.0251727104187012, "learning_rate": 1.330128226107333e-05, "loss": 0.0671, "step": 136350 }, { "epoch": 2.0084387564247947, "grad_norm": 1.1342840194702148, "learning_rate": 1.3299645889106345e-05, "loss": 0.0615, "step": 136375 }, { "epoch": 2.0088069395148818, "grad_norm": 1.3609364032745361, "learning_rate": 1.329800951713936e-05, "loss": 0.0581, "step": 136400 }, { "epoch": 2.0091751226049688, "grad_norm": 1.4621198177337646, "learning_rate": 1.3296373145172377e-05, "loss": 0.061, "step": 136425 }, { "epoch": 2.0095433056950562, "grad_norm": 1.2368834018707275, "learning_rate": 1.3294736773205393e-05, "loss": 0.0609, "step": 136450 }, { "epoch": 2.0099114887851433, "grad_norm": 1.1904629468917847, "learning_rate": 1.3293100401238407e-05, "loss": 0.0648, "step": 136475 }, { "epoch": 2.0102796718752303, "grad_norm": 0.9137811660766602, "learning_rate": 1.3291464029271424e-05, "loss": 0.0653, "step": 136500 }, { "epoch": 2.0106478549653173, "grad_norm": 1.5463374853134155, "learning_rate": 1.328982765730444e-05, "loss": 0.0675, "step": 136525 }, { "epoch": 2.0110160380554043, "grad_norm": 1.4156731367111206, "learning_rate": 1.3288191285337453e-05, "loss": 0.0629, "step": 136550 }, { "epoch": 2.0113842211454913, "grad_norm": 1.0640830993652344, "learning_rate": 1.3286554913370469e-05, "loss": 0.0642, "step": 136575 }, { "epoch": 2.0117524042355783, "grad_norm": 0.9086377024650574, "learning_rate": 1.3284918541403485e-05, "loss": 0.0565, "step": 136600 }, { "epoch": 2.0121205873256653, "grad_norm": 1.2068498134613037, "learning_rate": 1.32832821694365e-05, "loss": 0.055, "step": 136625 }, { "epoch": 2.0124887704157524, "grad_norm": 1.6517916917800903, "learning_rate": 1.3281645797469515e-05, "loss": 0.0588, "step": 136650 }, { "epoch": 2.0128569535058394, "grad_norm": 1.911002278327942, "learning_rate": 1.3280009425502532e-05, "loss": 0.0675, "step": 136675 }, { "epoch": 2.0132251365959264, "grad_norm": 1.4228066205978394, "learning_rate": 1.3278373053535548e-05, "loss": 0.0601, "step": 136700 }, { "epoch": 2.0135933196860134, "grad_norm": 1.662040114402771, "learning_rate": 1.3276736681568562e-05, "loss": 0.0597, "step": 136725 }, { "epoch": 2.0139615027761004, "grad_norm": 1.6213383674621582, "learning_rate": 1.3275100309601577e-05, "loss": 0.0668, "step": 136750 }, { "epoch": 2.0143296858661874, "grad_norm": 1.1999540328979492, "learning_rate": 1.3273463937634591e-05, "loss": 0.0648, "step": 136775 }, { "epoch": 2.0146978689562745, "grad_norm": 1.0933380126953125, "learning_rate": 1.3271827565667607e-05, "loss": 0.0683, "step": 136800 }, { "epoch": 2.0150660520463615, "grad_norm": 1.6969233751296997, "learning_rate": 1.3270191193700623e-05, "loss": 0.0613, "step": 136825 }, { "epoch": 2.0154342351364485, "grad_norm": 1.2042580842971802, "learning_rate": 1.326855482173364e-05, "loss": 0.0668, "step": 136850 }, { "epoch": 2.0158024182265355, "grad_norm": 1.8126479387283325, "learning_rate": 1.3266918449766654e-05, "loss": 0.0611, "step": 136875 }, { "epoch": 2.0161706013166225, "grad_norm": 0.9478511214256287, "learning_rate": 1.326528207779967e-05, "loss": 0.0673, "step": 136900 }, { "epoch": 2.01653878440671, "grad_norm": 1.2510234117507935, "learning_rate": 1.3263645705832686e-05, "loss": 0.0632, "step": 136925 }, { "epoch": 2.016906967496797, "grad_norm": 1.2847950458526611, "learning_rate": 1.3262009333865699e-05, "loss": 0.0567, "step": 136950 }, { "epoch": 2.017275150586884, "grad_norm": 1.6596367359161377, "learning_rate": 1.3260372961898715e-05, "loss": 0.0583, "step": 136975 }, { "epoch": 2.017643333676971, "grad_norm": 1.4504663944244385, "learning_rate": 1.3258736589931731e-05, "loss": 0.0572, "step": 137000 }, { "epoch": 2.018011516767058, "grad_norm": 1.1833807229995728, "learning_rate": 1.3257100217964748e-05, "loss": 0.0664, "step": 137025 }, { "epoch": 2.018379699857145, "grad_norm": 0.8727373480796814, "learning_rate": 1.3255463845997762e-05, "loss": 0.0654, "step": 137050 }, { "epoch": 2.018747882947232, "grad_norm": 1.294104814529419, "learning_rate": 1.3253827474030778e-05, "loss": 0.0648, "step": 137075 }, { "epoch": 2.019116066037319, "grad_norm": 1.1471279859542847, "learning_rate": 1.3252191102063794e-05, "loss": 0.0639, "step": 137100 }, { "epoch": 2.019484249127406, "grad_norm": 0.8571379780769348, "learning_rate": 1.3250554730096809e-05, "loss": 0.0604, "step": 137125 }, { "epoch": 2.019852432217493, "grad_norm": 1.138739824295044, "learning_rate": 1.3248918358129825e-05, "loss": 0.0595, "step": 137150 }, { "epoch": 2.02022061530758, "grad_norm": 0.9922412633895874, "learning_rate": 1.324728198616284e-05, "loss": 0.0587, "step": 137175 }, { "epoch": 2.020588798397667, "grad_norm": 1.403673529624939, "learning_rate": 1.3245645614195854e-05, "loss": 0.0629, "step": 137200 }, { "epoch": 2.020956981487754, "grad_norm": 1.358481526374817, "learning_rate": 1.324400924222887e-05, "loss": 0.0666, "step": 137225 }, { "epoch": 2.021325164577841, "grad_norm": 1.0647672414779663, "learning_rate": 1.3242372870261886e-05, "loss": 0.0615, "step": 137250 }, { "epoch": 2.021693347667928, "grad_norm": 1.1101211309432983, "learning_rate": 1.3240736498294902e-05, "loss": 0.0671, "step": 137275 }, { "epoch": 2.022061530758015, "grad_norm": 1.926060438156128, "learning_rate": 1.3239100126327917e-05, "loss": 0.0638, "step": 137300 }, { "epoch": 2.022429713848102, "grad_norm": 1.2956942319869995, "learning_rate": 1.3237463754360933e-05, "loss": 0.0673, "step": 137325 }, { "epoch": 2.0227978969381892, "grad_norm": 0.756001889705658, "learning_rate": 1.3235827382393949e-05, "loss": 0.0606, "step": 137350 }, { "epoch": 2.0231660800282762, "grad_norm": 0.7750321626663208, "learning_rate": 1.3234191010426962e-05, "loss": 0.061, "step": 137375 }, { "epoch": 2.0235342631183637, "grad_norm": 1.663071870803833, "learning_rate": 1.3232554638459978e-05, "loss": 0.0586, "step": 137400 }, { "epoch": 2.0239024462084507, "grad_norm": 1.4343326091766357, "learning_rate": 1.3230918266492994e-05, "loss": 0.0662, "step": 137425 }, { "epoch": 2.0242706292985377, "grad_norm": 1.3194102048873901, "learning_rate": 1.3229281894526009e-05, "loss": 0.0633, "step": 137450 }, { "epoch": 2.0246388123886248, "grad_norm": 1.8688809871673584, "learning_rate": 1.3227645522559025e-05, "loss": 0.0642, "step": 137475 }, { "epoch": 2.0250069954787118, "grad_norm": 1.8178691864013672, "learning_rate": 1.3226009150592041e-05, "loss": 0.0657, "step": 137500 }, { "epoch": 2.025375178568799, "grad_norm": 1.88641357421875, "learning_rate": 1.3224372778625057e-05, "loss": 0.0606, "step": 137525 }, { "epoch": 2.025743361658886, "grad_norm": 1.3782966136932373, "learning_rate": 1.3222736406658072e-05, "loss": 0.0629, "step": 137550 }, { "epoch": 2.026111544748973, "grad_norm": 1.296451210975647, "learning_rate": 1.3221100034691088e-05, "loss": 0.0609, "step": 137575 }, { "epoch": 2.02647972783906, "grad_norm": 1.3301911354064941, "learning_rate": 1.3219463662724102e-05, "loss": 0.0651, "step": 137600 }, { "epoch": 2.026847910929147, "grad_norm": 1.7005969285964966, "learning_rate": 1.3217827290757117e-05, "loss": 0.0602, "step": 137625 }, { "epoch": 2.027216094019234, "grad_norm": 1.0853757858276367, "learning_rate": 1.3216190918790133e-05, "loss": 0.0587, "step": 137650 }, { "epoch": 2.027584277109321, "grad_norm": 1.4156244993209839, "learning_rate": 1.3214554546823149e-05, "loss": 0.06, "step": 137675 }, { "epoch": 2.027952460199408, "grad_norm": 1.3988933563232422, "learning_rate": 1.3212918174856163e-05, "loss": 0.0617, "step": 137700 }, { "epoch": 2.028320643289495, "grad_norm": 1.0410761833190918, "learning_rate": 1.321128180288918e-05, "loss": 0.0665, "step": 137725 }, { "epoch": 2.028688826379582, "grad_norm": 1.1729612350463867, "learning_rate": 1.3209645430922196e-05, "loss": 0.0627, "step": 137750 }, { "epoch": 2.029057009469669, "grad_norm": 0.8793452382087708, "learning_rate": 1.3208009058955212e-05, "loss": 0.073, "step": 137775 }, { "epoch": 2.029425192559756, "grad_norm": 1.0352075099945068, "learning_rate": 1.3206372686988225e-05, "loss": 0.061, "step": 137800 }, { "epoch": 2.029793375649843, "grad_norm": 1.5649809837341309, "learning_rate": 1.320473631502124e-05, "loss": 0.0608, "step": 137825 }, { "epoch": 2.03016155873993, "grad_norm": 1.3111470937728882, "learning_rate": 1.3203099943054257e-05, "loss": 0.061, "step": 137850 }, { "epoch": 2.0305297418300174, "grad_norm": 1.7501057386398315, "learning_rate": 1.3201463571087271e-05, "loss": 0.0583, "step": 137875 }, { "epoch": 2.0308979249201045, "grad_norm": 1.616523027420044, "learning_rate": 1.3199827199120287e-05, "loss": 0.064, "step": 137900 }, { "epoch": 2.0312661080101915, "grad_norm": 1.742025375366211, "learning_rate": 1.3198190827153304e-05, "loss": 0.063, "step": 137925 }, { "epoch": 2.0316342911002785, "grad_norm": 1.0694652795791626, "learning_rate": 1.319655445518632e-05, "loss": 0.0586, "step": 137950 }, { "epoch": 2.0320024741903655, "grad_norm": 1.1424410343170166, "learning_rate": 1.3194918083219334e-05, "loss": 0.0621, "step": 137975 }, { "epoch": 2.0323706572804525, "grad_norm": 1.309401035308838, "learning_rate": 1.319328171125235e-05, "loss": 0.0643, "step": 138000 }, { "epoch": 2.0327388403705395, "grad_norm": 1.3246620893478394, "learning_rate": 1.3191645339285363e-05, "loss": 0.0622, "step": 138025 }, { "epoch": 2.0331070234606266, "grad_norm": 1.3904753923416138, "learning_rate": 1.319000896731838e-05, "loss": 0.0659, "step": 138050 }, { "epoch": 2.0334752065507136, "grad_norm": 1.8712338209152222, "learning_rate": 1.3188372595351395e-05, "loss": 0.0678, "step": 138075 }, { "epoch": 2.0338433896408006, "grad_norm": 1.263144612312317, "learning_rate": 1.3186736223384412e-05, "loss": 0.0656, "step": 138100 }, { "epoch": 2.0342115727308876, "grad_norm": 1.527934193611145, "learning_rate": 1.3185099851417426e-05, "loss": 0.0684, "step": 138125 }, { "epoch": 2.0345797558209746, "grad_norm": 1.329735517501831, "learning_rate": 1.3183463479450442e-05, "loss": 0.0657, "step": 138150 }, { "epoch": 2.0349479389110616, "grad_norm": Infinity, "learning_rate": 1.3181892562362138e-05, "loss": 0.0623, "step": 138175 }, { "epoch": 2.0353161220011486, "grad_norm": 1.3123729228973389, "learning_rate": 1.3180256190395154e-05, "loss": 0.0575, "step": 138200 }, { "epoch": 2.0356843050912357, "grad_norm": 1.5601123571395874, "learning_rate": 1.3178619818428167e-05, "loss": 0.0674, "step": 138225 }, { "epoch": 2.0360524881813227, "grad_norm": 1.5189870595932007, "learning_rate": 1.3176983446461183e-05, "loss": 0.0626, "step": 138250 }, { "epoch": 2.0364206712714097, "grad_norm": 1.7031035423278809, "learning_rate": 1.3175347074494197e-05, "loss": 0.0675, "step": 138275 }, { "epoch": 2.0367888543614967, "grad_norm": 0.8013625741004944, "learning_rate": 1.3173710702527213e-05, "loss": 0.0617, "step": 138300 }, { "epoch": 2.0371570374515837, "grad_norm": 1.5683618783950806, "learning_rate": 1.317207433056023e-05, "loss": 0.0657, "step": 138325 }, { "epoch": 2.037525220541671, "grad_norm": 1.5596259832382202, "learning_rate": 1.3170437958593246e-05, "loss": 0.0663, "step": 138350 }, { "epoch": 2.037893403631758, "grad_norm": 1.3564993143081665, "learning_rate": 1.316880158662626e-05, "loss": 0.0685, "step": 138375 }, { "epoch": 2.038261586721845, "grad_norm": 1.921115517616272, "learning_rate": 1.3167165214659276e-05, "loss": 0.0621, "step": 138400 }, { "epoch": 2.0386297698119322, "grad_norm": 0.9760974049568176, "learning_rate": 1.3165528842692293e-05, "loss": 0.063, "step": 138425 }, { "epoch": 2.0389979529020192, "grad_norm": 0.9111531972885132, "learning_rate": 1.3163892470725305e-05, "loss": 0.0624, "step": 138450 }, { "epoch": 2.0393661359921063, "grad_norm": 1.5316861867904663, "learning_rate": 1.3162256098758321e-05, "loss": 0.0691, "step": 138475 }, { "epoch": 2.0397343190821933, "grad_norm": 1.6244877576828003, "learning_rate": 1.3160619726791338e-05, "loss": 0.0635, "step": 138500 }, { "epoch": 2.0401025021722803, "grad_norm": 1.393915057182312, "learning_rate": 1.3158983354824352e-05, "loss": 0.0617, "step": 138525 }, { "epoch": 2.0404706852623673, "grad_norm": 1.1242659091949463, "learning_rate": 1.3157346982857368e-05, "loss": 0.0666, "step": 138550 }, { "epoch": 2.0408388683524543, "grad_norm": 1.4729751348495483, "learning_rate": 1.3155710610890384e-05, "loss": 0.069, "step": 138575 }, { "epoch": 2.0412070514425413, "grad_norm": 1.425998330116272, "learning_rate": 1.31540742389234e-05, "loss": 0.062, "step": 138600 }, { "epoch": 2.0415752345326283, "grad_norm": 1.3222010135650635, "learning_rate": 1.3152437866956415e-05, "loss": 0.0671, "step": 138625 }, { "epoch": 2.0419434176227154, "grad_norm": 1.410779356956482, "learning_rate": 1.315080149498943e-05, "loss": 0.0577, "step": 138650 }, { "epoch": 2.0423116007128024, "grad_norm": 1.0774645805358887, "learning_rate": 1.3149165123022446e-05, "loss": 0.0647, "step": 138675 }, { "epoch": 2.0426797838028894, "grad_norm": 1.0232930183410645, "learning_rate": 1.314752875105546e-05, "loss": 0.0578, "step": 138700 }, { "epoch": 2.0430479668929764, "grad_norm": 1.0589803457260132, "learning_rate": 1.3145892379088476e-05, "loss": 0.0574, "step": 138725 }, { "epoch": 2.0434161499830634, "grad_norm": 1.8932327032089233, "learning_rate": 1.3144256007121492e-05, "loss": 0.0644, "step": 138750 }, { "epoch": 2.0437843330731504, "grad_norm": 1.52449631690979, "learning_rate": 1.3142619635154508e-05, "loss": 0.0665, "step": 138775 }, { "epoch": 2.0441525161632375, "grad_norm": 1.403548002243042, "learning_rate": 1.3140983263187523e-05, "loss": 0.0571, "step": 138800 }, { "epoch": 2.044520699253325, "grad_norm": 1.053322672843933, "learning_rate": 1.3139346891220539e-05, "loss": 0.0611, "step": 138825 }, { "epoch": 2.044888882343412, "grad_norm": 1.0917490720748901, "learning_rate": 1.3137710519253552e-05, "loss": 0.0589, "step": 138850 }, { "epoch": 2.045257065433499, "grad_norm": 1.9904406070709229, "learning_rate": 1.3136074147286568e-05, "loss": 0.074, "step": 138875 }, { "epoch": 2.045625248523586, "grad_norm": 1.4356249570846558, "learning_rate": 1.3134437775319584e-05, "loss": 0.0706, "step": 138900 }, { "epoch": 2.045993431613673, "grad_norm": 1.5951532125473022, "learning_rate": 1.31328014033526e-05, "loss": 0.0692, "step": 138925 }, { "epoch": 2.04636161470376, "grad_norm": 1.652031421661377, "learning_rate": 1.3131165031385615e-05, "loss": 0.07, "step": 138950 }, { "epoch": 2.046729797793847, "grad_norm": 1.2981295585632324, "learning_rate": 1.3129528659418631e-05, "loss": 0.0616, "step": 138975 }, { "epoch": 2.047097980883934, "grad_norm": 1.3023178577423096, "learning_rate": 1.3127892287451647e-05, "loss": 0.0641, "step": 139000 }, { "epoch": 2.047466163974021, "grad_norm": 1.6468833684921265, "learning_rate": 1.3126255915484663e-05, "loss": 0.0597, "step": 139025 }, { "epoch": 2.047834347064108, "grad_norm": 0.7615518569946289, "learning_rate": 1.3124619543517678e-05, "loss": 0.0643, "step": 139050 }, { "epoch": 2.048202530154195, "grad_norm": 1.8502449989318848, "learning_rate": 1.3122983171550692e-05, "loss": 0.0587, "step": 139075 }, { "epoch": 2.048570713244282, "grad_norm": 1.102052092552185, "learning_rate": 1.3121346799583707e-05, "loss": 0.0599, "step": 139100 }, { "epoch": 2.048938896334369, "grad_norm": 1.6666117906570435, "learning_rate": 1.3119710427616723e-05, "loss": 0.0627, "step": 139125 }, { "epoch": 2.049307079424456, "grad_norm": 0.9726002812385559, "learning_rate": 1.3118074055649739e-05, "loss": 0.0576, "step": 139150 }, { "epoch": 2.049675262514543, "grad_norm": 1.3315876722335815, "learning_rate": 1.3116437683682755e-05, "loss": 0.0685, "step": 139175 }, { "epoch": 2.05004344560463, "grad_norm": 1.5628360509872437, "learning_rate": 1.311480131171577e-05, "loss": 0.062, "step": 139200 }, { "epoch": 2.050411628694717, "grad_norm": 1.05496346950531, "learning_rate": 1.3113164939748786e-05, "loss": 0.0645, "step": 139225 }, { "epoch": 2.050779811784804, "grad_norm": 1.903501272201538, "learning_rate": 1.3111528567781802e-05, "loss": 0.0617, "step": 139250 }, { "epoch": 2.051147994874891, "grad_norm": 1.390907883644104, "learning_rate": 1.3109892195814815e-05, "loss": 0.0646, "step": 139275 }, { "epoch": 2.051516177964978, "grad_norm": 0.9083699584007263, "learning_rate": 1.310825582384783e-05, "loss": 0.0629, "step": 139300 }, { "epoch": 2.0518843610550657, "grad_norm": 0.9529362916946411, "learning_rate": 1.3106619451880847e-05, "loss": 0.0576, "step": 139325 }, { "epoch": 2.0522525441451527, "grad_norm": 1.516104817390442, "learning_rate": 1.3104983079913863e-05, "loss": 0.0621, "step": 139350 }, { "epoch": 2.0526207272352397, "grad_norm": 1.1514497995376587, "learning_rate": 1.3103346707946877e-05, "loss": 0.0666, "step": 139375 }, { "epoch": 2.0529889103253267, "grad_norm": 1.3026103973388672, "learning_rate": 1.3101710335979894e-05, "loss": 0.0578, "step": 139400 }, { "epoch": 2.0533570934154137, "grad_norm": 1.6160250902175903, "learning_rate": 1.310007396401291e-05, "loss": 0.0665, "step": 139425 }, { "epoch": 2.0537252765055007, "grad_norm": 1.8178597688674927, "learning_rate": 1.3098437592045924e-05, "loss": 0.0696, "step": 139450 }, { "epoch": 2.0540934595955878, "grad_norm": 1.6499983072280884, "learning_rate": 1.309680122007894e-05, "loss": 0.0626, "step": 139475 }, { "epoch": 2.0544616426856748, "grad_norm": 1.3892712593078613, "learning_rate": 1.3095164848111955e-05, "loss": 0.0597, "step": 139500 }, { "epoch": 2.054829825775762, "grad_norm": 1.1013695001602173, "learning_rate": 1.309352847614497e-05, "loss": 0.0659, "step": 139525 }, { "epoch": 2.055198008865849, "grad_norm": 1.0680475234985352, "learning_rate": 1.3091892104177985e-05, "loss": 0.0615, "step": 139550 }, { "epoch": 2.055566191955936, "grad_norm": 1.365019679069519, "learning_rate": 1.3090255732211002e-05, "loss": 0.0607, "step": 139575 }, { "epoch": 2.055934375046023, "grad_norm": 1.6709725856781006, "learning_rate": 1.3088619360244018e-05, "loss": 0.069, "step": 139600 }, { "epoch": 2.05630255813611, "grad_norm": 1.7192907333374023, "learning_rate": 1.3086982988277032e-05, "loss": 0.0613, "step": 139625 }, { "epoch": 2.056670741226197, "grad_norm": 1.3610130548477173, "learning_rate": 1.3085346616310048e-05, "loss": 0.0677, "step": 139650 }, { "epoch": 2.057038924316284, "grad_norm": 1.378743052482605, "learning_rate": 1.3083710244343065e-05, "loss": 0.0727, "step": 139675 }, { "epoch": 2.057407107406371, "grad_norm": 0.9532351493835449, "learning_rate": 1.3082073872376077e-05, "loss": 0.0643, "step": 139700 }, { "epoch": 2.057775290496458, "grad_norm": 1.232985496520996, "learning_rate": 1.3080437500409093e-05, "loss": 0.0585, "step": 139725 }, { "epoch": 2.058143473586545, "grad_norm": 1.3010550737380981, "learning_rate": 1.307880112844211e-05, "loss": 0.0648, "step": 139750 }, { "epoch": 2.058511656676632, "grad_norm": 1.1119623184204102, "learning_rate": 1.3077164756475124e-05, "loss": 0.0562, "step": 139775 }, { "epoch": 2.0588798397667194, "grad_norm": 1.2800379991531372, "learning_rate": 1.307552838450814e-05, "loss": 0.0633, "step": 139800 }, { "epoch": 2.0592480228568064, "grad_norm": 1.5540469884872437, "learning_rate": 1.3073892012541156e-05, "loss": 0.0644, "step": 139825 }, { "epoch": 2.0596162059468934, "grad_norm": 1.0997648239135742, "learning_rate": 1.3072255640574173e-05, "loss": 0.0539, "step": 139850 }, { "epoch": 2.0599843890369804, "grad_norm": 1.4437756538391113, "learning_rate": 1.3070619268607187e-05, "loss": 0.07, "step": 139875 }, { "epoch": 2.0603525721270675, "grad_norm": 1.899660587310791, "learning_rate": 1.3068982896640203e-05, "loss": 0.0638, "step": 139900 }, { "epoch": 2.0607207552171545, "grad_norm": 1.098617672920227, "learning_rate": 1.3067346524673218e-05, "loss": 0.0655, "step": 139925 }, { "epoch": 2.0610889383072415, "grad_norm": 0.8406283259391785, "learning_rate": 1.3065710152706232e-05, "loss": 0.0675, "step": 139950 }, { "epoch": 2.0614571213973285, "grad_norm": 1.1740442514419556, "learning_rate": 1.3064073780739248e-05, "loss": 0.0562, "step": 139975 }, { "epoch": 2.0618253044874155, "grad_norm": 1.5432026386260986, "learning_rate": 1.3062437408772264e-05, "loss": 0.0563, "step": 140000 }, { "epoch": 2.0621934875775025, "grad_norm": 1.668741226196289, "learning_rate": 1.3060801036805279e-05, "loss": 0.0577, "step": 140025 }, { "epoch": 2.0625616706675896, "grad_norm": 1.3596270084381104, "learning_rate": 1.3059164664838295e-05, "loss": 0.0678, "step": 140050 }, { "epoch": 2.0629298537576766, "grad_norm": 1.7664932012557983, "learning_rate": 1.3057528292871311e-05, "loss": 0.0691, "step": 140075 }, { "epoch": 2.0632980368477636, "grad_norm": 1.3408604860305786, "learning_rate": 1.3055891920904327e-05, "loss": 0.0663, "step": 140100 }, { "epoch": 2.0636662199378506, "grad_norm": 1.409626841545105, "learning_rate": 1.305425554893734e-05, "loss": 0.0614, "step": 140125 }, { "epoch": 2.0640344030279376, "grad_norm": 1.6057690382003784, "learning_rate": 1.3052619176970356e-05, "loss": 0.0658, "step": 140150 }, { "epoch": 2.0644025861180246, "grad_norm": 1.4680137634277344, "learning_rate": 1.3050982805003372e-05, "loss": 0.0528, "step": 140175 }, { "epoch": 2.0647707692081116, "grad_norm": 1.5226285457611084, "learning_rate": 1.3049346433036387e-05, "loss": 0.0723, "step": 140200 }, { "epoch": 2.0651389522981987, "grad_norm": 1.073876142501831, "learning_rate": 1.3047710061069403e-05, "loss": 0.0638, "step": 140225 }, { "epoch": 2.0655071353882857, "grad_norm": 1.3069032430648804, "learning_rate": 1.3046073689102419e-05, "loss": 0.0664, "step": 140250 }, { "epoch": 2.065875318478373, "grad_norm": 1.4336020946502686, "learning_rate": 1.3044502772014113e-05, "loss": 0.0688, "step": 140275 }, { "epoch": 2.06624350156846, "grad_norm": 1.4683928489685059, "learning_rate": 1.3042866400047129e-05, "loss": 0.0638, "step": 140300 }, { "epoch": 2.066611684658547, "grad_norm": 1.3116852045059204, "learning_rate": 1.3041230028080144e-05, "loss": 0.063, "step": 140325 }, { "epoch": 2.066979867748634, "grad_norm": 1.498652696609497, "learning_rate": 1.3039593656113158e-05, "loss": 0.0675, "step": 140350 }, { "epoch": 2.067348050838721, "grad_norm": 1.148743987083435, "learning_rate": 1.3037957284146174e-05, "loss": 0.0615, "step": 140375 }, { "epoch": 2.067716233928808, "grad_norm": 1.1772137880325317, "learning_rate": 1.303632091217919e-05, "loss": 0.0593, "step": 140400 }, { "epoch": 2.0680844170188952, "grad_norm": 1.1782065629959106, "learning_rate": 1.3034684540212206e-05, "loss": 0.0593, "step": 140425 }, { "epoch": 2.0684526001089822, "grad_norm": 1.458560585975647, "learning_rate": 1.3033048168245221e-05, "loss": 0.0655, "step": 140450 }, { "epoch": 2.0688207831990693, "grad_norm": 0.7723190784454346, "learning_rate": 1.3031411796278237e-05, "loss": 0.0535, "step": 140475 }, { "epoch": 2.0691889662891563, "grad_norm": 1.4423205852508545, "learning_rate": 1.3029775424311253e-05, "loss": 0.0636, "step": 140500 }, { "epoch": 2.0695571493792433, "grad_norm": 0.8405426740646362, "learning_rate": 1.302813905234427e-05, "loss": 0.0653, "step": 140525 }, { "epoch": 2.0699253324693303, "grad_norm": 0.9008001685142517, "learning_rate": 1.3026502680377282e-05, "loss": 0.0636, "step": 140550 }, { "epoch": 2.0702935155594173, "grad_norm": 1.235060691833496, "learning_rate": 1.3024866308410298e-05, "loss": 0.0654, "step": 140575 }, { "epoch": 2.0706616986495043, "grad_norm": 1.0082287788391113, "learning_rate": 1.3023229936443313e-05, "loss": 0.0668, "step": 140600 }, { "epoch": 2.0710298817395913, "grad_norm": 1.3555642366409302, "learning_rate": 1.3021593564476329e-05, "loss": 0.0602, "step": 140625 }, { "epoch": 2.0713980648296784, "grad_norm": 1.2186473608016968, "learning_rate": 1.3019957192509345e-05, "loss": 0.061, "step": 140650 }, { "epoch": 2.0717662479197654, "grad_norm": 1.6986041069030762, "learning_rate": 1.3018320820542361e-05, "loss": 0.0568, "step": 140675 }, { "epoch": 2.0721344310098524, "grad_norm": 1.4862006902694702, "learning_rate": 1.3016684448575376e-05, "loss": 0.0643, "step": 140700 }, { "epoch": 2.0725026140999394, "grad_norm": 1.3662189245224, "learning_rate": 1.3015048076608392e-05, "loss": 0.064, "step": 140725 }, { "epoch": 2.072870797190027, "grad_norm": 1.5313546657562256, "learning_rate": 1.3013411704641406e-05, "loss": 0.0605, "step": 140750 }, { "epoch": 2.073238980280114, "grad_norm": 1.3113452196121216, "learning_rate": 1.301177533267442e-05, "loss": 0.0595, "step": 140775 }, { "epoch": 2.073607163370201, "grad_norm": 1.8002442121505737, "learning_rate": 1.3010138960707437e-05, "loss": 0.0626, "step": 140800 }, { "epoch": 2.073975346460288, "grad_norm": 1.6425055265426636, "learning_rate": 1.3008502588740453e-05, "loss": 0.0595, "step": 140825 }, { "epoch": 2.074343529550375, "grad_norm": 1.4305739402770996, "learning_rate": 1.3006866216773467e-05, "loss": 0.0609, "step": 140850 }, { "epoch": 2.074711712640462, "grad_norm": 1.2356306314468384, "learning_rate": 1.3005229844806484e-05, "loss": 0.07, "step": 140875 }, { "epoch": 2.075079895730549, "grad_norm": 1.367408275604248, "learning_rate": 1.30035934728395e-05, "loss": 0.0655, "step": 140900 }, { "epoch": 2.075448078820636, "grad_norm": 1.344552755355835, "learning_rate": 1.3001957100872516e-05, "loss": 0.0676, "step": 140925 }, { "epoch": 2.075816261910723, "grad_norm": 0.9702365398406982, "learning_rate": 1.300032072890553e-05, "loss": 0.0657, "step": 140950 }, { "epoch": 2.07618444500081, "grad_norm": 1.7759288549423218, "learning_rate": 1.2998684356938545e-05, "loss": 0.0649, "step": 140975 }, { "epoch": 2.076552628090897, "grad_norm": 1.168556571006775, "learning_rate": 1.2997047984971561e-05, "loss": 0.0662, "step": 141000 }, { "epoch": 2.076920811180984, "grad_norm": 0.9602246284484863, "learning_rate": 1.2995411613004575e-05, "loss": 0.0664, "step": 141025 }, { "epoch": 2.077288994271071, "grad_norm": 1.1011443138122559, "learning_rate": 1.2993775241037592e-05, "loss": 0.0581, "step": 141050 }, { "epoch": 2.077657177361158, "grad_norm": 1.2254912853240967, "learning_rate": 1.2992138869070608e-05, "loss": 0.0648, "step": 141075 }, { "epoch": 2.078025360451245, "grad_norm": 1.7616828680038452, "learning_rate": 1.2990502497103624e-05, "loss": 0.0611, "step": 141100 }, { "epoch": 2.078393543541332, "grad_norm": 0.9659159183502197, "learning_rate": 1.2988866125136638e-05, "loss": 0.0698, "step": 141125 }, { "epoch": 2.078761726631419, "grad_norm": 1.216340184211731, "learning_rate": 1.2987229753169655e-05, "loss": 0.0611, "step": 141150 }, { "epoch": 2.079129909721506, "grad_norm": 1.3510624170303345, "learning_rate": 1.2985593381202667e-05, "loss": 0.0646, "step": 141175 }, { "epoch": 2.079498092811593, "grad_norm": 1.135532259941101, "learning_rate": 1.2983957009235683e-05, "loss": 0.0589, "step": 141200 }, { "epoch": 2.07986627590168, "grad_norm": 1.712357521057129, "learning_rate": 1.29823206372687e-05, "loss": 0.0684, "step": 141225 }, { "epoch": 2.0802344589917676, "grad_norm": 1.687016248703003, "learning_rate": 1.2980684265301716e-05, "loss": 0.0692, "step": 141250 }, { "epoch": 2.0806026420818546, "grad_norm": 1.6153839826583862, "learning_rate": 1.297904789333473e-05, "loss": 0.0674, "step": 141275 }, { "epoch": 2.0809708251719417, "grad_norm": 1.5420087575912476, "learning_rate": 1.2977411521367746e-05, "loss": 0.0684, "step": 141300 }, { "epoch": 2.0813390082620287, "grad_norm": 1.437110185623169, "learning_rate": 1.2975775149400763e-05, "loss": 0.0558, "step": 141325 }, { "epoch": 2.0817071913521157, "grad_norm": 1.9535467624664307, "learning_rate": 1.2974138777433779e-05, "loss": 0.0594, "step": 141350 }, { "epoch": 2.0820753744422027, "grad_norm": 1.4665862321853638, "learning_rate": 1.2972502405466793e-05, "loss": 0.0544, "step": 141375 }, { "epoch": 2.0824435575322897, "grad_norm": 1.26799476146698, "learning_rate": 1.2970866033499808e-05, "loss": 0.0684, "step": 141400 }, { "epoch": 2.0828117406223767, "grad_norm": 1.584623098373413, "learning_rate": 1.2969229661532822e-05, "loss": 0.0646, "step": 141425 }, { "epoch": 2.0831799237124637, "grad_norm": 1.1859813928604126, "learning_rate": 1.2967593289565838e-05, "loss": 0.0679, "step": 141450 }, { "epoch": 2.0835481068025508, "grad_norm": 1.1526150703430176, "learning_rate": 1.2965956917598854e-05, "loss": 0.0648, "step": 141475 }, { "epoch": 2.0839162898926378, "grad_norm": 1.5065057277679443, "learning_rate": 1.296432054563187e-05, "loss": 0.064, "step": 141500 }, { "epoch": 2.084284472982725, "grad_norm": 1.4624722003936768, "learning_rate": 1.2962684173664885e-05, "loss": 0.0673, "step": 141525 }, { "epoch": 2.084652656072812, "grad_norm": 1.3338077068328857, "learning_rate": 1.2961047801697901e-05, "loss": 0.057, "step": 141550 }, { "epoch": 2.085020839162899, "grad_norm": 1.2845184803009033, "learning_rate": 1.2959411429730917e-05, "loss": 0.0663, "step": 141575 }, { "epoch": 2.085389022252986, "grad_norm": 1.2876492738723755, "learning_rate": 1.295777505776393e-05, "loss": 0.0634, "step": 141600 }, { "epoch": 2.085757205343073, "grad_norm": 1.4167022705078125, "learning_rate": 1.2956138685796946e-05, "loss": 0.0629, "step": 141625 }, { "epoch": 2.08612538843316, "grad_norm": 1.4296507835388184, "learning_rate": 1.2954502313829962e-05, "loss": 0.0631, "step": 141650 }, { "epoch": 2.086493571523247, "grad_norm": 1.6763315200805664, "learning_rate": 1.2952865941862978e-05, "loss": 0.0639, "step": 141675 }, { "epoch": 2.0868617546133343, "grad_norm": 1.530309796333313, "learning_rate": 1.2951229569895993e-05, "loss": 0.0592, "step": 141700 }, { "epoch": 2.0872299377034214, "grad_norm": 0.849713921546936, "learning_rate": 1.2949593197929009e-05, "loss": 0.0664, "step": 141725 }, { "epoch": 2.0875981207935084, "grad_norm": 1.0692070722579956, "learning_rate": 1.2947956825962025e-05, "loss": 0.073, "step": 141750 }, { "epoch": 2.0879663038835954, "grad_norm": 0.98299241065979, "learning_rate": 1.294632045399504e-05, "loss": 0.0624, "step": 141775 }, { "epoch": 2.0883344869736824, "grad_norm": 1.4880268573760986, "learning_rate": 1.2944684082028054e-05, "loss": 0.0604, "step": 141800 }, { "epoch": 2.0887026700637694, "grad_norm": 1.196270227432251, "learning_rate": 1.294304771006107e-05, "loss": 0.0681, "step": 141825 }, { "epoch": 2.0890708531538564, "grad_norm": 1.146098017692566, "learning_rate": 1.2941411338094085e-05, "loss": 0.0634, "step": 141850 }, { "epoch": 2.0894390362439434, "grad_norm": 1.51388680934906, "learning_rate": 1.2939774966127101e-05, "loss": 0.0664, "step": 141875 }, { "epoch": 2.0898072193340305, "grad_norm": 1.2342647314071655, "learning_rate": 1.2938138594160117e-05, "loss": 0.0637, "step": 141900 }, { "epoch": 2.0901754024241175, "grad_norm": 1.394655704498291, "learning_rate": 1.2936502222193133e-05, "loss": 0.0635, "step": 141925 }, { "epoch": 2.0905435855142045, "grad_norm": 1.622645616531372, "learning_rate": 1.2934865850226148e-05, "loss": 0.0642, "step": 141950 }, { "epoch": 2.0909117686042915, "grad_norm": 1.768418550491333, "learning_rate": 1.2933229478259164e-05, "loss": 0.0633, "step": 141975 }, { "epoch": 2.0912799516943785, "grad_norm": 1.3926706314086914, "learning_rate": 1.293159310629218e-05, "loss": 0.0604, "step": 142000 }, { "epoch": 2.0916481347844655, "grad_norm": 1.6714481115341187, "learning_rate": 1.2929956734325193e-05, "loss": 0.0634, "step": 142025 }, { "epoch": 2.0920163178745526, "grad_norm": 1.5839020013809204, "learning_rate": 1.2928320362358209e-05, "loss": 0.0706, "step": 142050 }, { "epoch": 2.0923845009646396, "grad_norm": 1.0985029935836792, "learning_rate": 1.2926683990391225e-05, "loss": 0.0571, "step": 142075 }, { "epoch": 2.0927526840547266, "grad_norm": 1.868914246559143, "learning_rate": 1.292504761842424e-05, "loss": 0.067, "step": 142100 }, { "epoch": 2.0931208671448136, "grad_norm": 1.129977822303772, "learning_rate": 1.2923411246457256e-05, "loss": 0.0612, "step": 142125 }, { "epoch": 2.0934890502349006, "grad_norm": 1.0147048234939575, "learning_rate": 1.2921774874490272e-05, "loss": 0.058, "step": 142150 }, { "epoch": 2.0938572333249876, "grad_norm": 1.3958303928375244, "learning_rate": 1.2920138502523288e-05, "loss": 0.0679, "step": 142175 }, { "epoch": 2.094225416415075, "grad_norm": 1.1584354639053345, "learning_rate": 1.2918502130556302e-05, "loss": 0.0635, "step": 142200 }, { "epoch": 2.094593599505162, "grad_norm": 1.5718311071395874, "learning_rate": 1.2916865758589317e-05, "loss": 0.0684, "step": 142225 }, { "epoch": 2.094961782595249, "grad_norm": 1.5085594654083252, "learning_rate": 1.2915229386622333e-05, "loss": 0.0635, "step": 142250 }, { "epoch": 2.095329965685336, "grad_norm": 1.1011391878128052, "learning_rate": 1.2913593014655347e-05, "loss": 0.0657, "step": 142275 }, { "epoch": 2.095698148775423, "grad_norm": 1.568986177444458, "learning_rate": 1.2911956642688364e-05, "loss": 0.0606, "step": 142300 }, { "epoch": 2.09606633186551, "grad_norm": 1.4383682012557983, "learning_rate": 1.291032027072138e-05, "loss": 0.0571, "step": 142325 }, { "epoch": 2.096434514955597, "grad_norm": 1.510115146636963, "learning_rate": 1.2908683898754394e-05, "loss": 0.0629, "step": 142350 }, { "epoch": 2.096802698045684, "grad_norm": 1.0108559131622314, "learning_rate": 1.290704752678741e-05, "loss": 0.0597, "step": 142375 }, { "epoch": 2.097170881135771, "grad_norm": 1.4723358154296875, "learning_rate": 1.2905411154820427e-05, "loss": 0.0604, "step": 142400 }, { "epoch": 2.0975390642258582, "grad_norm": 1.4640063047409058, "learning_rate": 1.2903774782853443e-05, "loss": 0.0629, "step": 142425 }, { "epoch": 2.0979072473159452, "grad_norm": 1.620958924293518, "learning_rate": 1.2902138410886455e-05, "loss": 0.0619, "step": 142450 }, { "epoch": 2.0982754304060323, "grad_norm": 1.749944806098938, "learning_rate": 1.2900502038919472e-05, "loss": 0.0672, "step": 142475 }, { "epoch": 2.0986436134961193, "grad_norm": 1.1632822751998901, "learning_rate": 1.2898865666952488e-05, "loss": 0.0622, "step": 142500 }, { "epoch": 2.0990117965862063, "grad_norm": 0.9090245962142944, "learning_rate": 1.2897229294985502e-05, "loss": 0.0599, "step": 142525 }, { "epoch": 2.0993799796762933, "grad_norm": 0.9051256775856018, "learning_rate": 1.2895592923018518e-05, "loss": 0.062, "step": 142550 }, { "epoch": 2.0997481627663803, "grad_norm": 0.896317183971405, "learning_rate": 1.2893956551051535e-05, "loss": 0.0732, "step": 142575 }, { "epoch": 2.1001163458564673, "grad_norm": 1.3453454971313477, "learning_rate": 1.2892320179084549e-05, "loss": 0.0658, "step": 142600 }, { "epoch": 2.1004845289465544, "grad_norm": 1.448994517326355, "learning_rate": 1.2890683807117565e-05, "loss": 0.0631, "step": 142625 }, { "epoch": 2.1008527120366414, "grad_norm": 1.9118424654006958, "learning_rate": 1.288904743515058e-05, "loss": 0.0581, "step": 142650 }, { "epoch": 2.101220895126729, "grad_norm": 1.576361060142517, "learning_rate": 1.2887411063183594e-05, "loss": 0.0626, "step": 142675 }, { "epoch": 2.101589078216816, "grad_norm": 1.3878475427627563, "learning_rate": 1.288577469121661e-05, "loss": 0.0625, "step": 142700 }, { "epoch": 2.101957261306903, "grad_norm": 1.9188108444213867, "learning_rate": 1.2884138319249626e-05, "loss": 0.059, "step": 142725 }, { "epoch": 2.10232544439699, "grad_norm": 1.5311640501022339, "learning_rate": 1.2882501947282642e-05, "loss": 0.0696, "step": 142750 }, { "epoch": 2.102693627487077, "grad_norm": 1.0558075904846191, "learning_rate": 1.2880865575315657e-05, "loss": 0.065, "step": 142775 }, { "epoch": 2.103061810577164, "grad_norm": 1.0887057781219482, "learning_rate": 1.2879229203348673e-05, "loss": 0.0595, "step": 142800 }, { "epoch": 2.103429993667251, "grad_norm": 1.369795322418213, "learning_rate": 1.287759283138169e-05, "loss": 0.0623, "step": 142825 }, { "epoch": 2.103798176757338, "grad_norm": 1.280044674873352, "learning_rate": 1.2875956459414705e-05, "loss": 0.0615, "step": 142850 }, { "epoch": 2.104166359847425, "grad_norm": 1.3107632398605347, "learning_rate": 1.2874320087447718e-05, "loss": 0.0603, "step": 142875 }, { "epoch": 2.104534542937512, "grad_norm": 1.8252980709075928, "learning_rate": 1.2872683715480734e-05, "loss": 0.0658, "step": 142900 }, { "epoch": 2.104902726027599, "grad_norm": 1.4435657262802124, "learning_rate": 1.2871047343513749e-05, "loss": 0.0643, "step": 142925 }, { "epoch": 2.105270909117686, "grad_norm": 1.8467187881469727, "learning_rate": 1.2869410971546765e-05, "loss": 0.0571, "step": 142950 }, { "epoch": 2.105639092207773, "grad_norm": 1.1174285411834717, "learning_rate": 1.2867774599579781e-05, "loss": 0.0596, "step": 142975 }, { "epoch": 2.10600727529786, "grad_norm": 0.9031580090522766, "learning_rate": 1.2866138227612797e-05, "loss": 0.0627, "step": 143000 }, { "epoch": 2.106375458387947, "grad_norm": 1.3084290027618408, "learning_rate": 1.2864501855645812e-05, "loss": 0.0583, "step": 143025 }, { "epoch": 2.106743641478034, "grad_norm": 1.6340535879135132, "learning_rate": 1.2862865483678828e-05, "loss": 0.0581, "step": 143050 }, { "epoch": 2.107111824568121, "grad_norm": 1.4646821022033691, "learning_rate": 1.2861229111711842e-05, "loss": 0.0684, "step": 143075 }, { "epoch": 2.107480007658208, "grad_norm": 1.251671314239502, "learning_rate": 1.2859592739744857e-05, "loss": 0.0629, "step": 143100 }, { "epoch": 2.107848190748295, "grad_norm": 1.1170073747634888, "learning_rate": 1.2857956367777873e-05, "loss": 0.0582, "step": 143125 }, { "epoch": 2.1082163738383826, "grad_norm": 1.2538365125656128, "learning_rate": 1.2856319995810889e-05, "loss": 0.0631, "step": 143150 }, { "epoch": 2.1085845569284696, "grad_norm": 1.4987280368804932, "learning_rate": 1.2854683623843904e-05, "loss": 0.0649, "step": 143175 }, { "epoch": 2.1089527400185566, "grad_norm": 1.2383800745010376, "learning_rate": 1.285304725187692e-05, "loss": 0.0639, "step": 143200 }, { "epoch": 2.1093209231086436, "grad_norm": 0.8765453696250916, "learning_rate": 1.2851410879909936e-05, "loss": 0.062, "step": 143225 }, { "epoch": 2.1096891061987306, "grad_norm": 1.73635733127594, "learning_rate": 1.2849774507942952e-05, "loss": 0.0696, "step": 143250 }, { "epoch": 2.1100572892888176, "grad_norm": 1.0011390447616577, "learning_rate": 1.2848138135975965e-05, "loss": 0.0668, "step": 143275 }, { "epoch": 2.1104254723789047, "grad_norm": 1.3832238912582397, "learning_rate": 1.2846501764008981e-05, "loss": 0.0562, "step": 143300 }, { "epoch": 2.1107936554689917, "grad_norm": 1.3932617902755737, "learning_rate": 1.2844865392041997e-05, "loss": 0.0607, "step": 143325 }, { "epoch": 2.1111618385590787, "grad_norm": 1.249355673789978, "learning_rate": 1.2843229020075011e-05, "loss": 0.0603, "step": 143350 }, { "epoch": 2.1115300216491657, "grad_norm": 1.4410150051116943, "learning_rate": 1.2841592648108028e-05, "loss": 0.0645, "step": 143375 }, { "epoch": 2.1118982047392527, "grad_norm": 1.1042981147766113, "learning_rate": 1.2839956276141044e-05, "loss": 0.0566, "step": 143400 }, { "epoch": 2.1122663878293397, "grad_norm": 1.2523338794708252, "learning_rate": 1.283831990417406e-05, "loss": 0.0698, "step": 143425 }, { "epoch": 2.1126345709194267, "grad_norm": 0.9360388517379761, "learning_rate": 1.2836683532207074e-05, "loss": 0.061, "step": 143450 }, { "epoch": 2.1130027540095138, "grad_norm": 1.7112327814102173, "learning_rate": 1.283504716024009e-05, "loss": 0.0603, "step": 143475 }, { "epoch": 2.1133709370996008, "grad_norm": 1.3567159175872803, "learning_rate": 1.2833410788273103e-05, "loss": 0.0626, "step": 143500 }, { "epoch": 2.113739120189688, "grad_norm": 0.863154411315918, "learning_rate": 1.283177441630612e-05, "loss": 0.0591, "step": 143525 }, { "epoch": 2.114107303279775, "grad_norm": 1.3425259590148926, "learning_rate": 1.2830138044339136e-05, "loss": 0.0602, "step": 143550 }, { "epoch": 2.114475486369862, "grad_norm": Infinity, "learning_rate": 1.2828567127250831e-05, "loss": 0.0602, "step": 143575 }, { "epoch": 2.114843669459949, "grad_norm": 1.4608036279678345, "learning_rate": 1.2826930755283846e-05, "loss": 0.0579, "step": 143600 }, { "epoch": 2.1152118525500363, "grad_norm": 1.1670485734939575, "learning_rate": 1.2825294383316862e-05, "loss": 0.064, "step": 143625 }, { "epoch": 2.1155800356401233, "grad_norm": 1.3197723627090454, "learning_rate": 1.2823658011349878e-05, "loss": 0.0537, "step": 143650 }, { "epoch": 2.1159482187302103, "grad_norm": 0.9913374781608582, "learning_rate": 1.2822021639382894e-05, "loss": 0.0617, "step": 143675 }, { "epoch": 2.1163164018202973, "grad_norm": 1.4832210540771484, "learning_rate": 1.2820385267415907e-05, "loss": 0.0607, "step": 143700 }, { "epoch": 2.1166845849103844, "grad_norm": 1.2942887544631958, "learning_rate": 1.2818748895448923e-05, "loss": 0.0627, "step": 143725 }, { "epoch": 2.1170527680004714, "grad_norm": 1.3849095106124878, "learning_rate": 1.2817112523481937e-05, "loss": 0.0566, "step": 143750 }, { "epoch": 2.1174209510905584, "grad_norm": 1.3855611085891724, "learning_rate": 1.2815476151514954e-05, "loss": 0.0586, "step": 143775 }, { "epoch": 2.1177891341806454, "grad_norm": 1.02754545211792, "learning_rate": 1.281383977954797e-05, "loss": 0.0603, "step": 143800 }, { "epoch": 2.1181573172707324, "grad_norm": 1.3677605390548706, "learning_rate": 1.2812203407580986e-05, "loss": 0.0622, "step": 143825 }, { "epoch": 2.1185255003608194, "grad_norm": 1.3518853187561035, "learning_rate": 1.2810567035614e-05, "loss": 0.0648, "step": 143850 }, { "epoch": 2.1188936834509065, "grad_norm": 1.235385537147522, "learning_rate": 1.2808930663647017e-05, "loss": 0.0633, "step": 143875 }, { "epoch": 2.1192618665409935, "grad_norm": 1.441364049911499, "learning_rate": 1.2807294291680033e-05, "loss": 0.0611, "step": 143900 }, { "epoch": 2.1196300496310805, "grad_norm": 1.1982982158660889, "learning_rate": 1.2805657919713045e-05, "loss": 0.0605, "step": 143925 }, { "epoch": 2.1199982327211675, "grad_norm": 1.425902009010315, "learning_rate": 1.2804021547746062e-05, "loss": 0.0586, "step": 143950 }, { "epoch": 2.1203664158112545, "grad_norm": 1.070468544960022, "learning_rate": 1.2802385175779078e-05, "loss": 0.0619, "step": 143975 }, { "epoch": 2.1207345989013415, "grad_norm": 1.27206289768219, "learning_rate": 1.2800748803812094e-05, "loss": 0.0612, "step": 144000 }, { "epoch": 2.1211027819914285, "grad_norm": 1.3330292701721191, "learning_rate": 1.2799112431845108e-05, "loss": 0.0646, "step": 144025 }, { "epoch": 2.1214709650815156, "grad_norm": 1.4780975580215454, "learning_rate": 1.2797476059878125e-05, "loss": 0.0615, "step": 144050 }, { "epoch": 2.1218391481716026, "grad_norm": 1.175407886505127, "learning_rate": 1.279583968791114e-05, "loss": 0.0621, "step": 144075 }, { "epoch": 2.1222073312616896, "grad_norm": 1.5601645708084106, "learning_rate": 1.2794203315944155e-05, "loss": 0.0707, "step": 144100 }, { "epoch": 2.122575514351777, "grad_norm": 1.0417650938034058, "learning_rate": 1.279256694397717e-05, "loss": 0.0578, "step": 144125 }, { "epoch": 2.122943697441864, "grad_norm": 1.3065272569656372, "learning_rate": 1.2790930572010186e-05, "loss": 0.0621, "step": 144150 }, { "epoch": 2.123311880531951, "grad_norm": 1.6906054019927979, "learning_rate": 1.27892942000432e-05, "loss": 0.0677, "step": 144175 }, { "epoch": 2.123680063622038, "grad_norm": 0.9042579531669617, "learning_rate": 1.2787657828076216e-05, "loss": 0.0617, "step": 144200 }, { "epoch": 2.124048246712125, "grad_norm": 0.9982229471206665, "learning_rate": 1.2786021456109232e-05, "loss": 0.058, "step": 144225 }, { "epoch": 2.124416429802212, "grad_norm": 0.9489299058914185, "learning_rate": 1.2784385084142249e-05, "loss": 0.0606, "step": 144250 }, { "epoch": 2.124784612892299, "grad_norm": 1.6039745807647705, "learning_rate": 1.2782748712175263e-05, "loss": 0.0661, "step": 144275 }, { "epoch": 2.125152795982386, "grad_norm": 1.6360671520233154, "learning_rate": 1.278111234020828e-05, "loss": 0.0602, "step": 144300 }, { "epoch": 2.125520979072473, "grad_norm": 1.4091622829437256, "learning_rate": 1.2779475968241295e-05, "loss": 0.0568, "step": 144325 }, { "epoch": 2.12588916216256, "grad_norm": 1.6434584856033325, "learning_rate": 1.2777839596274308e-05, "loss": 0.0647, "step": 144350 }, { "epoch": 2.126257345252647, "grad_norm": 0.9404569864273071, "learning_rate": 1.2776203224307324e-05, "loss": 0.0595, "step": 144375 }, { "epoch": 2.126625528342734, "grad_norm": 1.516420841217041, "learning_rate": 1.277456685234034e-05, "loss": 0.0702, "step": 144400 }, { "epoch": 2.1269937114328212, "grad_norm": 1.3707910776138306, "learning_rate": 1.2772930480373355e-05, "loss": 0.0567, "step": 144425 }, { "epoch": 2.1273618945229082, "grad_norm": 1.2299379110336304, "learning_rate": 1.2771294108406371e-05, "loss": 0.0606, "step": 144450 }, { "epoch": 2.1277300776129953, "grad_norm": 0.957705557346344, "learning_rate": 1.2769657736439387e-05, "loss": 0.0609, "step": 144475 }, { "epoch": 2.1280982607030823, "grad_norm": 1.8743137121200562, "learning_rate": 1.2768021364472403e-05, "loss": 0.0581, "step": 144500 }, { "epoch": 2.1284664437931693, "grad_norm": 1.2197659015655518, "learning_rate": 1.2766384992505418e-05, "loss": 0.0678, "step": 144525 }, { "epoch": 2.1288346268832563, "grad_norm": 1.4386900663375854, "learning_rate": 1.2764748620538432e-05, "loss": 0.0629, "step": 144550 }, { "epoch": 2.1292028099733438, "grad_norm": 1.5029577016830444, "learning_rate": 1.2763112248571447e-05, "loss": 0.0631, "step": 144575 }, { "epoch": 2.129570993063431, "grad_norm": 1.4606939554214478, "learning_rate": 1.2761475876604463e-05, "loss": 0.0641, "step": 144600 }, { "epoch": 2.129939176153518, "grad_norm": 1.5000444650650024, "learning_rate": 1.2759839504637479e-05, "loss": 0.0654, "step": 144625 }, { "epoch": 2.130307359243605, "grad_norm": 1.001732349395752, "learning_rate": 1.2758203132670495e-05, "loss": 0.0565, "step": 144650 }, { "epoch": 2.130675542333692, "grad_norm": 1.2616219520568848, "learning_rate": 1.275656676070351e-05, "loss": 0.0604, "step": 144675 }, { "epoch": 2.131043725423779, "grad_norm": 1.4114136695861816, "learning_rate": 1.2754930388736526e-05, "loss": 0.0602, "step": 144700 }, { "epoch": 2.131411908513866, "grad_norm": 1.4554424285888672, "learning_rate": 1.2753294016769542e-05, "loss": 0.0607, "step": 144725 }, { "epoch": 2.131780091603953, "grad_norm": 1.5529040098190308, "learning_rate": 1.2751657644802555e-05, "loss": 0.0559, "step": 144750 }, { "epoch": 2.13214827469404, "grad_norm": 1.816643476486206, "learning_rate": 1.2750021272835571e-05, "loss": 0.0637, "step": 144775 }, { "epoch": 2.132516457784127, "grad_norm": 1.499562382698059, "learning_rate": 1.2748384900868587e-05, "loss": 0.0595, "step": 144800 }, { "epoch": 2.132884640874214, "grad_norm": 1.5631037950515747, "learning_rate": 1.2746748528901603e-05, "loss": 0.0606, "step": 144825 }, { "epoch": 2.133252823964301, "grad_norm": 1.1813325881958008, "learning_rate": 1.2745112156934618e-05, "loss": 0.0619, "step": 144850 }, { "epoch": 2.133621007054388, "grad_norm": 1.0146390199661255, "learning_rate": 1.2743475784967634e-05, "loss": 0.0617, "step": 144875 }, { "epoch": 2.133989190144475, "grad_norm": 1.4431804418563843, "learning_rate": 1.274183941300065e-05, "loss": 0.0678, "step": 144900 }, { "epoch": 2.134357373234562, "grad_norm": 1.3941853046417236, "learning_rate": 1.2740203041033664e-05, "loss": 0.0624, "step": 144925 }, { "epoch": 2.134725556324649, "grad_norm": 1.2117805480957031, "learning_rate": 1.273856666906668e-05, "loss": 0.0615, "step": 144950 }, { "epoch": 2.135093739414736, "grad_norm": 1.2068432569503784, "learning_rate": 1.2736930297099695e-05, "loss": 0.0627, "step": 144975 }, { "epoch": 2.135461922504823, "grad_norm": 1.3667653799057007, "learning_rate": 1.273529392513271e-05, "loss": 0.0674, "step": 145000 }, { "epoch": 2.13583010559491, "grad_norm": 1.786485195159912, "learning_rate": 1.2733657553165726e-05, "loss": 0.055, "step": 145025 }, { "epoch": 2.136198288684997, "grad_norm": 1.6920074224472046, "learning_rate": 1.2732021181198742e-05, "loss": 0.0671, "step": 145050 }, { "epoch": 2.1365664717750845, "grad_norm": 1.8676539659500122, "learning_rate": 1.2730384809231758e-05, "loss": 0.0652, "step": 145075 }, { "epoch": 2.1369346548651715, "grad_norm": 1.3670507669448853, "learning_rate": 1.2728748437264772e-05, "loss": 0.0608, "step": 145100 }, { "epoch": 2.1373028379552585, "grad_norm": 1.4896103143692017, "learning_rate": 1.2727112065297789e-05, "loss": 0.0599, "step": 145125 }, { "epoch": 2.1376710210453456, "grad_norm": 1.1572341918945312, "learning_rate": 1.2725475693330805e-05, "loss": 0.0598, "step": 145150 }, { "epoch": 2.1380392041354326, "grad_norm": 1.333055019378662, "learning_rate": 1.2723839321363817e-05, "loss": 0.0611, "step": 145175 }, { "epoch": 2.1384073872255196, "grad_norm": 1.245370864868164, "learning_rate": 1.2722202949396834e-05, "loss": 0.0633, "step": 145200 }, { "epoch": 2.1387755703156066, "grad_norm": 1.2429883480072021, "learning_rate": 1.272056657742985e-05, "loss": 0.066, "step": 145225 }, { "epoch": 2.1391437534056936, "grad_norm": 1.1446257829666138, "learning_rate": 1.2718930205462864e-05, "loss": 0.0656, "step": 145250 }, { "epoch": 2.1395119364957806, "grad_norm": 1.1383090019226074, "learning_rate": 1.271729383349588e-05, "loss": 0.0674, "step": 145275 }, { "epoch": 2.1398801195858677, "grad_norm": 1.2570135593414307, "learning_rate": 1.2715657461528897e-05, "loss": 0.063, "step": 145300 }, { "epoch": 2.1402483026759547, "grad_norm": 1.4377793073654175, "learning_rate": 1.2714021089561913e-05, "loss": 0.0596, "step": 145325 }, { "epoch": 2.1406164857660417, "grad_norm": 1.6421830654144287, "learning_rate": 1.2712384717594927e-05, "loss": 0.0625, "step": 145350 }, { "epoch": 2.1409846688561287, "grad_norm": 1.458400011062622, "learning_rate": 1.2710748345627943e-05, "loss": 0.0622, "step": 145375 }, { "epoch": 2.1413528519462157, "grad_norm": 1.1557886600494385, "learning_rate": 1.2709111973660958e-05, "loss": 0.0623, "step": 145400 }, { "epoch": 2.1417210350363027, "grad_norm": 1.8212825059890747, "learning_rate": 1.2707475601693972e-05, "loss": 0.0607, "step": 145425 }, { "epoch": 2.1420892181263897, "grad_norm": 1.9266533851623535, "learning_rate": 1.2705839229726988e-05, "loss": 0.0663, "step": 145450 }, { "epoch": 2.1424574012164768, "grad_norm": 1.2210057973861694, "learning_rate": 1.2704202857760004e-05, "loss": 0.0634, "step": 145475 }, { "epoch": 2.142825584306564, "grad_norm": 1.2419432401657104, "learning_rate": 1.2702566485793019e-05, "loss": 0.0637, "step": 145500 }, { "epoch": 2.1431937673966512, "grad_norm": 1.3512849807739258, "learning_rate": 1.2700930113826035e-05, "loss": 0.0619, "step": 145525 }, { "epoch": 2.1435619504867383, "grad_norm": 1.3220046758651733, "learning_rate": 1.2699293741859051e-05, "loss": 0.0598, "step": 145550 }, { "epoch": 2.1439301335768253, "grad_norm": 1.4884276390075684, "learning_rate": 1.2697657369892067e-05, "loss": 0.0633, "step": 145575 }, { "epoch": 2.1442983166669123, "grad_norm": 1.504096508026123, "learning_rate": 1.269602099792508e-05, "loss": 0.0596, "step": 145600 }, { "epoch": 2.1446664997569993, "grad_norm": 1.3510218858718872, "learning_rate": 1.2694384625958096e-05, "loss": 0.0626, "step": 145625 }, { "epoch": 2.1450346828470863, "grad_norm": 0.9822535514831543, "learning_rate": 1.2692748253991112e-05, "loss": 0.0547, "step": 145650 }, { "epoch": 2.1454028659371733, "grad_norm": 1.5289455652236938, "learning_rate": 1.2691111882024127e-05, "loss": 0.0665, "step": 145675 }, { "epoch": 2.1457710490272603, "grad_norm": 1.4685269594192505, "learning_rate": 1.2689475510057143e-05, "loss": 0.056, "step": 145700 }, { "epoch": 2.1461392321173474, "grad_norm": 1.6888009309768677, "learning_rate": 1.268783913809016e-05, "loss": 0.0641, "step": 145725 }, { "epoch": 2.1465074152074344, "grad_norm": 1.2333053350448608, "learning_rate": 1.2686202766123175e-05, "loss": 0.0586, "step": 145750 }, { "epoch": 2.1468755982975214, "grad_norm": 1.2496823072433472, "learning_rate": 1.268456639415619e-05, "loss": 0.058, "step": 145775 }, { "epoch": 2.1472437813876084, "grad_norm": 1.2895691394805908, "learning_rate": 1.2682930022189206e-05, "loss": 0.0575, "step": 145800 }, { "epoch": 2.1476119644776954, "grad_norm": 1.2195422649383545, "learning_rate": 1.2681293650222219e-05, "loss": 0.0603, "step": 145825 }, { "epoch": 2.1479801475677824, "grad_norm": 1.4415476322174072, "learning_rate": 1.2679657278255235e-05, "loss": 0.0507, "step": 145850 }, { "epoch": 2.1483483306578695, "grad_norm": 1.3907585144042969, "learning_rate": 1.2678020906288251e-05, "loss": 0.0594, "step": 145875 }, { "epoch": 2.1487165137479565, "grad_norm": 0.7849710583686829, "learning_rate": 1.2676384534321267e-05, "loss": 0.0668, "step": 145900 }, { "epoch": 2.1490846968380435, "grad_norm": 0.9230354428291321, "learning_rate": 1.2674748162354282e-05, "loss": 0.065, "step": 145925 }, { "epoch": 2.1494528799281305, "grad_norm": 1.080701470375061, "learning_rate": 1.2673111790387298e-05, "loss": 0.0564, "step": 145950 }, { "epoch": 2.1498210630182175, "grad_norm": 1.5436818599700928, "learning_rate": 1.2671475418420314e-05, "loss": 0.0632, "step": 145975 }, { "epoch": 2.1501892461083045, "grad_norm": 0.9225010275840759, "learning_rate": 1.266983904645333e-05, "loss": 0.0647, "step": 146000 }, { "epoch": 2.1505574291983915, "grad_norm": 1.6171057224273682, "learning_rate": 1.2668202674486343e-05, "loss": 0.062, "step": 146025 }, { "epoch": 2.150925612288479, "grad_norm": 1.8668445348739624, "learning_rate": 1.2666566302519359e-05, "loss": 0.061, "step": 146050 }, { "epoch": 2.151293795378566, "grad_norm": 1.1698726415634155, "learning_rate": 1.2664929930552373e-05, "loss": 0.0636, "step": 146075 }, { "epoch": 2.151661978468653, "grad_norm": 1.7313923835754395, "learning_rate": 1.266329355858539e-05, "loss": 0.0632, "step": 146100 }, { "epoch": 2.15203016155874, "grad_norm": 1.4423819780349731, "learning_rate": 1.2661657186618406e-05, "loss": 0.0607, "step": 146125 }, { "epoch": 2.152398344648827, "grad_norm": 1.4605655670166016, "learning_rate": 1.2660020814651422e-05, "loss": 0.0614, "step": 146150 }, { "epoch": 2.152766527738914, "grad_norm": 1.4070481061935425, "learning_rate": 1.2658384442684436e-05, "loss": 0.0625, "step": 146175 }, { "epoch": 2.153134710829001, "grad_norm": 1.1662936210632324, "learning_rate": 1.2656748070717453e-05, "loss": 0.0532, "step": 146200 }, { "epoch": 2.153502893919088, "grad_norm": 2.1889052391052246, "learning_rate": 1.2655111698750467e-05, "loss": 0.0554, "step": 146225 }, { "epoch": 2.153871077009175, "grad_norm": 1.0931512117385864, "learning_rate": 1.2653475326783481e-05, "loss": 0.0544, "step": 146250 }, { "epoch": 2.154239260099262, "grad_norm": 1.5517659187316895, "learning_rate": 1.2651838954816498e-05, "loss": 0.0653, "step": 146275 }, { "epoch": 2.154607443189349, "grad_norm": 1.179627776145935, "learning_rate": 1.2650202582849514e-05, "loss": 0.0655, "step": 146300 }, { "epoch": 2.154975626279436, "grad_norm": 1.2843074798583984, "learning_rate": 1.264856621088253e-05, "loss": 0.0588, "step": 146325 }, { "epoch": 2.155343809369523, "grad_norm": 1.0195711851119995, "learning_rate": 1.2646929838915544e-05, "loss": 0.0586, "step": 146350 }, { "epoch": 2.15571199245961, "grad_norm": 0.9936036467552185, "learning_rate": 1.264535892182724e-05, "loss": 0.0617, "step": 146375 }, { "epoch": 2.156080175549697, "grad_norm": 1.2481807470321655, "learning_rate": 1.2643722549860256e-05, "loss": 0.0586, "step": 146400 }, { "epoch": 2.1564483586397842, "grad_norm": 1.269595980644226, "learning_rate": 1.264208617789327e-05, "loss": 0.0644, "step": 146425 }, { "epoch": 2.1568165417298713, "grad_norm": 1.208337426185608, "learning_rate": 1.2640449805926285e-05, "loss": 0.0647, "step": 146450 }, { "epoch": 2.1571847248199583, "grad_norm": 1.532379150390625, "learning_rate": 1.2638813433959301e-05, "loss": 0.0624, "step": 146475 }, { "epoch": 2.1575529079100457, "grad_norm": 1.3638733625411987, "learning_rate": 1.2637177061992316e-05, "loss": 0.0658, "step": 146500 }, { "epoch": 2.1579210910001327, "grad_norm": 1.1447440385818481, "learning_rate": 1.2635540690025332e-05, "loss": 0.0587, "step": 146525 }, { "epoch": 2.1582892740902198, "grad_norm": 1.1884541511535645, "learning_rate": 1.2633904318058348e-05, "loss": 0.0631, "step": 146550 }, { "epoch": 2.1586574571803068, "grad_norm": 1.6917191743850708, "learning_rate": 1.2632267946091364e-05, "loss": 0.0628, "step": 146575 }, { "epoch": 2.159025640270394, "grad_norm": 1.0723440647125244, "learning_rate": 1.2630631574124379e-05, "loss": 0.0691, "step": 146600 }, { "epoch": 2.159393823360481, "grad_norm": 1.411756992340088, "learning_rate": 1.2628995202157395e-05, "loss": 0.0688, "step": 146625 }, { "epoch": 2.159762006450568, "grad_norm": 1.2132647037506104, "learning_rate": 1.2627358830190407e-05, "loss": 0.0551, "step": 146650 }, { "epoch": 2.160130189540655, "grad_norm": 1.6361756324768066, "learning_rate": 1.2625722458223424e-05, "loss": 0.068, "step": 146675 }, { "epoch": 2.160498372630742, "grad_norm": 1.1633211374282837, "learning_rate": 1.262408608625644e-05, "loss": 0.064, "step": 146700 }, { "epoch": 2.160866555720829, "grad_norm": 0.9648963809013367, "learning_rate": 1.2622449714289456e-05, "loss": 0.0577, "step": 146725 }, { "epoch": 2.161234738810916, "grad_norm": 1.452364206314087, "learning_rate": 1.262081334232247e-05, "loss": 0.06, "step": 146750 }, { "epoch": 2.161602921901003, "grad_norm": 1.4971153736114502, "learning_rate": 1.2619176970355487e-05, "loss": 0.0698, "step": 146775 }, { "epoch": 2.16197110499109, "grad_norm": 1.226670265197754, "learning_rate": 1.2617540598388503e-05, "loss": 0.0654, "step": 146800 }, { "epoch": 2.162339288081177, "grad_norm": 1.5489416122436523, "learning_rate": 1.2615904226421519e-05, "loss": 0.0622, "step": 146825 }, { "epoch": 2.162707471171264, "grad_norm": 1.175428867340088, "learning_rate": 1.2614267854454533e-05, "loss": 0.0605, "step": 146850 }, { "epoch": 2.163075654261351, "grad_norm": 1.4683444499969482, "learning_rate": 1.2612631482487548e-05, "loss": 0.0672, "step": 146875 }, { "epoch": 2.163443837351438, "grad_norm": 1.3437857627868652, "learning_rate": 1.2610995110520562e-05, "loss": 0.0609, "step": 146900 }, { "epoch": 2.163812020441525, "grad_norm": 1.406295657157898, "learning_rate": 1.2609358738553578e-05, "loss": 0.0565, "step": 146925 }, { "epoch": 2.164180203531612, "grad_norm": 1.2059030532836914, "learning_rate": 1.2607722366586594e-05, "loss": 0.0608, "step": 146950 }, { "epoch": 2.164548386621699, "grad_norm": 1.5873130559921265, "learning_rate": 1.260608599461961e-05, "loss": 0.0653, "step": 146975 }, { "epoch": 2.1649165697117865, "grad_norm": 1.0940394401550293, "learning_rate": 1.2604449622652625e-05, "loss": 0.0591, "step": 147000 }, { "epoch": 2.1652847528018735, "grad_norm": 1.469257116317749, "learning_rate": 1.2602813250685641e-05, "loss": 0.0638, "step": 147025 }, { "epoch": 2.1656529358919605, "grad_norm": 1.5114831924438477, "learning_rate": 1.2601176878718657e-05, "loss": 0.0666, "step": 147050 }, { "epoch": 2.1660211189820475, "grad_norm": 1.5783467292785645, "learning_rate": 1.259954050675167e-05, "loss": 0.0589, "step": 147075 }, { "epoch": 2.1663893020721345, "grad_norm": 1.2109485864639282, "learning_rate": 1.2597904134784686e-05, "loss": 0.063, "step": 147100 }, { "epoch": 2.1667574851622216, "grad_norm": 1.6290923357009888, "learning_rate": 1.2596267762817702e-05, "loss": 0.0618, "step": 147125 }, { "epoch": 2.1671256682523086, "grad_norm": 1.8701249361038208, "learning_rate": 1.2594631390850719e-05, "loss": 0.0622, "step": 147150 }, { "epoch": 2.1674938513423956, "grad_norm": 1.2110137939453125, "learning_rate": 1.2592995018883733e-05, "loss": 0.0533, "step": 147175 }, { "epoch": 2.1678620344324826, "grad_norm": 1.2196629047393799, "learning_rate": 1.259135864691675e-05, "loss": 0.0628, "step": 147200 }, { "epoch": 2.1682302175225696, "grad_norm": 1.3484305143356323, "learning_rate": 1.2589722274949765e-05, "loss": 0.0616, "step": 147225 }, { "epoch": 2.1685984006126566, "grad_norm": 1.7585560083389282, "learning_rate": 1.258808590298278e-05, "loss": 0.0661, "step": 147250 }, { "epoch": 2.1689665837027436, "grad_norm": 0.9657861590385437, "learning_rate": 1.2586449531015796e-05, "loss": 0.059, "step": 147275 }, { "epoch": 2.1693347667928307, "grad_norm": 1.6587380170822144, "learning_rate": 1.258481315904881e-05, "loss": 0.0629, "step": 147300 }, { "epoch": 2.1697029498829177, "grad_norm": 1.3727116584777832, "learning_rate": 1.2583176787081825e-05, "loss": 0.0577, "step": 147325 }, { "epoch": 2.1700711329730047, "grad_norm": 1.3495436906814575, "learning_rate": 1.2581540415114841e-05, "loss": 0.0566, "step": 147350 }, { "epoch": 2.1704393160630917, "grad_norm": 1.3399096727371216, "learning_rate": 1.2579904043147857e-05, "loss": 0.0592, "step": 147375 }, { "epoch": 2.1708074991531787, "grad_norm": 1.6004917621612549, "learning_rate": 1.2578267671180873e-05, "loss": 0.067, "step": 147400 }, { "epoch": 2.1711756822432657, "grad_norm": 1.8596436977386475, "learning_rate": 1.2576631299213888e-05, "loss": 0.0564, "step": 147425 }, { "epoch": 2.171543865333353, "grad_norm": 1.4039843082427979, "learning_rate": 1.2574994927246904e-05, "loss": 0.0603, "step": 147450 }, { "epoch": 2.17191204842344, "grad_norm": 0.8630525469779968, "learning_rate": 1.257335855527992e-05, "loss": 0.0621, "step": 147475 }, { "epoch": 2.1722802315135272, "grad_norm": 1.2812976837158203, "learning_rate": 1.2571722183312933e-05, "loss": 0.0636, "step": 147500 }, { "epoch": 2.1726484146036142, "grad_norm": 1.1953637599945068, "learning_rate": 1.2570085811345949e-05, "loss": 0.0627, "step": 147525 }, { "epoch": 2.1730165976937013, "grad_norm": 1.6726806163787842, "learning_rate": 1.2568449439378965e-05, "loss": 0.0616, "step": 147550 }, { "epoch": 2.1733847807837883, "grad_norm": 1.4168367385864258, "learning_rate": 1.256681306741198e-05, "loss": 0.0627, "step": 147575 }, { "epoch": 2.1737529638738753, "grad_norm": 1.8928165435791016, "learning_rate": 1.2565176695444996e-05, "loss": 0.0625, "step": 147600 }, { "epoch": 2.1741211469639623, "grad_norm": 1.7232677936553955, "learning_rate": 1.2563540323478012e-05, "loss": 0.0614, "step": 147625 }, { "epoch": 2.1744893300540493, "grad_norm": 0.9866328239440918, "learning_rate": 1.2561903951511028e-05, "loss": 0.061, "step": 147650 }, { "epoch": 2.1748575131441363, "grad_norm": 1.1385637521743774, "learning_rate": 1.2560267579544043e-05, "loss": 0.0597, "step": 147675 }, { "epoch": 2.1752256962342233, "grad_norm": 1.1440985202789307, "learning_rate": 1.2558631207577059e-05, "loss": 0.0728, "step": 147700 }, { "epoch": 2.1755938793243104, "grad_norm": 1.4859899282455444, "learning_rate": 1.2556994835610073e-05, "loss": 0.0533, "step": 147725 }, { "epoch": 2.1759620624143974, "grad_norm": 0.6152855753898621, "learning_rate": 1.2555358463643088e-05, "loss": 0.0552, "step": 147750 }, { "epoch": 2.1763302455044844, "grad_norm": 1.1117513179779053, "learning_rate": 1.2553722091676104e-05, "loss": 0.0595, "step": 147775 }, { "epoch": 2.1766984285945714, "grad_norm": 1.3607580661773682, "learning_rate": 1.255208571970912e-05, "loss": 0.0591, "step": 147800 }, { "epoch": 2.1770666116846584, "grad_norm": 1.2065541744232178, "learning_rate": 1.2550449347742134e-05, "loss": 0.0529, "step": 147825 }, { "epoch": 2.1774347947747454, "grad_norm": 1.242772102355957, "learning_rate": 1.254881297577515e-05, "loss": 0.067, "step": 147850 }, { "epoch": 2.1778029778648325, "grad_norm": 1.523486852645874, "learning_rate": 1.2547176603808167e-05, "loss": 0.0628, "step": 147875 }, { "epoch": 2.1781711609549195, "grad_norm": 1.2696503400802612, "learning_rate": 1.2545540231841183e-05, "loss": 0.062, "step": 147900 }, { "epoch": 2.1785393440450065, "grad_norm": 1.4526379108428955, "learning_rate": 1.2543903859874196e-05, "loss": 0.0622, "step": 147925 }, { "epoch": 2.178907527135094, "grad_norm": 1.4397584199905396, "learning_rate": 1.2542267487907212e-05, "loss": 0.0716, "step": 147950 }, { "epoch": 2.179275710225181, "grad_norm": 1.6296885013580322, "learning_rate": 1.2540631115940228e-05, "loss": 0.0656, "step": 147975 }, { "epoch": 2.179643893315268, "grad_norm": 0.8643805384635925, "learning_rate": 1.2538994743973242e-05, "loss": 0.0666, "step": 148000 }, { "epoch": 2.180012076405355, "grad_norm": 1.0706731081008911, "learning_rate": 1.2537358372006259e-05, "loss": 0.0616, "step": 148025 }, { "epoch": 2.180380259495442, "grad_norm": 1.2822765111923218, "learning_rate": 1.2535722000039275e-05, "loss": 0.0686, "step": 148050 }, { "epoch": 2.180748442585529, "grad_norm": 1.0711289644241333, "learning_rate": 1.253408562807229e-05, "loss": 0.0665, "step": 148075 }, { "epoch": 2.181116625675616, "grad_norm": 1.1549954414367676, "learning_rate": 1.2532449256105305e-05, "loss": 0.0662, "step": 148100 }, { "epoch": 2.181484808765703, "grad_norm": 1.2080084085464478, "learning_rate": 1.253081288413832e-05, "loss": 0.0662, "step": 148125 }, { "epoch": 2.18185299185579, "grad_norm": 1.492305040359497, "learning_rate": 1.2529176512171334e-05, "loss": 0.0643, "step": 148150 }, { "epoch": 2.182221174945877, "grad_norm": 1.7838488817214966, "learning_rate": 1.252754014020435e-05, "loss": 0.0648, "step": 148175 }, { "epoch": 2.182589358035964, "grad_norm": 1.157466173171997, "learning_rate": 1.2525903768237366e-05, "loss": 0.0649, "step": 148200 }, { "epoch": 2.182957541126051, "grad_norm": 0.857884407043457, "learning_rate": 1.2524267396270383e-05, "loss": 0.0665, "step": 148225 }, { "epoch": 2.183325724216138, "grad_norm": 1.0878560543060303, "learning_rate": 1.2522631024303397e-05, "loss": 0.0607, "step": 148250 }, { "epoch": 2.183693907306225, "grad_norm": 1.3036624193191528, "learning_rate": 1.2520994652336413e-05, "loss": 0.0635, "step": 148275 }, { "epoch": 2.184062090396312, "grad_norm": 1.7599238157272339, "learning_rate": 1.251935828036943e-05, "loss": 0.0621, "step": 148300 }, { "epoch": 2.184430273486399, "grad_norm": 1.70572829246521, "learning_rate": 1.2517721908402446e-05, "loss": 0.0637, "step": 148325 }, { "epoch": 2.184798456576486, "grad_norm": 1.495478630065918, "learning_rate": 1.2516085536435458e-05, "loss": 0.0621, "step": 148350 }, { "epoch": 2.185166639666573, "grad_norm": 1.394152045249939, "learning_rate": 1.2514449164468474e-05, "loss": 0.0671, "step": 148375 }, { "epoch": 2.1855348227566607, "grad_norm": 1.3999066352844238, "learning_rate": 1.2512812792501489e-05, "loss": 0.0645, "step": 148400 }, { "epoch": 2.1859030058467477, "grad_norm": 1.5172370672225952, "learning_rate": 1.2511176420534505e-05, "loss": 0.0662, "step": 148425 }, { "epoch": 2.1862711889368347, "grad_norm": 1.1504250764846802, "learning_rate": 1.2509540048567521e-05, "loss": 0.0614, "step": 148450 }, { "epoch": 2.1866393720269217, "grad_norm": 1.031248688697815, "learning_rate": 1.2507903676600537e-05, "loss": 0.0672, "step": 148475 }, { "epoch": 2.1870075551170087, "grad_norm": 1.552599549293518, "learning_rate": 1.2506267304633552e-05, "loss": 0.0581, "step": 148500 }, { "epoch": 2.1873757382070957, "grad_norm": 1.2594009637832642, "learning_rate": 1.2504630932666568e-05, "loss": 0.0646, "step": 148525 }, { "epoch": 2.1877439212971828, "grad_norm": 1.7048428058624268, "learning_rate": 1.2502994560699582e-05, "loss": 0.0673, "step": 148550 }, { "epoch": 2.1881121043872698, "grad_norm": 1.3809187412261963, "learning_rate": 1.2501358188732597e-05, "loss": 0.0596, "step": 148575 }, { "epoch": 2.188480287477357, "grad_norm": 1.5433809757232666, "learning_rate": 1.2499721816765613e-05, "loss": 0.0628, "step": 148600 }, { "epoch": 2.188848470567444, "grad_norm": 1.3568446636199951, "learning_rate": 1.249808544479863e-05, "loss": 0.0603, "step": 148625 }, { "epoch": 2.189216653657531, "grad_norm": 1.197313666343689, "learning_rate": 1.2496449072831645e-05, "loss": 0.0582, "step": 148650 }, { "epoch": 2.189584836747618, "grad_norm": 1.5377166271209717, "learning_rate": 1.249481270086466e-05, "loss": 0.0634, "step": 148675 }, { "epoch": 2.189953019837705, "grad_norm": 0.996576189994812, "learning_rate": 1.2493176328897676e-05, "loss": 0.0638, "step": 148700 }, { "epoch": 2.190321202927792, "grad_norm": 1.5729095935821533, "learning_rate": 1.2491539956930692e-05, "loss": 0.0635, "step": 148725 }, { "epoch": 2.190689386017879, "grad_norm": 1.2386174201965332, "learning_rate": 1.2489903584963707e-05, "loss": 0.0601, "step": 148750 }, { "epoch": 2.191057569107966, "grad_norm": 1.6151951551437378, "learning_rate": 1.2488267212996721e-05, "loss": 0.0654, "step": 148775 }, { "epoch": 2.191425752198053, "grad_norm": 1.192921757698059, "learning_rate": 1.2486630841029737e-05, "loss": 0.0646, "step": 148800 }, { "epoch": 2.19179393528814, "grad_norm": 1.41513192653656, "learning_rate": 1.2484994469062752e-05, "loss": 0.0665, "step": 148825 }, { "epoch": 2.192162118378227, "grad_norm": 1.1037499904632568, "learning_rate": 1.2483358097095768e-05, "loss": 0.0646, "step": 148850 }, { "epoch": 2.192530301468314, "grad_norm": 1.6659947633743286, "learning_rate": 1.2481721725128784e-05, "loss": 0.061, "step": 148875 }, { "epoch": 2.192898484558401, "grad_norm": 1.3756612539291382, "learning_rate": 1.24800853531618e-05, "loss": 0.0603, "step": 148900 }, { "epoch": 2.1932666676484884, "grad_norm": 1.5419251918792725, "learning_rate": 1.2478448981194815e-05, "loss": 0.0596, "step": 148925 }, { "epoch": 2.1936348507385754, "grad_norm": 1.496291995048523, "learning_rate": 1.247681260922783e-05, "loss": 0.0608, "step": 148950 }, { "epoch": 2.1940030338286625, "grad_norm": 1.7220516204833984, "learning_rate": 1.2475176237260843e-05, "loss": 0.0694, "step": 148975 }, { "epoch": 2.1943712169187495, "grad_norm": 1.1940354108810425, "learning_rate": 1.247353986529386e-05, "loss": 0.0609, "step": 149000 }, { "epoch": 2.1947394000088365, "grad_norm": 1.6296696662902832, "learning_rate": 1.2471903493326876e-05, "loss": 0.0591, "step": 149025 }, { "epoch": 2.1951075830989235, "grad_norm": 1.7357245683670044, "learning_rate": 1.2470267121359892e-05, "loss": 0.0613, "step": 149050 }, { "epoch": 2.1954757661890105, "grad_norm": 1.6018069982528687, "learning_rate": 1.2468630749392906e-05, "loss": 0.0601, "step": 149075 }, { "epoch": 2.1958439492790975, "grad_norm": 1.5958993434906006, "learning_rate": 1.2466994377425923e-05, "loss": 0.065, "step": 149100 }, { "epoch": 2.1962121323691846, "grad_norm": 1.0688061714172363, "learning_rate": 1.2465358005458939e-05, "loss": 0.0582, "step": 149125 }, { "epoch": 2.1965803154592716, "grad_norm": 1.1995221376419067, "learning_rate": 1.2463721633491955e-05, "loss": 0.0645, "step": 149150 }, { "epoch": 2.1969484985493586, "grad_norm": 1.3568192720413208, "learning_rate": 1.246208526152497e-05, "loss": 0.0595, "step": 149175 }, { "epoch": 2.1973166816394456, "grad_norm": 1.1569491624832153, "learning_rate": 1.2460448889557984e-05, "loss": 0.063, "step": 149200 }, { "epoch": 2.1976848647295326, "grad_norm": 1.4922245740890503, "learning_rate": 1.2458812517591e-05, "loss": 0.0671, "step": 149225 }, { "epoch": 2.1980530478196196, "grad_norm": 1.4305675029754639, "learning_rate": 1.2457176145624014e-05, "loss": 0.0633, "step": 149250 }, { "epoch": 2.1984212309097066, "grad_norm": 1.384347677230835, "learning_rate": 1.245553977365703e-05, "loss": 0.0571, "step": 149275 }, { "epoch": 2.1987894139997937, "grad_norm": 1.5995211601257324, "learning_rate": 1.2453903401690047e-05, "loss": 0.0657, "step": 149300 }, { "epoch": 2.1991575970898807, "grad_norm": 1.1055463552474976, "learning_rate": 1.2452267029723061e-05, "loss": 0.0663, "step": 149325 }, { "epoch": 2.1995257801799677, "grad_norm": 1.339615821838379, "learning_rate": 1.2450630657756077e-05, "loss": 0.0598, "step": 149350 }, { "epoch": 2.199893963270055, "grad_norm": 0.9209982752799988, "learning_rate": 1.2448994285789093e-05, "loss": 0.0553, "step": 149375 }, { "epoch": 2.200262146360142, "grad_norm": 0.9548072814941406, "learning_rate": 1.2447357913822106e-05, "loss": 0.0649, "step": 149400 }, { "epoch": 2.200630329450229, "grad_norm": 1.3747440576553345, "learning_rate": 1.2445721541855122e-05, "loss": 0.0544, "step": 149425 }, { "epoch": 2.200998512540316, "grad_norm": 1.2757736444473267, "learning_rate": 1.2444085169888138e-05, "loss": 0.0634, "step": 149450 }, { "epoch": 2.201366695630403, "grad_norm": 1.3781328201293945, "learning_rate": 1.2442448797921155e-05, "loss": 0.0633, "step": 149475 }, { "epoch": 2.2017348787204902, "grad_norm": 1.3538663387298584, "learning_rate": 1.2440812425954169e-05, "loss": 0.065, "step": 149500 }, { "epoch": 2.2021030618105772, "grad_norm": 1.1612451076507568, "learning_rate": 1.2439176053987185e-05, "loss": 0.0641, "step": 149525 }, { "epoch": 2.2024712449006643, "grad_norm": 1.6553928852081299, "learning_rate": 1.2437539682020201e-05, "loss": 0.0643, "step": 149550 }, { "epoch": 2.2028394279907513, "grad_norm": 1.4857532978057861, "learning_rate": 1.2435903310053218e-05, "loss": 0.0601, "step": 149575 }, { "epoch": 2.2032076110808383, "grad_norm": 1.23832106590271, "learning_rate": 1.243426693808623e-05, "loss": 0.0599, "step": 149600 }, { "epoch": 2.2035757941709253, "grad_norm": 1.535791277885437, "learning_rate": 1.2432630566119246e-05, "loss": 0.0617, "step": 149625 }, { "epoch": 2.2039439772610123, "grad_norm": 1.5874391794204712, "learning_rate": 1.2430994194152261e-05, "loss": 0.0652, "step": 149650 }, { "epoch": 2.2043121603510993, "grad_norm": 1.652728796005249, "learning_rate": 1.2429357822185277e-05, "loss": 0.066, "step": 149675 }, { "epoch": 2.2046803434411864, "grad_norm": 1.1046514511108398, "learning_rate": 1.2427721450218293e-05, "loss": 0.0631, "step": 149700 }, { "epoch": 2.2050485265312734, "grad_norm": 1.078290343284607, "learning_rate": 1.242608507825131e-05, "loss": 0.0604, "step": 149725 }, { "epoch": 2.2054167096213604, "grad_norm": 1.287468671798706, "learning_rate": 1.2424448706284324e-05, "loss": 0.0669, "step": 149750 }, { "epoch": 2.2057848927114474, "grad_norm": 1.630241870880127, "learning_rate": 1.242281233431734e-05, "loss": 0.07, "step": 149775 }, { "epoch": 2.2061530758015344, "grad_norm": 1.7726587057113647, "learning_rate": 1.2421175962350356e-05, "loss": 0.061, "step": 149800 }, { "epoch": 2.2065212588916214, "grad_norm": 0.9498474597930908, "learning_rate": 1.2419539590383369e-05, "loss": 0.0532, "step": 149825 }, { "epoch": 2.2068894419817084, "grad_norm": 1.8226207494735718, "learning_rate": 1.2417903218416385e-05, "loss": 0.0644, "step": 149850 }, { "epoch": 2.207257625071796, "grad_norm": 1.028624176979065, "learning_rate": 1.2416266846449401e-05, "loss": 0.0667, "step": 149875 }, { "epoch": 2.207625808161883, "grad_norm": 1.1617398262023926, "learning_rate": 1.2414630474482416e-05, "loss": 0.0567, "step": 149900 }, { "epoch": 2.20799399125197, "grad_norm": 0.8688008189201355, "learning_rate": 1.2412994102515432e-05, "loss": 0.0622, "step": 149925 }, { "epoch": 2.208362174342057, "grad_norm": 0.8188631534576416, "learning_rate": 1.2411357730548448e-05, "loss": 0.0598, "step": 149950 }, { "epoch": 2.208730357432144, "grad_norm": 0.9632925391197205, "learning_rate": 1.2409721358581464e-05, "loss": 0.0625, "step": 149975 }, { "epoch": 2.209098540522231, "grad_norm": 1.6538541316986084, "learning_rate": 1.2408084986614479e-05, "loss": 0.0526, "step": 150000 }, { "epoch": 2.209466723612318, "grad_norm": 1.8903040885925293, "learning_rate": 1.2406448614647493e-05, "loss": 0.0631, "step": 150025 }, { "epoch": 2.209834906702405, "grad_norm": 1.2473844289779663, "learning_rate": 1.240481224268051e-05, "loss": 0.0646, "step": 150050 }, { "epoch": 2.210203089792492, "grad_norm": 1.612788200378418, "learning_rate": 1.2403175870713524e-05, "loss": 0.0646, "step": 150075 }, { "epoch": 2.210571272882579, "grad_norm": 1.7383086681365967, "learning_rate": 1.240153949874654e-05, "loss": 0.0675, "step": 150100 }, { "epoch": 2.210939455972666, "grad_norm": 1.6361398696899414, "learning_rate": 1.2399903126779556e-05, "loss": 0.068, "step": 150125 }, { "epoch": 2.211307639062753, "grad_norm": 1.5248494148254395, "learning_rate": 1.239826675481257e-05, "loss": 0.0663, "step": 150150 }, { "epoch": 2.21167582215284, "grad_norm": 0.9361855387687683, "learning_rate": 1.2396630382845587e-05, "loss": 0.0566, "step": 150175 }, { "epoch": 2.212044005242927, "grad_norm": 1.2662063837051392, "learning_rate": 1.2394994010878603e-05, "loss": 0.0606, "step": 150200 }, { "epoch": 2.212412188333014, "grad_norm": 1.3367865085601807, "learning_rate": 1.2393357638911619e-05, "loss": 0.0647, "step": 150225 }, { "epoch": 2.212780371423101, "grad_norm": 0.9757417440414429, "learning_rate": 1.2391721266944632e-05, "loss": 0.0543, "step": 150250 }, { "epoch": 2.213148554513188, "grad_norm": 1.5465177297592163, "learning_rate": 1.2390084894977648e-05, "loss": 0.0638, "step": 150275 }, { "epoch": 2.213516737603275, "grad_norm": 1.637668251991272, "learning_rate": 1.2388448523010664e-05, "loss": 0.0589, "step": 150300 }, { "epoch": 2.2138849206933626, "grad_norm": 1.2209950685501099, "learning_rate": 1.2386877605922358e-05, "loss": 0.0575, "step": 150325 }, { "epoch": 2.2142531037834496, "grad_norm": 1.781244158744812, "learning_rate": 1.2385241233955374e-05, "loss": 0.0675, "step": 150350 }, { "epoch": 2.2146212868735367, "grad_norm": 1.7301344871520996, "learning_rate": 1.238360486198839e-05, "loss": 0.0661, "step": 150375 }, { "epoch": 2.2149894699636237, "grad_norm": 1.3790034055709839, "learning_rate": 1.2381968490021406e-05, "loss": 0.0611, "step": 150400 }, { "epoch": 2.2153576530537107, "grad_norm": 1.5492733716964722, "learning_rate": 1.238033211805442e-05, "loss": 0.0658, "step": 150425 }, { "epoch": 2.2157258361437977, "grad_norm": 1.0602071285247803, "learning_rate": 1.2378695746087435e-05, "loss": 0.0636, "step": 150450 }, { "epoch": 2.2160940192338847, "grad_norm": 1.7389768362045288, "learning_rate": 1.237705937412045e-05, "loss": 0.0647, "step": 150475 }, { "epoch": 2.2164622023239717, "grad_norm": 1.8014004230499268, "learning_rate": 1.2375423002153466e-05, "loss": 0.0681, "step": 150500 }, { "epoch": 2.2168303854140587, "grad_norm": 1.4853734970092773, "learning_rate": 1.2373786630186482e-05, "loss": 0.0641, "step": 150525 }, { "epoch": 2.2171985685041458, "grad_norm": 1.3523290157318115, "learning_rate": 1.2372150258219498e-05, "loss": 0.0647, "step": 150550 }, { "epoch": 2.2175667515942328, "grad_norm": 1.4068225622177124, "learning_rate": 1.2370513886252513e-05, "loss": 0.0635, "step": 150575 }, { "epoch": 2.21793493468432, "grad_norm": 1.6109051704406738, "learning_rate": 1.2368877514285529e-05, "loss": 0.0644, "step": 150600 }, { "epoch": 2.218303117774407, "grad_norm": 1.5518189668655396, "learning_rate": 1.2367241142318545e-05, "loss": 0.063, "step": 150625 }, { "epoch": 2.218671300864494, "grad_norm": 1.8075902462005615, "learning_rate": 1.2365604770351561e-05, "loss": 0.064, "step": 150650 }, { "epoch": 2.219039483954581, "grad_norm": 1.2301523685455322, "learning_rate": 1.2363968398384574e-05, "loss": 0.0655, "step": 150675 }, { "epoch": 2.219407667044668, "grad_norm": 1.6648341417312622, "learning_rate": 1.236233202641759e-05, "loss": 0.0653, "step": 150700 }, { "epoch": 2.219775850134755, "grad_norm": 1.4048888683319092, "learning_rate": 1.2360695654450604e-05, "loss": 0.0665, "step": 150725 }, { "epoch": 2.220144033224842, "grad_norm": 1.1980386972427368, "learning_rate": 1.235905928248362e-05, "loss": 0.0635, "step": 150750 }, { "epoch": 2.220512216314929, "grad_norm": 1.6498727798461914, "learning_rate": 1.2357422910516637e-05, "loss": 0.0652, "step": 150775 }, { "epoch": 2.220880399405016, "grad_norm": 1.7174055576324463, "learning_rate": 1.2355786538549653e-05, "loss": 0.0659, "step": 150800 }, { "epoch": 2.221248582495103, "grad_norm": 1.707761287689209, "learning_rate": 1.2354150166582667e-05, "loss": 0.0656, "step": 150825 }, { "epoch": 2.2216167655851904, "grad_norm": 1.082489252090454, "learning_rate": 1.2352513794615683e-05, "loss": 0.0508, "step": 150850 }, { "epoch": 2.2219849486752774, "grad_norm": 1.2071428298950195, "learning_rate": 1.2350877422648698e-05, "loss": 0.062, "step": 150875 }, { "epoch": 2.2223531317653644, "grad_norm": 1.1847529411315918, "learning_rate": 1.2349241050681712e-05, "loss": 0.0659, "step": 150900 }, { "epoch": 2.2227213148554514, "grad_norm": 1.320918083190918, "learning_rate": 1.2347604678714728e-05, "loss": 0.0652, "step": 150925 }, { "epoch": 2.2230894979455385, "grad_norm": 1.723616361618042, "learning_rate": 1.2345968306747745e-05, "loss": 0.0621, "step": 150950 }, { "epoch": 2.2234576810356255, "grad_norm": 1.40606689453125, "learning_rate": 1.234433193478076e-05, "loss": 0.0639, "step": 150975 }, { "epoch": 2.2238258641257125, "grad_norm": 1.8838179111480713, "learning_rate": 1.2342695562813775e-05, "loss": 0.0627, "step": 151000 }, { "epoch": 2.2241940472157995, "grad_norm": 1.4595658779144287, "learning_rate": 1.2341059190846791e-05, "loss": 0.0657, "step": 151025 }, { "epoch": 2.2245622303058865, "grad_norm": 1.5074530839920044, "learning_rate": 1.2339422818879808e-05, "loss": 0.064, "step": 151050 }, { "epoch": 2.2249304133959735, "grad_norm": 1.2347718477249146, "learning_rate": 1.233778644691282e-05, "loss": 0.0615, "step": 151075 }, { "epoch": 2.2252985964860605, "grad_norm": 1.1741864681243896, "learning_rate": 1.2336150074945836e-05, "loss": 0.0666, "step": 151100 }, { "epoch": 2.2256667795761476, "grad_norm": 1.230065941810608, "learning_rate": 1.2334513702978853e-05, "loss": 0.0655, "step": 151125 }, { "epoch": 2.2260349626662346, "grad_norm": 1.2888094186782837, "learning_rate": 1.2332877331011867e-05, "loss": 0.0628, "step": 151150 }, { "epoch": 2.2264031457563216, "grad_norm": 1.5288974046707153, "learning_rate": 1.2331240959044883e-05, "loss": 0.0627, "step": 151175 }, { "epoch": 2.2267713288464086, "grad_norm": 1.4964913129806519, "learning_rate": 1.23296045870779e-05, "loss": 0.0656, "step": 151200 }, { "epoch": 2.2271395119364956, "grad_norm": 1.3611366748809814, "learning_rate": 1.2327968215110916e-05, "loss": 0.0623, "step": 151225 }, { "epoch": 2.2275076950265826, "grad_norm": 1.4730292558670044, "learning_rate": 1.232633184314393e-05, "loss": 0.0574, "step": 151250 }, { "epoch": 2.2278758781166697, "grad_norm": 1.308506727218628, "learning_rate": 1.2324695471176946e-05, "loss": 0.057, "step": 151275 }, { "epoch": 2.228244061206757, "grad_norm": 1.8183484077453613, "learning_rate": 1.2323059099209959e-05, "loss": 0.0613, "step": 151300 }, { "epoch": 2.228612244296844, "grad_norm": 0.9702703356742859, "learning_rate": 1.2321422727242975e-05, "loss": 0.0628, "step": 151325 }, { "epoch": 2.228980427386931, "grad_norm": 1.2962535619735718, "learning_rate": 1.2319786355275991e-05, "loss": 0.0573, "step": 151350 }, { "epoch": 2.229348610477018, "grad_norm": 0.9257969856262207, "learning_rate": 1.2318149983309007e-05, "loss": 0.0642, "step": 151375 }, { "epoch": 2.229716793567105, "grad_norm": 1.5734418630599976, "learning_rate": 1.2316513611342022e-05, "loss": 0.0607, "step": 151400 }, { "epoch": 2.230084976657192, "grad_norm": 1.3924126625061035, "learning_rate": 1.2314877239375038e-05, "loss": 0.0624, "step": 151425 }, { "epoch": 2.230453159747279, "grad_norm": 1.410024642944336, "learning_rate": 1.2313240867408054e-05, "loss": 0.065, "step": 151450 }, { "epoch": 2.230821342837366, "grad_norm": 1.294050931930542, "learning_rate": 1.231160449544107e-05, "loss": 0.0573, "step": 151475 }, { "epoch": 2.2311895259274532, "grad_norm": 0.7892852425575256, "learning_rate": 1.2309968123474083e-05, "loss": 0.0581, "step": 151500 }, { "epoch": 2.2315577090175402, "grad_norm": 1.2217223644256592, "learning_rate": 1.23083317515071e-05, "loss": 0.0573, "step": 151525 }, { "epoch": 2.2319258921076273, "grad_norm": 1.3074737787246704, "learning_rate": 1.2306695379540115e-05, "loss": 0.0601, "step": 151550 }, { "epoch": 2.2322940751977143, "grad_norm": 1.367594599723816, "learning_rate": 1.230505900757313e-05, "loss": 0.062, "step": 151575 }, { "epoch": 2.2326622582878013, "grad_norm": 0.9579832553863525, "learning_rate": 1.2303422635606146e-05, "loss": 0.0584, "step": 151600 }, { "epoch": 2.2330304413778883, "grad_norm": 1.3861565589904785, "learning_rate": 1.2301786263639162e-05, "loss": 0.0562, "step": 151625 }, { "epoch": 2.2333986244679753, "grad_norm": 1.655278205871582, "learning_rate": 1.2300149891672177e-05, "loss": 0.0644, "step": 151650 }, { "epoch": 2.2337668075580623, "grad_norm": 1.6077373027801514, "learning_rate": 1.2298513519705193e-05, "loss": 0.0639, "step": 151675 }, { "epoch": 2.2341349906481494, "grad_norm": 1.3243197202682495, "learning_rate": 1.2296877147738209e-05, "loss": 0.0613, "step": 151700 }, { "epoch": 2.2345031737382364, "grad_norm": 1.8628376722335815, "learning_rate": 1.2295240775771222e-05, "loss": 0.0715, "step": 151725 }, { "epoch": 2.2348713568283234, "grad_norm": 1.076183557510376, "learning_rate": 1.2293604403804238e-05, "loss": 0.0642, "step": 151750 }, { "epoch": 2.2352395399184104, "grad_norm": 1.0358619689941406, "learning_rate": 1.2291968031837254e-05, "loss": 0.0638, "step": 151775 }, { "epoch": 2.235607723008498, "grad_norm": 1.3663173913955688, "learning_rate": 1.229033165987027e-05, "loss": 0.0629, "step": 151800 }, { "epoch": 2.235975906098585, "grad_norm": 1.3101141452789307, "learning_rate": 1.2288695287903285e-05, "loss": 0.0538, "step": 151825 }, { "epoch": 2.236344089188672, "grad_norm": 1.257340908050537, "learning_rate": 1.22870589159363e-05, "loss": 0.0585, "step": 151850 }, { "epoch": 2.236712272278759, "grad_norm": 1.3124793767929077, "learning_rate": 1.2285422543969317e-05, "loss": 0.057, "step": 151875 }, { "epoch": 2.237080455368846, "grad_norm": 1.228428840637207, "learning_rate": 1.2283786172002331e-05, "loss": 0.0623, "step": 151900 }, { "epoch": 2.237448638458933, "grad_norm": 1.3018091917037964, "learning_rate": 1.2282149800035346e-05, "loss": 0.0522, "step": 151925 }, { "epoch": 2.23781682154902, "grad_norm": 0.9292869567871094, "learning_rate": 1.2280513428068362e-05, "loss": 0.0632, "step": 151950 }, { "epoch": 2.238185004639107, "grad_norm": 1.6583223342895508, "learning_rate": 1.2278877056101376e-05, "loss": 0.061, "step": 151975 }, { "epoch": 2.238553187729194, "grad_norm": 1.4275884628295898, "learning_rate": 1.2277240684134393e-05, "loss": 0.0609, "step": 152000 }, { "epoch": 2.238921370819281, "grad_norm": 1.466074824333191, "learning_rate": 1.2275604312167409e-05, "loss": 0.0615, "step": 152025 }, { "epoch": 2.239289553909368, "grad_norm": 1.985863447189331, "learning_rate": 1.2273967940200425e-05, "loss": 0.0631, "step": 152050 }, { "epoch": 2.239657736999455, "grad_norm": 2.0387206077575684, "learning_rate": 1.227233156823344e-05, "loss": 0.0668, "step": 152075 }, { "epoch": 2.240025920089542, "grad_norm": 1.4524468183517456, "learning_rate": 1.2270695196266455e-05, "loss": 0.0593, "step": 152100 }, { "epoch": 2.240394103179629, "grad_norm": 1.2152190208435059, "learning_rate": 1.2269058824299472e-05, "loss": 0.0629, "step": 152125 }, { "epoch": 2.240762286269716, "grad_norm": 1.3854385614395142, "learning_rate": 1.2267422452332484e-05, "loss": 0.065, "step": 152150 }, { "epoch": 2.241130469359803, "grad_norm": 1.3988215923309326, "learning_rate": 1.22657860803655e-05, "loss": 0.0657, "step": 152175 }, { "epoch": 2.24149865244989, "grad_norm": 1.4388787746429443, "learning_rate": 1.2264149708398517e-05, "loss": 0.0628, "step": 152200 }, { "epoch": 2.241866835539977, "grad_norm": 1.0519180297851562, "learning_rate": 1.2262513336431531e-05, "loss": 0.0517, "step": 152225 }, { "epoch": 2.2422350186300646, "grad_norm": 1.5220091342926025, "learning_rate": 1.2260876964464547e-05, "loss": 0.0617, "step": 152250 }, { "epoch": 2.2426032017201516, "grad_norm": 1.2312278747558594, "learning_rate": 1.2259240592497563e-05, "loss": 0.0621, "step": 152275 }, { "epoch": 2.2429713848102386, "grad_norm": 1.4729104042053223, "learning_rate": 1.225760422053058e-05, "loss": 0.058, "step": 152300 }, { "epoch": 2.2433395679003256, "grad_norm": 1.2740864753723145, "learning_rate": 1.2255967848563594e-05, "loss": 0.0637, "step": 152325 }, { "epoch": 2.2437077509904126, "grad_norm": 1.3165504932403564, "learning_rate": 1.2254331476596608e-05, "loss": 0.061, "step": 152350 }, { "epoch": 2.2440759340804997, "grad_norm": 0.680292010307312, "learning_rate": 1.2252695104629625e-05, "loss": 0.0606, "step": 152375 }, { "epoch": 2.2444441171705867, "grad_norm": 1.5281325578689575, "learning_rate": 1.2251058732662639e-05, "loss": 0.0655, "step": 152400 }, { "epoch": 2.2448123002606737, "grad_norm": 0.857114851474762, "learning_rate": 1.2249422360695655e-05, "loss": 0.0532, "step": 152425 }, { "epoch": 2.2451804833507607, "grad_norm": 1.3292726278305054, "learning_rate": 1.224785144360735e-05, "loss": 0.0672, "step": 152450 }, { "epoch": 2.2455486664408477, "grad_norm": 1.6671534776687622, "learning_rate": 1.2246215071640365e-05, "loss": 0.0605, "step": 152475 }, { "epoch": 2.2459168495309347, "grad_norm": 1.8260165452957153, "learning_rate": 1.2244578699673381e-05, "loss": 0.0674, "step": 152500 }, { "epoch": 2.2462850326210217, "grad_norm": 1.723912000656128, "learning_rate": 1.2242942327706398e-05, "loss": 0.0534, "step": 152525 }, { "epoch": 2.2466532157111088, "grad_norm": 1.1101588010787964, "learning_rate": 1.224130595573941e-05, "loss": 0.0578, "step": 152550 }, { "epoch": 2.247021398801196, "grad_norm": 1.4244182109832764, "learning_rate": 1.2239669583772426e-05, "loss": 0.0595, "step": 152575 }, { "epoch": 2.247389581891283, "grad_norm": 1.4533981084823608, "learning_rate": 1.2238033211805443e-05, "loss": 0.0614, "step": 152600 }, { "epoch": 2.24775776498137, "grad_norm": 1.7205830812454224, "learning_rate": 1.2236396839838459e-05, "loss": 0.0659, "step": 152625 }, { "epoch": 2.248125948071457, "grad_norm": 1.5929219722747803, "learning_rate": 1.2234760467871473e-05, "loss": 0.0622, "step": 152650 }, { "epoch": 2.248494131161544, "grad_norm": 1.1005336046218872, "learning_rate": 1.223312409590449e-05, "loss": 0.0578, "step": 152675 }, { "epoch": 2.248862314251631, "grad_norm": 1.2769103050231934, "learning_rate": 1.2231487723937506e-05, "loss": 0.0615, "step": 152700 }, { "epoch": 2.249230497341718, "grad_norm": 1.293278694152832, "learning_rate": 1.2229851351970522e-05, "loss": 0.061, "step": 152725 }, { "epoch": 2.2495986804318053, "grad_norm": 1.5609077215194702, "learning_rate": 1.2228214980003536e-05, "loss": 0.0753, "step": 152750 }, { "epoch": 2.2499668635218923, "grad_norm": 1.4379901885986328, "learning_rate": 1.222657860803655e-05, "loss": 0.0593, "step": 152775 }, { "epoch": 2.2503350466119794, "grad_norm": 1.272094488143921, "learning_rate": 1.2224942236069565e-05, "loss": 0.0686, "step": 152800 }, { "epoch": 2.2507032297020664, "grad_norm": 1.719151258468628, "learning_rate": 1.2223305864102581e-05, "loss": 0.0607, "step": 152825 }, { "epoch": 2.2510714127921534, "grad_norm": 1.6205966472625732, "learning_rate": 1.2221669492135597e-05, "loss": 0.0575, "step": 152850 }, { "epoch": 2.2514395958822404, "grad_norm": 1.1187539100646973, "learning_rate": 1.2220033120168614e-05, "loss": 0.0602, "step": 152875 }, { "epoch": 2.2518077789723274, "grad_norm": 1.6845979690551758, "learning_rate": 1.2218396748201628e-05, "loss": 0.0622, "step": 152900 }, { "epoch": 2.2521759620624144, "grad_norm": 1.2623274326324463, "learning_rate": 1.2216760376234644e-05, "loss": 0.0594, "step": 152925 }, { "epoch": 2.2525441451525015, "grad_norm": 1.4932770729064941, "learning_rate": 1.221512400426766e-05, "loss": 0.0556, "step": 152950 }, { "epoch": 2.2529123282425885, "grad_norm": 1.5640065670013428, "learning_rate": 1.2213487632300673e-05, "loss": 0.0653, "step": 152975 }, { "epoch": 2.2532805113326755, "grad_norm": 1.2047258615493774, "learning_rate": 1.221185126033369e-05, "loss": 0.0613, "step": 153000 }, { "epoch": 2.2536486944227625, "grad_norm": 1.4226864576339722, "learning_rate": 1.2210214888366705e-05, "loss": 0.0624, "step": 153025 }, { "epoch": 2.2540168775128495, "grad_norm": 1.3719414472579956, "learning_rate": 1.220857851639972e-05, "loss": 0.0673, "step": 153050 }, { "epoch": 2.2543850606029365, "grad_norm": 1.4957154989242554, "learning_rate": 1.2206942144432736e-05, "loss": 0.0679, "step": 153075 }, { "epoch": 2.2547532436930235, "grad_norm": 1.5672136545181274, "learning_rate": 1.2205305772465752e-05, "loss": 0.0661, "step": 153100 }, { "epoch": 2.2551214267831106, "grad_norm": 1.0923973321914673, "learning_rate": 1.2203669400498768e-05, "loss": 0.0665, "step": 153125 }, { "epoch": 2.2554896098731976, "grad_norm": 1.3810386657714844, "learning_rate": 1.2202033028531783e-05, "loss": 0.0647, "step": 153150 }, { "epoch": 2.2558577929632846, "grad_norm": 1.309795618057251, "learning_rate": 1.2200396656564799e-05, "loss": 0.0588, "step": 153175 }, { "epoch": 2.256225976053372, "grad_norm": 1.5182901620864868, "learning_rate": 1.2198760284597813e-05, "loss": 0.0611, "step": 153200 }, { "epoch": 2.256594159143459, "grad_norm": 1.2813936471939087, "learning_rate": 1.2197123912630828e-05, "loss": 0.0618, "step": 153225 }, { "epoch": 2.256962342233546, "grad_norm": 1.1370495557785034, "learning_rate": 1.2195487540663844e-05, "loss": 0.0632, "step": 153250 }, { "epoch": 2.257330525323633, "grad_norm": 1.2709547281265259, "learning_rate": 1.219385116869686e-05, "loss": 0.064, "step": 153275 }, { "epoch": 2.25769870841372, "grad_norm": 1.9685618877410889, "learning_rate": 1.2192214796729876e-05, "loss": 0.0681, "step": 153300 }, { "epoch": 2.258066891503807, "grad_norm": 1.3484185934066772, "learning_rate": 1.219057842476289e-05, "loss": 0.058, "step": 153325 }, { "epoch": 2.258435074593894, "grad_norm": 1.2409491539001465, "learning_rate": 1.2188942052795907e-05, "loss": 0.0629, "step": 153350 }, { "epoch": 2.258803257683981, "grad_norm": 1.8287829160690308, "learning_rate": 1.2187305680828923e-05, "loss": 0.0575, "step": 153375 }, { "epoch": 2.259171440774068, "grad_norm": 1.5454610586166382, "learning_rate": 1.2185669308861936e-05, "loss": 0.0636, "step": 153400 }, { "epoch": 2.259539623864155, "grad_norm": 1.5059220790863037, "learning_rate": 1.2184032936894952e-05, "loss": 0.0634, "step": 153425 }, { "epoch": 2.259907806954242, "grad_norm": 1.4627835750579834, "learning_rate": 1.2182396564927968e-05, "loss": 0.0597, "step": 153450 }, { "epoch": 2.260275990044329, "grad_norm": 1.3997286558151245, "learning_rate": 1.2180760192960983e-05, "loss": 0.0601, "step": 153475 }, { "epoch": 2.2606441731344162, "grad_norm": 1.4016356468200684, "learning_rate": 1.2179123820993999e-05, "loss": 0.0603, "step": 153500 }, { "epoch": 2.2610123562245033, "grad_norm": 1.1766282320022583, "learning_rate": 1.2177487449027015e-05, "loss": 0.0594, "step": 153525 }, { "epoch": 2.2613805393145903, "grad_norm": 1.4122215509414673, "learning_rate": 1.2175851077060031e-05, "loss": 0.0626, "step": 153550 }, { "epoch": 2.2617487224046773, "grad_norm": 1.4782017469406128, "learning_rate": 1.2174214705093045e-05, "loss": 0.0567, "step": 153575 }, { "epoch": 2.2621169054947643, "grad_norm": 1.488305687904358, "learning_rate": 1.2172578333126062e-05, "loss": 0.0611, "step": 153600 }, { "epoch": 2.2624850885848513, "grad_norm": 0.9970173239707947, "learning_rate": 1.2170941961159074e-05, "loss": 0.064, "step": 153625 }, { "epoch": 2.2628532716749383, "grad_norm": 0.7006500363349915, "learning_rate": 1.216930558919209e-05, "loss": 0.062, "step": 153650 }, { "epoch": 2.2632214547650253, "grad_norm": 1.6163110733032227, "learning_rate": 1.2167669217225107e-05, "loss": 0.062, "step": 153675 }, { "epoch": 2.2635896378551124, "grad_norm": 1.6576616764068604, "learning_rate": 1.2166032845258123e-05, "loss": 0.0632, "step": 153700 }, { "epoch": 2.2639578209452, "grad_norm": 1.7387374639511108, "learning_rate": 1.2164396473291137e-05, "loss": 0.065, "step": 153725 }, { "epoch": 2.264326004035287, "grad_norm": 1.4592851400375366, "learning_rate": 1.2162760101324153e-05, "loss": 0.0617, "step": 153750 }, { "epoch": 2.264694187125374, "grad_norm": 1.2785307168960571, "learning_rate": 1.216112372935717e-05, "loss": 0.0675, "step": 153775 }, { "epoch": 2.265062370215461, "grad_norm": 1.5926411151885986, "learning_rate": 1.2159487357390186e-05, "loss": 0.0581, "step": 153800 }, { "epoch": 2.265430553305548, "grad_norm": 1.4977892637252808, "learning_rate": 1.2157850985423198e-05, "loss": 0.0556, "step": 153825 }, { "epoch": 2.265798736395635, "grad_norm": 2.1493022441864014, "learning_rate": 1.2156214613456215e-05, "loss": 0.065, "step": 153850 }, { "epoch": 2.266166919485722, "grad_norm": 1.4621464014053345, "learning_rate": 1.2154578241489229e-05, "loss": 0.0607, "step": 153875 }, { "epoch": 2.266535102575809, "grad_norm": 1.6883494853973389, "learning_rate": 1.2152941869522245e-05, "loss": 0.0713, "step": 153900 }, { "epoch": 2.266903285665896, "grad_norm": 1.9697834253311157, "learning_rate": 1.2151305497555261e-05, "loss": 0.0613, "step": 153925 }, { "epoch": 2.267271468755983, "grad_norm": 1.4346647262573242, "learning_rate": 1.2149669125588278e-05, "loss": 0.0599, "step": 153950 }, { "epoch": 2.26763965184607, "grad_norm": 1.4374207258224487, "learning_rate": 1.2148032753621292e-05, "loss": 0.0609, "step": 153975 }, { "epoch": 2.268007834936157, "grad_norm": 1.854790210723877, "learning_rate": 1.2146396381654308e-05, "loss": 0.0643, "step": 154000 }, { "epoch": 2.268376018026244, "grad_norm": 1.3829092979431152, "learning_rate": 1.2144760009687323e-05, "loss": 0.0579, "step": 154025 }, { "epoch": 2.268744201116331, "grad_norm": 1.3426350355148315, "learning_rate": 1.2143123637720337e-05, "loss": 0.0678, "step": 154050 }, { "epoch": 2.269112384206418, "grad_norm": 1.077064871788025, "learning_rate": 1.2141487265753353e-05, "loss": 0.0552, "step": 154075 }, { "epoch": 2.269480567296505, "grad_norm": 1.560431957244873, "learning_rate": 1.213985089378637e-05, "loss": 0.0575, "step": 154100 }, { "epoch": 2.269848750386592, "grad_norm": 1.2780370712280273, "learning_rate": 1.2138214521819386e-05, "loss": 0.0619, "step": 154125 }, { "epoch": 2.2702169334766795, "grad_norm": 1.0501247644424438, "learning_rate": 1.21365781498524e-05, "loss": 0.0562, "step": 154150 }, { "epoch": 2.2705851165667665, "grad_norm": 1.45809006690979, "learning_rate": 1.2134941777885416e-05, "loss": 0.062, "step": 154175 }, { "epoch": 2.2709532996568536, "grad_norm": 1.7347477674484253, "learning_rate": 1.2133305405918432e-05, "loss": 0.0632, "step": 154200 }, { "epoch": 2.2713214827469406, "grad_norm": 1.1819517612457275, "learning_rate": 1.2131669033951447e-05, "loss": 0.0628, "step": 154225 }, { "epoch": 2.2716896658370276, "grad_norm": 1.4354586601257324, "learning_rate": 1.2130032661984461e-05, "loss": 0.0612, "step": 154250 }, { "epoch": 2.2720578489271146, "grad_norm": 1.2080940008163452, "learning_rate": 1.2128396290017477e-05, "loss": 0.0647, "step": 154275 }, { "epoch": 2.2724260320172016, "grad_norm": 1.4879984855651855, "learning_rate": 1.2126759918050492e-05, "loss": 0.0531, "step": 154300 }, { "epoch": 2.2727942151072886, "grad_norm": 1.7485198974609375, "learning_rate": 1.2125123546083508e-05, "loss": 0.0605, "step": 154325 }, { "epoch": 2.2731623981973756, "grad_norm": 1.7189886569976807, "learning_rate": 1.2123487174116524e-05, "loss": 0.0605, "step": 154350 }, { "epoch": 2.2735305812874627, "grad_norm": 1.657499074935913, "learning_rate": 1.212185080214954e-05, "loss": 0.0596, "step": 154375 }, { "epoch": 2.2738987643775497, "grad_norm": 1.442579984664917, "learning_rate": 1.2120214430182555e-05, "loss": 0.0698, "step": 154400 }, { "epoch": 2.2742669474676367, "grad_norm": 1.1019314527511597, "learning_rate": 1.2118578058215571e-05, "loss": 0.0531, "step": 154425 }, { "epoch": 2.2746351305577237, "grad_norm": 1.1446802616119385, "learning_rate": 1.2116941686248584e-05, "loss": 0.0541, "step": 154450 }, { "epoch": 2.2750033136478107, "grad_norm": 1.2468680143356323, "learning_rate": 1.21153053142816e-05, "loss": 0.0628, "step": 154475 }, { "epoch": 2.2753714967378977, "grad_norm": 1.595442771911621, "learning_rate": 1.2113668942314616e-05, "loss": 0.0626, "step": 154500 }, { "epoch": 2.2757396798279848, "grad_norm": 1.4726616144180298, "learning_rate": 1.2112032570347632e-05, "loss": 0.0652, "step": 154525 }, { "epoch": 2.2761078629180718, "grad_norm": 1.5682780742645264, "learning_rate": 1.2110396198380647e-05, "loss": 0.055, "step": 154550 }, { "epoch": 2.276476046008159, "grad_norm": 1.6534937620162964, "learning_rate": 1.2108759826413663e-05, "loss": 0.0743, "step": 154575 }, { "epoch": 2.276844229098246, "grad_norm": 1.19683039188385, "learning_rate": 1.2107123454446679e-05, "loss": 0.0602, "step": 154600 }, { "epoch": 2.277212412188333, "grad_norm": 1.3283883333206177, "learning_rate": 1.2105487082479695e-05, "loss": 0.062, "step": 154625 }, { "epoch": 2.27758059527842, "grad_norm": 1.3543940782546997, "learning_rate": 1.2103916165391389e-05, "loss": 0.0675, "step": 154650 }, { "epoch": 2.277948778368507, "grad_norm": 1.4500526189804077, "learning_rate": 1.2102279793424403e-05, "loss": 0.0584, "step": 154675 }, { "epoch": 2.2783169614585943, "grad_norm": 1.8910281658172607, "learning_rate": 1.210064342145742e-05, "loss": 0.0647, "step": 154700 }, { "epoch": 2.2786851445486813, "grad_norm": 1.2388887405395508, "learning_rate": 1.2099007049490434e-05, "loss": 0.0639, "step": 154725 }, { "epoch": 2.2790533276387683, "grad_norm": 1.852287769317627, "learning_rate": 1.209737067752345e-05, "loss": 0.0618, "step": 154750 }, { "epoch": 2.2794215107288553, "grad_norm": 1.2877410650253296, "learning_rate": 1.2095734305556466e-05, "loss": 0.0622, "step": 154775 }, { "epoch": 2.2797896938189424, "grad_norm": 1.0189391374588013, "learning_rate": 1.209409793358948e-05, "loss": 0.0559, "step": 154800 }, { "epoch": 2.2801578769090294, "grad_norm": 1.432690143585205, "learning_rate": 1.2092461561622497e-05, "loss": 0.055, "step": 154825 }, { "epoch": 2.2805260599991164, "grad_norm": 1.3184486627578735, "learning_rate": 1.2090825189655513e-05, "loss": 0.0616, "step": 154850 }, { "epoch": 2.2808942430892034, "grad_norm": 1.423694372177124, "learning_rate": 1.2089188817688526e-05, "loss": 0.0651, "step": 154875 }, { "epoch": 2.2812624261792904, "grad_norm": 1.3236284255981445, "learning_rate": 1.2087552445721542e-05, "loss": 0.0569, "step": 154900 }, { "epoch": 2.2816306092693774, "grad_norm": 1.3500868082046509, "learning_rate": 1.2085916073754558e-05, "loss": 0.0577, "step": 154925 }, { "epoch": 2.2819987923594645, "grad_norm": 1.4023374319076538, "learning_rate": 1.2084279701787574e-05, "loss": 0.0606, "step": 154950 }, { "epoch": 2.2823669754495515, "grad_norm": 1.2842373847961426, "learning_rate": 1.2082643329820589e-05, "loss": 0.0618, "step": 154975 }, { "epoch": 2.2827351585396385, "grad_norm": 1.4571912288665771, "learning_rate": 1.2081006957853605e-05, "loss": 0.0689, "step": 155000 }, { "epoch": 2.2831033416297255, "grad_norm": 0.9824802875518799, "learning_rate": 1.2079370585886621e-05, "loss": 0.0605, "step": 155025 }, { "epoch": 2.2834715247198125, "grad_norm": 1.2202214002609253, "learning_rate": 1.2077734213919637e-05, "loss": 0.0625, "step": 155050 }, { "epoch": 2.2838397078098995, "grad_norm": 1.3202885389328003, "learning_rate": 1.2076097841952652e-05, "loss": 0.0613, "step": 155075 }, { "epoch": 2.284207890899987, "grad_norm": 1.7728911638259888, "learning_rate": 1.2074461469985666e-05, "loss": 0.0651, "step": 155100 }, { "epoch": 2.284576073990074, "grad_norm": 1.0166760683059692, "learning_rate": 1.207282509801868e-05, "loss": 0.0599, "step": 155125 }, { "epoch": 2.284944257080161, "grad_norm": 1.1950476169586182, "learning_rate": 1.2071188726051697e-05, "loss": 0.0578, "step": 155150 }, { "epoch": 2.285312440170248, "grad_norm": 1.2729220390319824, "learning_rate": 1.2069552354084713e-05, "loss": 0.0606, "step": 155175 }, { "epoch": 2.285680623260335, "grad_norm": 0.963126540184021, "learning_rate": 1.2067915982117729e-05, "loss": 0.0615, "step": 155200 }, { "epoch": 2.286048806350422, "grad_norm": 1.4809589385986328, "learning_rate": 1.2066279610150743e-05, "loss": 0.0575, "step": 155225 }, { "epoch": 2.286416989440509, "grad_norm": 1.026487946510315, "learning_rate": 1.206464323818376e-05, "loss": 0.0563, "step": 155250 }, { "epoch": 2.286785172530596, "grad_norm": 1.2711668014526367, "learning_rate": 1.2063006866216776e-05, "loss": 0.0674, "step": 155275 }, { "epoch": 2.287153355620683, "grad_norm": 1.0773683786392212, "learning_rate": 1.2061370494249788e-05, "loss": 0.053, "step": 155300 }, { "epoch": 2.28752153871077, "grad_norm": 1.8069260120391846, "learning_rate": 1.2059734122282805e-05, "loss": 0.0618, "step": 155325 }, { "epoch": 2.287889721800857, "grad_norm": 1.054559588432312, "learning_rate": 1.205809775031582e-05, "loss": 0.0615, "step": 155350 }, { "epoch": 2.288257904890944, "grad_norm": 1.3881711959838867, "learning_rate": 1.2056461378348835e-05, "loss": 0.0621, "step": 155375 }, { "epoch": 2.288626087981031, "grad_norm": 1.3065428733825684, "learning_rate": 1.2054825006381851e-05, "loss": 0.0596, "step": 155400 }, { "epoch": 2.288994271071118, "grad_norm": 1.3255311250686646, "learning_rate": 1.2053188634414868e-05, "loss": 0.0581, "step": 155425 }, { "epoch": 2.289362454161205, "grad_norm": 1.5815212726593018, "learning_rate": 1.2051552262447884e-05, "loss": 0.0615, "step": 155450 }, { "epoch": 2.2897306372512922, "grad_norm": 1.2780965566635132, "learning_rate": 1.2049915890480898e-05, "loss": 0.0615, "step": 155475 }, { "epoch": 2.2900988203413792, "grad_norm": 1.1252162456512451, "learning_rate": 1.2048279518513914e-05, "loss": 0.0592, "step": 155500 }, { "epoch": 2.2904670034314663, "grad_norm": 1.3305015563964844, "learning_rate": 1.2046643146546929e-05, "loss": 0.0592, "step": 155525 }, { "epoch": 2.2908351865215533, "grad_norm": 1.5482666492462158, "learning_rate": 1.2045006774579943e-05, "loss": 0.0645, "step": 155550 }, { "epoch": 2.2912033696116403, "grad_norm": 1.7357757091522217, "learning_rate": 1.204337040261296e-05, "loss": 0.0651, "step": 155575 }, { "epoch": 2.2915715527017273, "grad_norm": 1.1715476512908936, "learning_rate": 1.2041734030645976e-05, "loss": 0.0628, "step": 155600 }, { "epoch": 2.2919397357918143, "grad_norm": 1.4161438941955566, "learning_rate": 1.204009765867899e-05, "loss": 0.0606, "step": 155625 }, { "epoch": 2.2923079188819018, "grad_norm": 1.7011719942092896, "learning_rate": 1.2038461286712006e-05, "loss": 0.0632, "step": 155650 }, { "epoch": 2.292676101971989, "grad_norm": 1.2964603900909424, "learning_rate": 1.2036824914745022e-05, "loss": 0.0648, "step": 155675 }, { "epoch": 2.293044285062076, "grad_norm": 1.2411048412322998, "learning_rate": 1.2035188542778038e-05, "loss": 0.0674, "step": 155700 }, { "epoch": 2.293412468152163, "grad_norm": 1.055909514427185, "learning_rate": 1.2033552170811051e-05, "loss": 0.0589, "step": 155725 }, { "epoch": 2.29378065124225, "grad_norm": 1.439857006072998, "learning_rate": 1.2031915798844067e-05, "loss": 0.075, "step": 155750 }, { "epoch": 2.294148834332337, "grad_norm": 2.148745536804199, "learning_rate": 1.2030279426877084e-05, "loss": 0.0606, "step": 155775 }, { "epoch": 2.294517017422424, "grad_norm": 1.3419593572616577, "learning_rate": 1.2028643054910098e-05, "loss": 0.0685, "step": 155800 }, { "epoch": 2.294885200512511, "grad_norm": 1.2838846445083618, "learning_rate": 1.2027006682943114e-05, "loss": 0.0606, "step": 155825 }, { "epoch": 2.295253383602598, "grad_norm": 1.3703469038009644, "learning_rate": 1.202537031097613e-05, "loss": 0.0548, "step": 155850 }, { "epoch": 2.295621566692685, "grad_norm": 1.9246338605880737, "learning_rate": 1.2023733939009146e-05, "loss": 0.068, "step": 155875 }, { "epoch": 2.295989749782772, "grad_norm": 1.618470549583435, "learning_rate": 1.2022097567042161e-05, "loss": 0.0687, "step": 155900 }, { "epoch": 2.296357932872859, "grad_norm": 1.2991777658462524, "learning_rate": 1.2020461195075175e-05, "loss": 0.0656, "step": 155925 }, { "epoch": 2.296726115962946, "grad_norm": 2.071720600128174, "learning_rate": 1.201882482310819e-05, "loss": 0.0573, "step": 155950 }, { "epoch": 2.297094299053033, "grad_norm": 0.9859101176261902, "learning_rate": 1.2017188451141206e-05, "loss": 0.0568, "step": 155975 }, { "epoch": 2.29746248214312, "grad_norm": 1.6727458238601685, "learning_rate": 1.2015552079174222e-05, "loss": 0.0618, "step": 156000 }, { "epoch": 2.297830665233207, "grad_norm": 1.6102430820465088, "learning_rate": 1.2013915707207238e-05, "loss": 0.0624, "step": 156025 }, { "epoch": 2.298198848323294, "grad_norm": 1.8287855386734009, "learning_rate": 1.2012279335240253e-05, "loss": 0.0679, "step": 156050 }, { "epoch": 2.2985670314133815, "grad_norm": 1.5875651836395264, "learning_rate": 1.2010642963273269e-05, "loss": 0.0614, "step": 156075 }, { "epoch": 2.2989352145034685, "grad_norm": 1.2037783861160278, "learning_rate": 1.2009006591306285e-05, "loss": 0.0595, "step": 156100 }, { "epoch": 2.2993033975935555, "grad_norm": 1.473384976387024, "learning_rate": 1.2007370219339301e-05, "loss": 0.0602, "step": 156125 }, { "epoch": 2.2996715806836425, "grad_norm": 1.6098628044128418, "learning_rate": 1.2005733847372314e-05, "loss": 0.0604, "step": 156150 }, { "epoch": 2.3000397637737295, "grad_norm": 1.4224539995193481, "learning_rate": 1.200409747540533e-05, "loss": 0.059, "step": 156175 }, { "epoch": 2.3004079468638166, "grad_norm": 1.2859063148498535, "learning_rate": 1.2002461103438345e-05, "loss": 0.0588, "step": 156200 }, { "epoch": 2.3007761299539036, "grad_norm": 1.760091781616211, "learning_rate": 1.200082473147136e-05, "loss": 0.0661, "step": 156225 }, { "epoch": 2.3011443130439906, "grad_norm": 1.1600080728530884, "learning_rate": 1.1999188359504377e-05, "loss": 0.0615, "step": 156250 }, { "epoch": 2.3015124961340776, "grad_norm": 1.7958961725234985, "learning_rate": 1.1997551987537393e-05, "loss": 0.0643, "step": 156275 }, { "epoch": 2.3018806792241646, "grad_norm": 1.1491156816482544, "learning_rate": 1.1995915615570407e-05, "loss": 0.0601, "step": 156300 }, { "epoch": 2.3022488623142516, "grad_norm": 1.23212730884552, "learning_rate": 1.1994279243603424e-05, "loss": 0.0581, "step": 156325 }, { "epoch": 2.3026170454043386, "grad_norm": 1.3952423334121704, "learning_rate": 1.1992642871636438e-05, "loss": 0.0641, "step": 156350 }, { "epoch": 2.3029852284944257, "grad_norm": 1.6260968446731567, "learning_rate": 1.1991006499669453e-05, "loss": 0.0646, "step": 156375 }, { "epoch": 2.3033534115845127, "grad_norm": 0.9654199481010437, "learning_rate": 1.1989370127702469e-05, "loss": 0.0738, "step": 156400 }, { "epoch": 2.3037215946745997, "grad_norm": 0.9737640619277954, "learning_rate": 1.1987733755735485e-05, "loss": 0.0594, "step": 156425 }, { "epoch": 2.3040897777646867, "grad_norm": 1.3626937866210938, "learning_rate": 1.1986097383768501e-05, "loss": 0.0613, "step": 156450 }, { "epoch": 2.3044579608547737, "grad_norm": 0.9062378406524658, "learning_rate": 1.1984461011801515e-05, "loss": 0.0644, "step": 156475 }, { "epoch": 2.3048261439448607, "grad_norm": 0.6740528345108032, "learning_rate": 1.1982824639834532e-05, "loss": 0.0589, "step": 156500 }, { "epoch": 2.3051943270349478, "grad_norm": 1.082948088645935, "learning_rate": 1.1981188267867548e-05, "loss": 0.0622, "step": 156525 }, { "epoch": 2.3055625101250348, "grad_norm": 1.3589931726455688, "learning_rate": 1.1979551895900562e-05, "loss": 0.0543, "step": 156550 }, { "epoch": 2.305930693215122, "grad_norm": 1.412954330444336, "learning_rate": 1.1977915523933577e-05, "loss": 0.0517, "step": 156575 }, { "epoch": 2.3062988763052092, "grad_norm": 1.567301869392395, "learning_rate": 1.1976279151966593e-05, "loss": 0.059, "step": 156600 }, { "epoch": 2.3066670593952963, "grad_norm": 1.3498588800430298, "learning_rate": 1.1974642779999607e-05, "loss": 0.0587, "step": 156625 }, { "epoch": 2.3070352424853833, "grad_norm": 1.5007323026657104, "learning_rate": 1.1973006408032623e-05, "loss": 0.062, "step": 156650 }, { "epoch": 2.3074034255754703, "grad_norm": 1.3714014291763306, "learning_rate": 1.197137003606564e-05, "loss": 0.0606, "step": 156675 }, { "epoch": 2.3077716086655573, "grad_norm": 1.85383939743042, "learning_rate": 1.1969733664098656e-05, "loss": 0.066, "step": 156700 }, { "epoch": 2.3081397917556443, "grad_norm": 1.467696189880371, "learning_rate": 1.196809729213167e-05, "loss": 0.0655, "step": 156725 }, { "epoch": 2.3085079748457313, "grad_norm": 1.5151034593582153, "learning_rate": 1.1966460920164686e-05, "loss": 0.0667, "step": 156750 }, { "epoch": 2.3088761579358184, "grad_norm": 1.274662733078003, "learning_rate": 1.1964824548197699e-05, "loss": 0.06, "step": 156775 }, { "epoch": 2.3092443410259054, "grad_norm": 1.2610223293304443, "learning_rate": 1.1963188176230715e-05, "loss": 0.0609, "step": 156800 }, { "epoch": 2.3096125241159924, "grad_norm": 1.4022438526153564, "learning_rate": 1.1961551804263731e-05, "loss": 0.0571, "step": 156825 }, { "epoch": 2.3099807072060794, "grad_norm": 1.190034031867981, "learning_rate": 1.1959915432296748e-05, "loss": 0.0598, "step": 156850 }, { "epoch": 2.3103488902961664, "grad_norm": 1.7318341732025146, "learning_rate": 1.1958279060329762e-05, "loss": 0.0634, "step": 156875 }, { "epoch": 2.3107170733862534, "grad_norm": 1.3399244546890259, "learning_rate": 1.1956642688362778e-05, "loss": 0.0589, "step": 156900 }, { "epoch": 2.3110852564763404, "grad_norm": 1.327311396598816, "learning_rate": 1.1955006316395794e-05, "loss": 0.0652, "step": 156925 }, { "epoch": 2.3114534395664275, "grad_norm": 1.235003113746643, "learning_rate": 1.195343539930749e-05, "loss": 0.0675, "step": 156950 }, { "epoch": 2.3118216226565145, "grad_norm": 1.0948799848556519, "learning_rate": 1.1951799027340504e-05, "loss": 0.0621, "step": 156975 }, { "epoch": 2.3121898057466015, "grad_norm": 1.446100115776062, "learning_rate": 1.1950162655373519e-05, "loss": 0.0587, "step": 157000 }, { "epoch": 2.312557988836689, "grad_norm": 1.5128823518753052, "learning_rate": 1.1948526283406535e-05, "loss": 0.0602, "step": 157025 }, { "epoch": 2.312926171926776, "grad_norm": 1.706076741218567, "learning_rate": 1.194688991143955e-05, "loss": 0.0578, "step": 157050 }, { "epoch": 2.313294355016863, "grad_norm": 1.1051557064056396, "learning_rate": 1.1945253539472566e-05, "loss": 0.0544, "step": 157075 }, { "epoch": 2.31366253810695, "grad_norm": 1.1977298259735107, "learning_rate": 1.1943617167505582e-05, "loss": 0.0622, "step": 157100 }, { "epoch": 2.314030721197037, "grad_norm": 1.512346625328064, "learning_rate": 1.1941980795538596e-05, "loss": 0.0595, "step": 157125 }, { "epoch": 2.314398904287124, "grad_norm": 1.0016157627105713, "learning_rate": 1.1940344423571612e-05, "loss": 0.0646, "step": 157150 }, { "epoch": 2.314767087377211, "grad_norm": 1.4108219146728516, "learning_rate": 1.1938708051604628e-05, "loss": 0.0576, "step": 157175 }, { "epoch": 2.315135270467298, "grad_norm": 1.4841899871826172, "learning_rate": 1.1937071679637641e-05, "loss": 0.0578, "step": 157200 }, { "epoch": 2.315503453557385, "grad_norm": 1.4161380529403687, "learning_rate": 1.1935435307670657e-05, "loss": 0.063, "step": 157225 }, { "epoch": 2.315871636647472, "grad_norm": 1.1011296510696411, "learning_rate": 1.1933798935703674e-05, "loss": 0.0533, "step": 157250 }, { "epoch": 2.316239819737559, "grad_norm": 1.3405083417892456, "learning_rate": 1.193216256373669e-05, "loss": 0.0645, "step": 157275 }, { "epoch": 2.316608002827646, "grad_norm": 1.6028244495391846, "learning_rate": 1.1930526191769704e-05, "loss": 0.0645, "step": 157300 }, { "epoch": 2.316976185917733, "grad_norm": 1.0115848779678345, "learning_rate": 1.192888981980272e-05, "loss": 0.0536, "step": 157325 }, { "epoch": 2.31734436900782, "grad_norm": 1.0993667840957642, "learning_rate": 1.1927253447835736e-05, "loss": 0.06, "step": 157350 }, { "epoch": 2.317712552097907, "grad_norm": 1.1778128147125244, "learning_rate": 1.1925617075868751e-05, "loss": 0.0643, "step": 157375 }, { "epoch": 2.318080735187994, "grad_norm": 1.7951834201812744, "learning_rate": 1.1923980703901765e-05, "loss": 0.0676, "step": 157400 }, { "epoch": 2.318448918278081, "grad_norm": 1.0738455057144165, "learning_rate": 1.1922344331934781e-05, "loss": 0.0577, "step": 157425 }, { "epoch": 2.318817101368168, "grad_norm": 1.2450037002563477, "learning_rate": 1.1920707959967796e-05, "loss": 0.0617, "step": 157450 }, { "epoch": 2.3191852844582552, "grad_norm": 0.9938094615936279, "learning_rate": 1.1919071588000812e-05, "loss": 0.0565, "step": 157475 }, { "epoch": 2.3195534675483422, "grad_norm": 1.3486922979354858, "learning_rate": 1.1917435216033828e-05, "loss": 0.0554, "step": 157500 }, { "epoch": 2.3199216506384293, "grad_norm": 1.1815550327301025, "learning_rate": 1.1915798844066844e-05, "loss": 0.0592, "step": 157525 }, { "epoch": 2.3202898337285163, "grad_norm": 0.7472695112228394, "learning_rate": 1.1914162472099859e-05, "loss": 0.0625, "step": 157550 }, { "epoch": 2.3206580168186037, "grad_norm": 1.0809193849563599, "learning_rate": 1.1912526100132875e-05, "loss": 0.0566, "step": 157575 }, { "epoch": 2.3210261999086907, "grad_norm": 0.8146666884422302, "learning_rate": 1.1910889728165891e-05, "loss": 0.0531, "step": 157600 }, { "epoch": 2.3213943829987778, "grad_norm": 1.1890565156936646, "learning_rate": 1.1909253356198904e-05, "loss": 0.0612, "step": 157625 }, { "epoch": 2.3217625660888648, "grad_norm": 1.3931154012680054, "learning_rate": 1.190761698423192e-05, "loss": 0.0609, "step": 157650 }, { "epoch": 2.322130749178952, "grad_norm": 1.0479291677474976, "learning_rate": 1.1905980612264936e-05, "loss": 0.0605, "step": 157675 }, { "epoch": 2.322498932269039, "grad_norm": 1.5202665328979492, "learning_rate": 1.190434424029795e-05, "loss": 0.0629, "step": 157700 }, { "epoch": 2.322867115359126, "grad_norm": 1.2968895435333252, "learning_rate": 1.1902707868330967e-05, "loss": 0.0572, "step": 157725 }, { "epoch": 2.323235298449213, "grad_norm": 1.9595389366149902, "learning_rate": 1.1901071496363983e-05, "loss": 0.0663, "step": 157750 }, { "epoch": 2.3236034815393, "grad_norm": 1.2240461111068726, "learning_rate": 1.1899435124396999e-05, "loss": 0.0659, "step": 157775 }, { "epoch": 2.323971664629387, "grad_norm": 1.2727786302566528, "learning_rate": 1.1897798752430014e-05, "loss": 0.0623, "step": 157800 }, { "epoch": 2.324339847719474, "grad_norm": 1.2986425161361694, "learning_rate": 1.1896162380463028e-05, "loss": 0.0651, "step": 157825 }, { "epoch": 2.324708030809561, "grad_norm": 1.1853927373886108, "learning_rate": 1.1894526008496044e-05, "loss": 0.0558, "step": 157850 }, { "epoch": 2.325076213899648, "grad_norm": 1.7799198627471924, "learning_rate": 1.1892889636529059e-05, "loss": 0.0598, "step": 157875 }, { "epoch": 2.325444396989735, "grad_norm": 1.1868650913238525, "learning_rate": 1.1891253264562075e-05, "loss": 0.06, "step": 157900 }, { "epoch": 2.325812580079822, "grad_norm": 1.1879189014434814, "learning_rate": 1.1889616892595091e-05, "loss": 0.0605, "step": 157925 }, { "epoch": 2.326180763169909, "grad_norm": 1.43667733669281, "learning_rate": 1.1887980520628105e-05, "loss": 0.0625, "step": 157950 }, { "epoch": 2.3265489462599964, "grad_norm": 1.117746114730835, "learning_rate": 1.1886344148661122e-05, "loss": 0.0587, "step": 157975 }, { "epoch": 2.3269171293500834, "grad_norm": 1.7764755487442017, "learning_rate": 1.1884707776694138e-05, "loss": 0.0647, "step": 158000 }, { "epoch": 2.3272853124401705, "grad_norm": 1.1765077114105225, "learning_rate": 1.1883071404727154e-05, "loss": 0.0546, "step": 158025 }, { "epoch": 2.3276534955302575, "grad_norm": 1.383188247680664, "learning_rate": 1.1881435032760167e-05, "loss": 0.0584, "step": 158050 }, { "epoch": 2.3280216786203445, "grad_norm": 1.0768449306488037, "learning_rate": 1.1879798660793183e-05, "loss": 0.0577, "step": 158075 }, { "epoch": 2.3283898617104315, "grad_norm": 1.5985080003738403, "learning_rate": 1.1878162288826199e-05, "loss": 0.068, "step": 158100 }, { "epoch": 2.3287580448005185, "grad_norm": 1.1434884071350098, "learning_rate": 1.1876525916859213e-05, "loss": 0.06, "step": 158125 }, { "epoch": 2.3291262278906055, "grad_norm": 1.3976846933364868, "learning_rate": 1.187488954489223e-05, "loss": 0.0613, "step": 158150 }, { "epoch": 2.3294944109806925, "grad_norm": 0.9468764066696167, "learning_rate": 1.1873253172925246e-05, "loss": 0.0592, "step": 158175 }, { "epoch": 2.3298625940707796, "grad_norm": 1.5256427526474, "learning_rate": 1.1871616800958262e-05, "loss": 0.0641, "step": 158200 }, { "epoch": 2.3302307771608666, "grad_norm": 1.3651533126831055, "learning_rate": 1.1869980428991276e-05, "loss": 0.0662, "step": 158225 }, { "epoch": 2.3305989602509536, "grad_norm": 1.3118497133255005, "learning_rate": 1.186834405702429e-05, "loss": 0.0662, "step": 158250 }, { "epoch": 2.3309671433410406, "grad_norm": 1.3593465089797974, "learning_rate": 1.1866707685057305e-05, "loss": 0.0617, "step": 158275 }, { "epoch": 2.3313353264311276, "grad_norm": 1.5234158039093018, "learning_rate": 1.1865071313090321e-05, "loss": 0.0627, "step": 158300 }, { "epoch": 2.3317035095212146, "grad_norm": 1.2156411409378052, "learning_rate": 1.1863434941123338e-05, "loss": 0.0621, "step": 158325 }, { "epoch": 2.3320716926113016, "grad_norm": 1.3550735712051392, "learning_rate": 1.1861798569156354e-05, "loss": 0.0585, "step": 158350 }, { "epoch": 2.3324398757013887, "grad_norm": 1.3472539186477661, "learning_rate": 1.1860162197189368e-05, "loss": 0.0586, "step": 158375 }, { "epoch": 2.3328080587914757, "grad_norm": 1.8611524105072021, "learning_rate": 1.1858525825222384e-05, "loss": 0.0604, "step": 158400 }, { "epoch": 2.3331762418815627, "grad_norm": 1.456943392753601, "learning_rate": 1.18568894532554e-05, "loss": 0.0693, "step": 158425 }, { "epoch": 2.3335444249716497, "grad_norm": 1.9718719720840454, "learning_rate": 1.1855253081288417e-05, "loss": 0.0622, "step": 158450 }, { "epoch": 2.3339126080617367, "grad_norm": 1.6115795373916626, "learning_rate": 1.185361670932143e-05, "loss": 0.0623, "step": 158475 }, { "epoch": 2.3342807911518237, "grad_norm": 1.6292591094970703, "learning_rate": 1.1851980337354446e-05, "loss": 0.0582, "step": 158500 }, { "epoch": 2.334648974241911, "grad_norm": 1.5077576637268066, "learning_rate": 1.185034396538746e-05, "loss": 0.0562, "step": 158525 }, { "epoch": 2.335017157331998, "grad_norm": 1.3961241245269775, "learning_rate": 1.1848707593420476e-05, "loss": 0.0666, "step": 158550 }, { "epoch": 2.3353853404220852, "grad_norm": 1.9080363512039185, "learning_rate": 1.1847071221453492e-05, "loss": 0.0641, "step": 158575 }, { "epoch": 2.3357535235121722, "grad_norm": 1.4271209239959717, "learning_rate": 1.1845434849486508e-05, "loss": 0.0591, "step": 158600 }, { "epoch": 2.3361217066022593, "grad_norm": 1.1924054622650146, "learning_rate": 1.1843798477519523e-05, "loss": 0.0626, "step": 158625 }, { "epoch": 2.3364898896923463, "grad_norm": 1.6412979364395142, "learning_rate": 1.1842162105552539e-05, "loss": 0.0736, "step": 158650 }, { "epoch": 2.3368580727824333, "grad_norm": 1.3793342113494873, "learning_rate": 1.1840525733585553e-05, "loss": 0.0608, "step": 158675 }, { "epoch": 2.3372262558725203, "grad_norm": 1.7255667448043823, "learning_rate": 1.1838889361618568e-05, "loss": 0.0653, "step": 158700 }, { "epoch": 2.3375944389626073, "grad_norm": 1.3862090110778809, "learning_rate": 1.1837252989651584e-05, "loss": 0.0594, "step": 158725 }, { "epoch": 2.3379626220526943, "grad_norm": 1.1461907625198364, "learning_rate": 1.18356166176846e-05, "loss": 0.0604, "step": 158750 }, { "epoch": 2.3383308051427814, "grad_norm": 1.108074426651001, "learning_rate": 1.1833980245717616e-05, "loss": 0.0636, "step": 158775 }, { "epoch": 2.3386989882328684, "grad_norm": 1.3630497455596924, "learning_rate": 1.1832343873750631e-05, "loss": 0.0607, "step": 158800 }, { "epoch": 2.3390671713229554, "grad_norm": 1.5388332605361938, "learning_rate": 1.1830707501783647e-05, "loss": 0.06, "step": 158825 }, { "epoch": 2.3394353544130424, "grad_norm": 0.6975927948951721, "learning_rate": 1.1829071129816663e-05, "loss": 0.0649, "step": 158850 }, { "epoch": 2.3398035375031294, "grad_norm": 1.3464326858520508, "learning_rate": 1.1827434757849676e-05, "loss": 0.0565, "step": 158875 }, { "epoch": 2.3401717205932164, "grad_norm": 1.3845411539077759, "learning_rate": 1.1825798385882692e-05, "loss": 0.0579, "step": 158900 }, { "epoch": 2.3405399036833034, "grad_norm": 1.7086868286132812, "learning_rate": 1.1824162013915708e-05, "loss": 0.0588, "step": 158925 }, { "epoch": 2.340908086773391, "grad_norm": 1.5713566541671753, "learning_rate": 1.1822525641948723e-05, "loss": 0.0571, "step": 158950 }, { "epoch": 2.341276269863478, "grad_norm": 1.0964891910552979, "learning_rate": 1.1820889269981739e-05, "loss": 0.0618, "step": 158975 }, { "epoch": 2.341644452953565, "grad_norm": 1.443109393119812, "learning_rate": 1.1819252898014755e-05, "loss": 0.0568, "step": 159000 }, { "epoch": 2.342012636043652, "grad_norm": 1.610060453414917, "learning_rate": 1.1817616526047771e-05, "loss": 0.0608, "step": 159025 }, { "epoch": 2.342380819133739, "grad_norm": 2.101233959197998, "learning_rate": 1.1815980154080786e-05, "loss": 0.063, "step": 159050 }, { "epoch": 2.342749002223826, "grad_norm": 1.2336227893829346, "learning_rate": 1.1814343782113802e-05, "loss": 0.0589, "step": 159075 }, { "epoch": 2.343117185313913, "grad_norm": 1.50912344455719, "learning_rate": 1.1812707410146815e-05, "loss": 0.0705, "step": 159100 }, { "epoch": 2.343485368404, "grad_norm": 1.507298231124878, "learning_rate": 1.181107103817983e-05, "loss": 0.0624, "step": 159125 }, { "epoch": 2.343853551494087, "grad_norm": 0.8697091341018677, "learning_rate": 1.1809434666212847e-05, "loss": 0.0592, "step": 159150 }, { "epoch": 2.344221734584174, "grad_norm": 0.6300997138023376, "learning_rate": 1.1807798294245863e-05, "loss": 0.0596, "step": 159175 }, { "epoch": 2.344589917674261, "grad_norm": 0.9704003930091858, "learning_rate": 1.1806161922278877e-05, "loss": 0.0517, "step": 159200 }, { "epoch": 2.344958100764348, "grad_norm": 1.3032547235488892, "learning_rate": 1.1804591005190573e-05, "loss": 0.0634, "step": 159225 }, { "epoch": 2.345326283854435, "grad_norm": 1.7722431421279907, "learning_rate": 1.1802954633223589e-05, "loss": 0.0586, "step": 159250 }, { "epoch": 2.345694466944522, "grad_norm": 1.3605221509933472, "learning_rate": 1.1801318261256605e-05, "loss": 0.0634, "step": 159275 }, { "epoch": 2.346062650034609, "grad_norm": 1.5096231698989868, "learning_rate": 1.1799681889289618e-05, "loss": 0.0621, "step": 159300 }, { "epoch": 2.346430833124696, "grad_norm": 1.2020301818847656, "learning_rate": 1.1798045517322634e-05, "loss": 0.06, "step": 159325 }, { "epoch": 2.346799016214783, "grad_norm": 1.5449987649917603, "learning_rate": 1.1796409145355649e-05, "loss": 0.0608, "step": 159350 }, { "epoch": 2.34716719930487, "grad_norm": 1.3683671951293945, "learning_rate": 1.1794772773388665e-05, "loss": 0.0639, "step": 159375 }, { "epoch": 2.347535382394957, "grad_norm": 1.0007554292678833, "learning_rate": 1.1793136401421681e-05, "loss": 0.0586, "step": 159400 }, { "epoch": 2.347903565485044, "grad_norm": 1.7813198566436768, "learning_rate": 1.1791500029454697e-05, "loss": 0.0613, "step": 159425 }, { "epoch": 2.348271748575131, "grad_norm": 1.5094960927963257, "learning_rate": 1.1789863657487712e-05, "loss": 0.059, "step": 159450 }, { "epoch": 2.3486399316652187, "grad_norm": 1.5805097818374634, "learning_rate": 1.1788227285520728e-05, "loss": 0.0659, "step": 159475 }, { "epoch": 2.3490081147553057, "grad_norm": 1.2663806676864624, "learning_rate": 1.1786590913553744e-05, "loss": 0.0547, "step": 159500 }, { "epoch": 2.3493762978453927, "grad_norm": 1.15206778049469, "learning_rate": 1.1784954541586757e-05, "loss": 0.059, "step": 159525 }, { "epoch": 2.3497444809354797, "grad_norm": 1.3665211200714111, "learning_rate": 1.1783318169619773e-05, "loss": 0.0583, "step": 159550 }, { "epoch": 2.3501126640255667, "grad_norm": 0.8548767566680908, "learning_rate": 1.1781681797652789e-05, "loss": 0.0592, "step": 159575 }, { "epoch": 2.3504808471156537, "grad_norm": 1.1451300382614136, "learning_rate": 1.1780045425685805e-05, "loss": 0.0665, "step": 159600 }, { "epoch": 2.3508490302057408, "grad_norm": 1.5223888158798218, "learning_rate": 1.177840905371882e-05, "loss": 0.0622, "step": 159625 }, { "epoch": 2.351217213295828, "grad_norm": 1.9416120052337646, "learning_rate": 1.1776772681751836e-05, "loss": 0.0625, "step": 159650 }, { "epoch": 2.351585396385915, "grad_norm": 1.0877821445465088, "learning_rate": 1.1775136309784852e-05, "loss": 0.0626, "step": 159675 }, { "epoch": 2.351953579476002, "grad_norm": 1.207059383392334, "learning_rate": 1.1773499937817866e-05, "loss": 0.0612, "step": 159700 }, { "epoch": 2.352321762566089, "grad_norm": 1.0630658864974976, "learning_rate": 1.177186356585088e-05, "loss": 0.0602, "step": 159725 }, { "epoch": 2.352689945656176, "grad_norm": 1.6132158041000366, "learning_rate": 1.1770227193883897e-05, "loss": 0.0611, "step": 159750 }, { "epoch": 2.353058128746263, "grad_norm": 0.8465059399604797, "learning_rate": 1.1768590821916911e-05, "loss": 0.0634, "step": 159775 }, { "epoch": 2.35342631183635, "grad_norm": 1.100665807723999, "learning_rate": 1.1766954449949928e-05, "loss": 0.0551, "step": 159800 }, { "epoch": 2.353794494926437, "grad_norm": 1.456581473350525, "learning_rate": 1.1765318077982944e-05, "loss": 0.0585, "step": 159825 }, { "epoch": 2.354162678016524, "grad_norm": 1.5542956590652466, "learning_rate": 1.176368170601596e-05, "loss": 0.0635, "step": 159850 }, { "epoch": 2.354530861106611, "grad_norm": 1.1105351448059082, "learning_rate": 1.1762045334048974e-05, "loss": 0.0601, "step": 159875 }, { "epoch": 2.3548990441966984, "grad_norm": 1.045846939086914, "learning_rate": 1.176040896208199e-05, "loss": 0.0564, "step": 159900 }, { "epoch": 2.3552672272867854, "grad_norm": 1.3951945304870605, "learning_rate": 1.1758772590115007e-05, "loss": 0.0596, "step": 159925 }, { "epoch": 2.3556354103768724, "grad_norm": 0.9000104069709778, "learning_rate": 1.175713621814802e-05, "loss": 0.0494, "step": 159950 }, { "epoch": 2.3560035934669594, "grad_norm": 1.3923025131225586, "learning_rate": 1.1755499846181036e-05, "loss": 0.0725, "step": 159975 }, { "epoch": 2.3563717765570464, "grad_norm": 1.5339983701705933, "learning_rate": 1.1753863474214052e-05, "loss": 0.0619, "step": 160000 }, { "epoch": 2.3567399596471335, "grad_norm": 1.526166558265686, "learning_rate": 1.1752227102247066e-05, "loss": 0.0671, "step": 160025 }, { "epoch": 2.3571081427372205, "grad_norm": 1.3462245464324951, "learning_rate": 1.1750590730280082e-05, "loss": 0.0609, "step": 160050 }, { "epoch": 2.3574763258273075, "grad_norm": 1.4161617755889893, "learning_rate": 1.1748954358313098e-05, "loss": 0.0627, "step": 160075 }, { "epoch": 2.3578445089173945, "grad_norm": 1.2137469053268433, "learning_rate": 1.1747317986346115e-05, "loss": 0.0588, "step": 160100 }, { "epoch": 2.3582126920074815, "grad_norm": 1.8838942050933838, "learning_rate": 1.1745681614379129e-05, "loss": 0.0626, "step": 160125 }, { "epoch": 2.3585808750975685, "grad_norm": 1.1949462890625, "learning_rate": 1.1744045242412143e-05, "loss": 0.0582, "step": 160150 }, { "epoch": 2.3589490581876555, "grad_norm": 1.2934921979904175, "learning_rate": 1.174240887044516e-05, "loss": 0.0571, "step": 160175 }, { "epoch": 2.3593172412777426, "grad_norm": 0.8720579147338867, "learning_rate": 1.1740772498478174e-05, "loss": 0.0563, "step": 160200 }, { "epoch": 2.3596854243678296, "grad_norm": 1.112901210784912, "learning_rate": 1.173913612651119e-05, "loss": 0.0649, "step": 160225 }, { "epoch": 2.3600536074579166, "grad_norm": 1.310280442237854, "learning_rate": 1.1737499754544206e-05, "loss": 0.0492, "step": 160250 }, { "epoch": 2.3604217905480036, "grad_norm": 1.4563299417495728, "learning_rate": 1.1735863382577221e-05, "loss": 0.0647, "step": 160275 }, { "epoch": 2.3607899736380906, "grad_norm": 1.1201913356781006, "learning_rate": 1.1734227010610237e-05, "loss": 0.0592, "step": 160300 }, { "epoch": 2.3611581567281776, "grad_norm": 1.393536925315857, "learning_rate": 1.1732590638643253e-05, "loss": 0.0669, "step": 160325 }, { "epoch": 2.3615263398182647, "grad_norm": 1.3970112800598145, "learning_rate": 1.1730954266676266e-05, "loss": 0.0627, "step": 160350 }, { "epoch": 2.3618945229083517, "grad_norm": 1.3412425518035889, "learning_rate": 1.1729317894709282e-05, "loss": 0.0596, "step": 160375 }, { "epoch": 2.3622627059984387, "grad_norm": 1.4249790906906128, "learning_rate": 1.1727681522742298e-05, "loss": 0.0685, "step": 160400 }, { "epoch": 2.3626308890885257, "grad_norm": 1.6485384702682495, "learning_rate": 1.1726045150775314e-05, "loss": 0.0557, "step": 160425 }, { "epoch": 2.362999072178613, "grad_norm": 1.3390308618545532, "learning_rate": 1.1724408778808329e-05, "loss": 0.0627, "step": 160450 }, { "epoch": 2.3633672552687, "grad_norm": 1.3587732315063477, "learning_rate": 1.1722772406841345e-05, "loss": 0.0667, "step": 160475 }, { "epoch": 2.363735438358787, "grad_norm": 1.5958024263381958, "learning_rate": 1.1721136034874361e-05, "loss": 0.0626, "step": 160500 }, { "epoch": 2.364103621448874, "grad_norm": 1.5098737478256226, "learning_rate": 1.1719499662907377e-05, "loss": 0.0643, "step": 160525 }, { "epoch": 2.364471804538961, "grad_norm": 0.945949375629425, "learning_rate": 1.1717863290940392e-05, "loss": 0.0651, "step": 160550 }, { "epoch": 2.3648399876290482, "grad_norm": 1.5439256429672241, "learning_rate": 1.1716226918973406e-05, "loss": 0.0645, "step": 160575 }, { "epoch": 2.3652081707191352, "grad_norm": 1.4468693733215332, "learning_rate": 1.171459054700642e-05, "loss": 0.0613, "step": 160600 }, { "epoch": 2.3655763538092223, "grad_norm": 1.000960350036621, "learning_rate": 1.1712954175039437e-05, "loss": 0.0565, "step": 160625 }, { "epoch": 2.3659445368993093, "grad_norm": 1.3444818258285522, "learning_rate": 1.1711317803072453e-05, "loss": 0.0558, "step": 160650 }, { "epoch": 2.3663127199893963, "grad_norm": 1.2307623624801636, "learning_rate": 1.1709681431105469e-05, "loss": 0.0613, "step": 160675 }, { "epoch": 2.3666809030794833, "grad_norm": 1.2238882780075073, "learning_rate": 1.1708045059138484e-05, "loss": 0.0602, "step": 160700 }, { "epoch": 2.3670490861695703, "grad_norm": 1.5728293657302856, "learning_rate": 1.17064086871715e-05, "loss": 0.0584, "step": 160725 }, { "epoch": 2.3674172692596573, "grad_norm": 1.3549801111221313, "learning_rate": 1.1704772315204516e-05, "loss": 0.0595, "step": 160750 }, { "epoch": 2.3677854523497444, "grad_norm": 1.713382363319397, "learning_rate": 1.1703135943237529e-05, "loss": 0.0636, "step": 160775 }, { "epoch": 2.3681536354398314, "grad_norm": 1.4275914430618286, "learning_rate": 1.1701499571270545e-05, "loss": 0.0639, "step": 160800 }, { "epoch": 2.3685218185299184, "grad_norm": 1.5068387985229492, "learning_rate": 1.1699863199303561e-05, "loss": 0.0628, "step": 160825 }, { "epoch": 2.3688900016200054, "grad_norm": 1.2079943418502808, "learning_rate": 1.1698226827336575e-05, "loss": 0.0617, "step": 160850 }, { "epoch": 2.369258184710093, "grad_norm": 1.4133532047271729, "learning_rate": 1.1696590455369592e-05, "loss": 0.0692, "step": 160875 }, { "epoch": 2.36962636780018, "grad_norm": 1.2821435928344727, "learning_rate": 1.1694954083402608e-05, "loss": 0.0564, "step": 160900 }, { "epoch": 2.369994550890267, "grad_norm": 1.1887179613113403, "learning_rate": 1.1693317711435624e-05, "loss": 0.0603, "step": 160925 }, { "epoch": 2.370362733980354, "grad_norm": 1.1365894079208374, "learning_rate": 1.1691681339468638e-05, "loss": 0.0576, "step": 160950 }, { "epoch": 2.370730917070441, "grad_norm": 1.0952905416488647, "learning_rate": 1.1690044967501654e-05, "loss": 0.0641, "step": 160975 }, { "epoch": 2.371099100160528, "grad_norm": 1.5661420822143555, "learning_rate": 1.1688408595534669e-05, "loss": 0.0652, "step": 161000 }, { "epoch": 2.371467283250615, "grad_norm": 1.1456741094589233, "learning_rate": 1.1686772223567683e-05, "loss": 0.0643, "step": 161025 }, { "epoch": 2.371835466340702, "grad_norm": 1.2231078147888184, "learning_rate": 1.16851358516007e-05, "loss": 0.0606, "step": 161050 }, { "epoch": 2.372203649430789, "grad_norm": 1.5011032819747925, "learning_rate": 1.1683499479633716e-05, "loss": 0.0574, "step": 161075 }, { "epoch": 2.372571832520876, "grad_norm": 1.4581764936447144, "learning_rate": 1.1681863107666732e-05, "loss": 0.0565, "step": 161100 }, { "epoch": 2.372940015610963, "grad_norm": 1.2929635047912598, "learning_rate": 1.1680226735699746e-05, "loss": 0.0612, "step": 161125 }, { "epoch": 2.37330819870105, "grad_norm": 1.8550294637680054, "learning_rate": 1.1678590363732762e-05, "loss": 0.0621, "step": 161150 }, { "epoch": 2.373676381791137, "grad_norm": 1.3191032409667969, "learning_rate": 1.1676953991765779e-05, "loss": 0.0609, "step": 161175 }, { "epoch": 2.374044564881224, "grad_norm": 1.435896635055542, "learning_rate": 1.1675317619798791e-05, "loss": 0.0619, "step": 161200 }, { "epoch": 2.374412747971311, "grad_norm": 1.0836715698242188, "learning_rate": 1.1673681247831808e-05, "loss": 0.0559, "step": 161225 }, { "epoch": 2.374780931061398, "grad_norm": 1.4972012042999268, "learning_rate": 1.1672044875864824e-05, "loss": 0.0602, "step": 161250 }, { "epoch": 2.375149114151485, "grad_norm": 1.6567645072937012, "learning_rate": 1.1670408503897838e-05, "loss": 0.0634, "step": 161275 }, { "epoch": 2.375517297241572, "grad_norm": 1.0782804489135742, "learning_rate": 1.1668772131930854e-05, "loss": 0.0598, "step": 161300 }, { "epoch": 2.375885480331659, "grad_norm": 1.5296897888183594, "learning_rate": 1.166713575996387e-05, "loss": 0.0632, "step": 161325 }, { "epoch": 2.376253663421746, "grad_norm": 1.4100537300109863, "learning_rate": 1.1665499387996887e-05, "loss": 0.0629, "step": 161350 }, { "epoch": 2.376621846511833, "grad_norm": 1.6575449705123901, "learning_rate": 1.1663863016029901e-05, "loss": 0.058, "step": 161375 }, { "epoch": 2.3769900296019206, "grad_norm": 1.2804405689239502, "learning_rate": 1.1662226644062917e-05, "loss": 0.0611, "step": 161400 }, { "epoch": 2.3773582126920076, "grad_norm": 1.5220341682434082, "learning_rate": 1.166065572697461e-05, "loss": 0.0617, "step": 161425 }, { "epoch": 2.3777263957820947, "grad_norm": 1.1972874402999878, "learning_rate": 1.1659019355007626e-05, "loss": 0.0541, "step": 161450 }, { "epoch": 2.3780945788721817, "grad_norm": 1.238508701324463, "learning_rate": 1.1657382983040642e-05, "loss": 0.0598, "step": 161475 }, { "epoch": 2.3784627619622687, "grad_norm": 1.4524447917938232, "learning_rate": 1.1655746611073658e-05, "loss": 0.0625, "step": 161500 }, { "epoch": 2.3788309450523557, "grad_norm": 1.0858079195022583, "learning_rate": 1.1654110239106672e-05, "loss": 0.0596, "step": 161525 }, { "epoch": 2.3791991281424427, "grad_norm": 0.8202728033065796, "learning_rate": 1.1652473867139688e-05, "loss": 0.0607, "step": 161550 }, { "epoch": 2.3795673112325297, "grad_norm": 1.1476420164108276, "learning_rate": 1.1650837495172705e-05, "loss": 0.06, "step": 161575 }, { "epoch": 2.3799354943226168, "grad_norm": 1.1841585636138916, "learning_rate": 1.164920112320572e-05, "loss": 0.0588, "step": 161600 }, { "epoch": 2.3803036774127038, "grad_norm": 0.8197386264801025, "learning_rate": 1.1647564751238734e-05, "loss": 0.0653, "step": 161625 }, { "epoch": 2.380671860502791, "grad_norm": 1.3758143186569214, "learning_rate": 1.164592837927175e-05, "loss": 0.0572, "step": 161650 }, { "epoch": 2.381040043592878, "grad_norm": 1.315423846244812, "learning_rate": 1.1644292007304764e-05, "loss": 0.0581, "step": 161675 }, { "epoch": 2.381408226682965, "grad_norm": 1.294613003730774, "learning_rate": 1.164265563533778e-05, "loss": 0.0611, "step": 161700 }, { "epoch": 2.381776409773052, "grad_norm": 1.762940526008606, "learning_rate": 1.1641019263370796e-05, "loss": 0.0663, "step": 161725 }, { "epoch": 2.382144592863139, "grad_norm": 0.876607358455658, "learning_rate": 1.1639382891403813e-05, "loss": 0.0638, "step": 161750 }, { "epoch": 2.382512775953226, "grad_norm": 1.163330316543579, "learning_rate": 1.1637746519436827e-05, "loss": 0.0664, "step": 161775 }, { "epoch": 2.382880959043313, "grad_norm": 1.4517558813095093, "learning_rate": 1.1636110147469843e-05, "loss": 0.0638, "step": 161800 }, { "epoch": 2.3832491421334003, "grad_norm": 1.4793587923049927, "learning_rate": 1.1634473775502858e-05, "loss": 0.0574, "step": 161825 }, { "epoch": 2.3836173252234873, "grad_norm": 1.6949927806854248, "learning_rate": 1.1632837403535872e-05, "loss": 0.0626, "step": 161850 }, { "epoch": 2.3839855083135744, "grad_norm": 1.3969281911849976, "learning_rate": 1.1631201031568888e-05, "loss": 0.0549, "step": 161875 }, { "epoch": 2.3843536914036614, "grad_norm": 1.4277383089065552, "learning_rate": 1.1629564659601904e-05, "loss": 0.0642, "step": 161900 }, { "epoch": 2.3847218744937484, "grad_norm": 1.18282949924469, "learning_rate": 1.162792828763492e-05, "loss": 0.0611, "step": 161925 }, { "epoch": 2.3850900575838354, "grad_norm": 1.4680016040802002, "learning_rate": 1.1626291915667935e-05, "loss": 0.059, "step": 161950 }, { "epoch": 2.3854582406739224, "grad_norm": 2.0155391693115234, "learning_rate": 1.1624655543700951e-05, "loss": 0.0674, "step": 161975 }, { "epoch": 2.3858264237640094, "grad_norm": 0.9847386479377747, "learning_rate": 1.1623019171733967e-05, "loss": 0.0558, "step": 162000 }, { "epoch": 2.3861946068540965, "grad_norm": 0.9667565822601318, "learning_rate": 1.1621382799766982e-05, "loss": 0.0646, "step": 162025 }, { "epoch": 2.3865627899441835, "grad_norm": 1.3795044422149658, "learning_rate": 1.1619746427799996e-05, "loss": 0.0615, "step": 162050 }, { "epoch": 2.3869309730342705, "grad_norm": 1.7952836751937866, "learning_rate": 1.1618110055833012e-05, "loss": 0.0587, "step": 162075 }, { "epoch": 2.3872991561243575, "grad_norm": 1.4881402254104614, "learning_rate": 1.1616473683866027e-05, "loss": 0.0648, "step": 162100 }, { "epoch": 2.3876673392144445, "grad_norm": 1.7149633169174194, "learning_rate": 1.1614837311899043e-05, "loss": 0.0653, "step": 162125 }, { "epoch": 2.3880355223045315, "grad_norm": 1.3947250843048096, "learning_rate": 1.1613200939932059e-05, "loss": 0.0529, "step": 162150 }, { "epoch": 2.3884037053946185, "grad_norm": 1.2231005430221558, "learning_rate": 1.1611564567965075e-05, "loss": 0.0626, "step": 162175 }, { "epoch": 2.3887718884847056, "grad_norm": 1.3092209100723267, "learning_rate": 1.160992819599809e-05, "loss": 0.0622, "step": 162200 }, { "epoch": 2.3891400715747926, "grad_norm": 1.6007519960403442, "learning_rate": 1.1608291824031106e-05, "loss": 0.0597, "step": 162225 }, { "epoch": 2.3895082546648796, "grad_norm": 1.2384833097457886, "learning_rate": 1.1606655452064119e-05, "loss": 0.065, "step": 162250 }, { "epoch": 2.3898764377549666, "grad_norm": 1.6072911024093628, "learning_rate": 1.1605019080097135e-05, "loss": 0.0629, "step": 162275 }, { "epoch": 2.3902446208450536, "grad_norm": 1.4751777648925781, "learning_rate": 1.1603382708130151e-05, "loss": 0.0641, "step": 162300 }, { "epoch": 2.3906128039351406, "grad_norm": 1.5758445262908936, "learning_rate": 1.1601746336163167e-05, "loss": 0.0636, "step": 162325 }, { "epoch": 2.390980987025228, "grad_norm": 1.7630226612091064, "learning_rate": 1.1600109964196182e-05, "loss": 0.0639, "step": 162350 }, { "epoch": 2.391349170115315, "grad_norm": 1.7897053956985474, "learning_rate": 1.1598473592229198e-05, "loss": 0.0603, "step": 162375 }, { "epoch": 2.391717353205402, "grad_norm": 1.2426737546920776, "learning_rate": 1.1596837220262214e-05, "loss": 0.0614, "step": 162400 }, { "epoch": 2.392085536295489, "grad_norm": 1.243597388267517, "learning_rate": 1.159520084829523e-05, "loss": 0.0608, "step": 162425 }, { "epoch": 2.392453719385576, "grad_norm": 0.9385548233985901, "learning_rate": 1.1593564476328244e-05, "loss": 0.0522, "step": 162450 }, { "epoch": 2.392821902475663, "grad_norm": 1.703717589378357, "learning_rate": 1.1591928104361259e-05, "loss": 0.0596, "step": 162475 }, { "epoch": 2.39319008556575, "grad_norm": 0.9092941880226135, "learning_rate": 1.1590291732394275e-05, "loss": 0.0618, "step": 162500 }, { "epoch": 2.393558268655837, "grad_norm": 1.6146618127822876, "learning_rate": 1.158865536042729e-05, "loss": 0.0588, "step": 162525 }, { "epoch": 2.3939264517459242, "grad_norm": 1.4100440740585327, "learning_rate": 1.1587018988460306e-05, "loss": 0.0645, "step": 162550 }, { "epoch": 2.3942946348360112, "grad_norm": 1.964572787284851, "learning_rate": 1.1585382616493322e-05, "loss": 0.0603, "step": 162575 }, { "epoch": 2.3946628179260983, "grad_norm": 1.041839599609375, "learning_rate": 1.1583746244526336e-05, "loss": 0.0624, "step": 162600 }, { "epoch": 2.3950310010161853, "grad_norm": 1.6516464948654175, "learning_rate": 1.1582109872559352e-05, "loss": 0.0584, "step": 162625 }, { "epoch": 2.3953991841062723, "grad_norm": 0.9532708525657654, "learning_rate": 1.1580473500592369e-05, "loss": 0.0611, "step": 162650 }, { "epoch": 2.3957673671963593, "grad_norm": 1.1092602014541626, "learning_rate": 1.1578837128625381e-05, "loss": 0.0614, "step": 162675 }, { "epoch": 2.3961355502864463, "grad_norm": 1.2556697130203247, "learning_rate": 1.1577200756658398e-05, "loss": 0.0647, "step": 162700 }, { "epoch": 2.3965037333765333, "grad_norm": 1.751400351524353, "learning_rate": 1.1575564384691414e-05, "loss": 0.0594, "step": 162725 }, { "epoch": 2.3968719164666203, "grad_norm": 1.47752046585083, "learning_rate": 1.157392801272443e-05, "loss": 0.0592, "step": 162750 }, { "epoch": 2.397240099556708, "grad_norm": 1.470221996307373, "learning_rate": 1.1572291640757444e-05, "loss": 0.0617, "step": 162775 }, { "epoch": 2.397608282646795, "grad_norm": 1.528430700302124, "learning_rate": 1.157065526879046e-05, "loss": 0.0579, "step": 162800 }, { "epoch": 2.397976465736882, "grad_norm": 1.5887783765792847, "learning_rate": 1.1569018896823477e-05, "loss": 0.0688, "step": 162825 }, { "epoch": 2.398344648826969, "grad_norm": 0.9407671093940735, "learning_rate": 1.1567382524856493e-05, "loss": 0.0595, "step": 162850 }, { "epoch": 2.398712831917056, "grad_norm": 1.6214011907577515, "learning_rate": 1.1565746152889507e-05, "loss": 0.0619, "step": 162875 }, { "epoch": 2.399081015007143, "grad_norm": 1.2414709329605103, "learning_rate": 1.1564109780922522e-05, "loss": 0.0543, "step": 162900 }, { "epoch": 2.39944919809723, "grad_norm": 1.1898075342178345, "learning_rate": 1.1562473408955536e-05, "loss": 0.0671, "step": 162925 }, { "epoch": 2.399817381187317, "grad_norm": 1.2103432416915894, "learning_rate": 1.1560837036988552e-05, "loss": 0.0552, "step": 162950 }, { "epoch": 2.400185564277404, "grad_norm": 1.0854477882385254, "learning_rate": 1.1559200665021568e-05, "loss": 0.0619, "step": 162975 }, { "epoch": 2.400553747367491, "grad_norm": 1.0311617851257324, "learning_rate": 1.1557564293054585e-05, "loss": 0.0562, "step": 163000 }, { "epoch": 2.400921930457578, "grad_norm": 1.5999698638916016, "learning_rate": 1.1555927921087599e-05, "loss": 0.0589, "step": 163025 }, { "epoch": 2.401290113547665, "grad_norm": 1.5684162378311157, "learning_rate": 1.1554291549120615e-05, "loss": 0.0607, "step": 163050 }, { "epoch": 2.401658296637752, "grad_norm": 1.2197459936141968, "learning_rate": 1.1552655177153631e-05, "loss": 0.0571, "step": 163075 }, { "epoch": 2.402026479727839, "grad_norm": 1.5436311960220337, "learning_rate": 1.1551018805186644e-05, "loss": 0.0681, "step": 163100 }, { "epoch": 2.402394662817926, "grad_norm": 0.9872592687606812, "learning_rate": 1.154938243321966e-05, "loss": 0.0588, "step": 163125 }, { "epoch": 2.402762845908013, "grad_norm": 1.3021868467330933, "learning_rate": 1.1547746061252676e-05, "loss": 0.0567, "step": 163150 }, { "epoch": 2.4031310289981, "grad_norm": 1.247228980064392, "learning_rate": 1.154610968928569e-05, "loss": 0.0538, "step": 163175 }, { "epoch": 2.403499212088187, "grad_norm": 1.711039662361145, "learning_rate": 1.1544473317318707e-05, "loss": 0.0625, "step": 163200 }, { "epoch": 2.403867395178274, "grad_norm": 1.3409584760665894, "learning_rate": 1.1542836945351723e-05, "loss": 0.0625, "step": 163225 }, { "epoch": 2.404235578268361, "grad_norm": 1.0156556367874146, "learning_rate": 1.154120057338474e-05, "loss": 0.0654, "step": 163250 }, { "epoch": 2.404603761358448, "grad_norm": 0.9794853925704956, "learning_rate": 1.1539564201417754e-05, "loss": 0.0541, "step": 163275 }, { "epoch": 2.404971944448535, "grad_norm": 2.182584285736084, "learning_rate": 1.1537927829450768e-05, "loss": 0.0598, "step": 163300 }, { "epoch": 2.4053401275386226, "grad_norm": 1.5068104267120361, "learning_rate": 1.1536291457483784e-05, "loss": 0.0599, "step": 163325 }, { "epoch": 2.4057083106287096, "grad_norm": 1.6922208070755005, "learning_rate": 1.1534655085516799e-05, "loss": 0.0581, "step": 163350 }, { "epoch": 2.4060764937187966, "grad_norm": 1.176310420036316, "learning_rate": 1.1533018713549815e-05, "loss": 0.057, "step": 163375 }, { "epoch": 2.4064446768088836, "grad_norm": 1.4202582836151123, "learning_rate": 1.1531382341582831e-05, "loss": 0.0626, "step": 163400 }, { "epoch": 2.4068128598989706, "grad_norm": 1.572019338607788, "learning_rate": 1.1529745969615847e-05, "loss": 0.0627, "step": 163425 }, { "epoch": 2.4071810429890577, "grad_norm": 1.3447861671447754, "learning_rate": 1.1528109597648862e-05, "loss": 0.066, "step": 163450 }, { "epoch": 2.4075492260791447, "grad_norm": 1.3755595684051514, "learning_rate": 1.1526473225681878e-05, "loss": 0.058, "step": 163475 }, { "epoch": 2.4079174091692317, "grad_norm": 1.036713719367981, "learning_rate": 1.1524836853714894e-05, "loss": 0.0597, "step": 163500 }, { "epoch": 2.4082855922593187, "grad_norm": 1.1989190578460693, "learning_rate": 1.1523200481747907e-05, "loss": 0.0539, "step": 163525 }, { "epoch": 2.4086537753494057, "grad_norm": 1.349012017250061, "learning_rate": 1.1521629564659602e-05, "loss": 0.0603, "step": 163550 }, { "epoch": 2.4090219584394927, "grad_norm": 1.689965844154358, "learning_rate": 1.1519993192692619e-05, "loss": 0.0656, "step": 163575 }, { "epoch": 2.4093901415295798, "grad_norm": 1.2762181758880615, "learning_rate": 1.1518356820725633e-05, "loss": 0.056, "step": 163600 }, { "epoch": 2.4097583246196668, "grad_norm": 1.458579659461975, "learning_rate": 1.1516720448758649e-05, "loss": 0.0668, "step": 163625 }, { "epoch": 2.410126507709754, "grad_norm": 1.2224823236465454, "learning_rate": 1.1515084076791665e-05, "loss": 0.0622, "step": 163650 }, { "epoch": 2.410494690799841, "grad_norm": 1.6604403257369995, "learning_rate": 1.1513447704824681e-05, "loss": 0.0627, "step": 163675 }, { "epoch": 2.410862873889928, "grad_norm": 1.5779081583023071, "learning_rate": 1.1511811332857696e-05, "loss": 0.0592, "step": 163700 }, { "epoch": 2.411231056980015, "grad_norm": 1.486918330192566, "learning_rate": 1.151017496089071e-05, "loss": 0.0611, "step": 163725 }, { "epoch": 2.4115992400701023, "grad_norm": 1.4067714214324951, "learning_rate": 1.1508538588923725e-05, "loss": 0.0594, "step": 163750 }, { "epoch": 2.4119674231601893, "grad_norm": 1.2232433557510376, "learning_rate": 1.1506902216956741e-05, "loss": 0.06, "step": 163775 }, { "epoch": 2.4123356062502763, "grad_norm": 1.5029042959213257, "learning_rate": 1.1505265844989757e-05, "loss": 0.0691, "step": 163800 }, { "epoch": 2.4127037893403633, "grad_norm": 1.155443787574768, "learning_rate": 1.1503629473022773e-05, "loss": 0.0591, "step": 163825 }, { "epoch": 2.4130719724304504, "grad_norm": 1.2783972024917603, "learning_rate": 1.1501993101055788e-05, "loss": 0.0525, "step": 163850 }, { "epoch": 2.4134401555205374, "grad_norm": 1.4131832122802734, "learning_rate": 1.1500356729088804e-05, "loss": 0.0634, "step": 163875 }, { "epoch": 2.4138083386106244, "grad_norm": 0.925175130367279, "learning_rate": 1.149872035712182e-05, "loss": 0.0598, "step": 163900 }, { "epoch": 2.4141765217007114, "grad_norm": 1.3498001098632812, "learning_rate": 1.1497083985154836e-05, "loss": 0.0558, "step": 163925 }, { "epoch": 2.4145447047907984, "grad_norm": 1.1745197772979736, "learning_rate": 1.1495447613187849e-05, "loss": 0.0579, "step": 163950 }, { "epoch": 2.4149128878808854, "grad_norm": 1.5623067617416382, "learning_rate": 1.1493811241220865e-05, "loss": 0.0643, "step": 163975 }, { "epoch": 2.4152810709709724, "grad_norm": 1.6745076179504395, "learning_rate": 1.149217486925388e-05, "loss": 0.0644, "step": 164000 }, { "epoch": 2.4156492540610595, "grad_norm": 1.1848735809326172, "learning_rate": 1.1490538497286896e-05, "loss": 0.0606, "step": 164025 }, { "epoch": 2.4160174371511465, "grad_norm": 1.2545878887176514, "learning_rate": 1.1488902125319912e-05, "loss": 0.0592, "step": 164050 }, { "epoch": 2.4163856202412335, "grad_norm": 0.9535441994667053, "learning_rate": 1.1487265753352928e-05, "loss": 0.0623, "step": 164075 }, { "epoch": 2.4167538033313205, "grad_norm": 2.1380114555358887, "learning_rate": 1.1485629381385942e-05, "loss": 0.0609, "step": 164100 }, { "epoch": 2.4171219864214075, "grad_norm": 1.0810357332229614, "learning_rate": 1.1483993009418959e-05, "loss": 0.0632, "step": 164125 }, { "epoch": 2.4174901695114945, "grad_norm": 1.2533912658691406, "learning_rate": 1.1482356637451973e-05, "loss": 0.0606, "step": 164150 }, { "epoch": 2.4178583526015816, "grad_norm": 0.8738951086997986, "learning_rate": 1.1480720265484988e-05, "loss": 0.0615, "step": 164175 }, { "epoch": 2.4182265356916686, "grad_norm": 1.5970969200134277, "learning_rate": 1.1479083893518004e-05, "loss": 0.0663, "step": 164200 }, { "epoch": 2.4185947187817556, "grad_norm": 1.8518763780593872, "learning_rate": 1.147744752155102e-05, "loss": 0.0599, "step": 164225 }, { "epoch": 2.4189629018718426, "grad_norm": 1.0967353582382202, "learning_rate": 1.1475811149584036e-05, "loss": 0.0679, "step": 164250 }, { "epoch": 2.41933108496193, "grad_norm": 1.0748199224472046, "learning_rate": 1.147417477761705e-05, "loss": 0.0602, "step": 164275 }, { "epoch": 2.419699268052017, "grad_norm": 1.453350305557251, "learning_rate": 1.1472538405650067e-05, "loss": 0.0577, "step": 164300 }, { "epoch": 2.420067451142104, "grad_norm": 1.313693881034851, "learning_rate": 1.1470902033683083e-05, "loss": 0.0561, "step": 164325 }, { "epoch": 2.420435634232191, "grad_norm": 1.944935917854309, "learning_rate": 1.1469265661716097e-05, "loss": 0.0668, "step": 164350 }, { "epoch": 2.420803817322278, "grad_norm": 1.3409377336502075, "learning_rate": 1.1467629289749112e-05, "loss": 0.0585, "step": 164375 }, { "epoch": 2.421172000412365, "grad_norm": 1.2738739252090454, "learning_rate": 1.1465992917782128e-05, "loss": 0.0526, "step": 164400 }, { "epoch": 2.421540183502452, "grad_norm": 1.3180350065231323, "learning_rate": 1.1464356545815142e-05, "loss": 0.0562, "step": 164425 }, { "epoch": 2.421908366592539, "grad_norm": 1.4948314428329468, "learning_rate": 1.1462720173848158e-05, "loss": 0.0628, "step": 164450 }, { "epoch": 2.422276549682626, "grad_norm": 2.0097382068634033, "learning_rate": 1.1461083801881175e-05, "loss": 0.0683, "step": 164475 }, { "epoch": 2.422644732772713, "grad_norm": 1.7877976894378662, "learning_rate": 1.145944742991419e-05, "loss": 0.0582, "step": 164500 }, { "epoch": 2.4230129158628, "grad_norm": 1.1123054027557373, "learning_rate": 1.1457811057947205e-05, "loss": 0.0641, "step": 164525 }, { "epoch": 2.4233810989528872, "grad_norm": 1.3095401525497437, "learning_rate": 1.1456174685980221e-05, "loss": 0.0645, "step": 164550 }, { "epoch": 2.4237492820429742, "grad_norm": 1.6199350357055664, "learning_rate": 1.1454538314013234e-05, "loss": 0.0572, "step": 164575 }, { "epoch": 2.4241174651330613, "grad_norm": 1.5714999437332153, "learning_rate": 1.145290194204625e-05, "loss": 0.0604, "step": 164600 }, { "epoch": 2.4244856482231483, "grad_norm": 0.8159253597259521, "learning_rate": 1.1451265570079266e-05, "loss": 0.0546, "step": 164625 }, { "epoch": 2.4248538313132353, "grad_norm": 1.240932583808899, "learning_rate": 1.1449629198112283e-05, "loss": 0.0572, "step": 164650 }, { "epoch": 2.4252220144033223, "grad_norm": 1.8340493440628052, "learning_rate": 1.1447992826145297e-05, "loss": 0.0666, "step": 164675 }, { "epoch": 2.4255901974934098, "grad_norm": 1.129704236984253, "learning_rate": 1.1446356454178313e-05, "loss": 0.0636, "step": 164700 }, { "epoch": 2.4259583805834968, "grad_norm": 1.0799140930175781, "learning_rate": 1.144472008221133e-05, "loss": 0.0583, "step": 164725 }, { "epoch": 2.426326563673584, "grad_norm": 1.3294018507003784, "learning_rate": 1.1443083710244345e-05, "loss": 0.056, "step": 164750 }, { "epoch": 2.426694746763671, "grad_norm": 1.4548373222351074, "learning_rate": 1.144144733827736e-05, "loss": 0.0635, "step": 164775 }, { "epoch": 2.427062929853758, "grad_norm": 1.2410050630569458, "learning_rate": 1.1439810966310374e-05, "loss": 0.0563, "step": 164800 }, { "epoch": 2.427431112943845, "grad_norm": 1.2411009073257446, "learning_rate": 1.143817459434339e-05, "loss": 0.0623, "step": 164825 }, { "epoch": 2.427799296033932, "grad_norm": 1.343772530555725, "learning_rate": 1.1436538222376405e-05, "loss": 0.0569, "step": 164850 }, { "epoch": 2.428167479124019, "grad_norm": 1.093827724456787, "learning_rate": 1.1434901850409421e-05, "loss": 0.0626, "step": 164875 }, { "epoch": 2.428535662214106, "grad_norm": 1.6780364513397217, "learning_rate": 1.1433265478442437e-05, "loss": 0.0639, "step": 164900 }, { "epoch": 2.428903845304193, "grad_norm": 1.2943000793457031, "learning_rate": 1.1431629106475452e-05, "loss": 0.0705, "step": 164925 }, { "epoch": 2.42927202839428, "grad_norm": 1.2445746660232544, "learning_rate": 1.1429992734508468e-05, "loss": 0.0588, "step": 164950 }, { "epoch": 2.429640211484367, "grad_norm": 1.9415868520736694, "learning_rate": 1.1428356362541484e-05, "loss": 0.057, "step": 164975 }, { "epoch": 2.430008394574454, "grad_norm": 1.4405913352966309, "learning_rate": 1.1426719990574497e-05, "loss": 0.0604, "step": 165000 }, { "epoch": 2.430376577664541, "grad_norm": 1.8014392852783203, "learning_rate": 1.1425083618607513e-05, "loss": 0.0629, "step": 165025 }, { "epoch": 2.430744760754628, "grad_norm": 1.5329076051712036, "learning_rate": 1.1423447246640529e-05, "loss": 0.0619, "step": 165050 }, { "epoch": 2.431112943844715, "grad_norm": 1.0596028566360474, "learning_rate": 1.1421810874673545e-05, "loss": 0.056, "step": 165075 }, { "epoch": 2.431481126934802, "grad_norm": 1.4429690837860107, "learning_rate": 1.142017450270656e-05, "loss": 0.0613, "step": 165100 }, { "epoch": 2.431849310024889, "grad_norm": 1.3741044998168945, "learning_rate": 1.1418538130739576e-05, "loss": 0.0546, "step": 165125 }, { "epoch": 2.432217493114976, "grad_norm": 1.1319230794906616, "learning_rate": 1.1416901758772592e-05, "loss": 0.0618, "step": 165150 }, { "epoch": 2.432585676205063, "grad_norm": 0.992778480052948, "learning_rate": 1.1415265386805608e-05, "loss": 0.0543, "step": 165175 }, { "epoch": 2.43295385929515, "grad_norm": 1.1014723777770996, "learning_rate": 1.1413629014838621e-05, "loss": 0.0591, "step": 165200 }, { "epoch": 2.433322042385237, "grad_norm": 1.273636817932129, "learning_rate": 1.1411992642871637e-05, "loss": 0.0554, "step": 165225 }, { "epoch": 2.4336902254753245, "grad_norm": 1.494178295135498, "learning_rate": 1.1410356270904652e-05, "loss": 0.0603, "step": 165250 }, { "epoch": 2.4340584085654116, "grad_norm": 1.1151559352874756, "learning_rate": 1.1408719898937668e-05, "loss": 0.062, "step": 165275 }, { "epoch": 2.4344265916554986, "grad_norm": 1.3149724006652832, "learning_rate": 1.1407083526970684e-05, "loss": 0.0622, "step": 165300 }, { "epoch": 2.4347947747455856, "grad_norm": 1.5353071689605713, "learning_rate": 1.14054471550037e-05, "loss": 0.0586, "step": 165325 }, { "epoch": 2.4351629578356726, "grad_norm": 1.3900264501571655, "learning_rate": 1.1403810783036714e-05, "loss": 0.0565, "step": 165350 }, { "epoch": 2.4355311409257596, "grad_norm": 1.4981166124343872, "learning_rate": 1.140217441106973e-05, "loss": 0.0581, "step": 165375 }, { "epoch": 2.4358993240158466, "grad_norm": 1.3365064859390259, "learning_rate": 1.1400538039102747e-05, "loss": 0.0585, "step": 165400 }, { "epoch": 2.4362675071059336, "grad_norm": 1.0220973491668701, "learning_rate": 1.139890166713576e-05, "loss": 0.0643, "step": 165425 }, { "epoch": 2.4366356901960207, "grad_norm": 1.3133913278579712, "learning_rate": 1.1397265295168776e-05, "loss": 0.0573, "step": 165450 }, { "epoch": 2.4370038732861077, "grad_norm": 2.3548390865325928, "learning_rate": 1.1395628923201792e-05, "loss": 0.0658, "step": 165475 }, { "epoch": 2.4373720563761947, "grad_norm": 1.3287529945373535, "learning_rate": 1.1393992551234806e-05, "loss": 0.0611, "step": 165500 }, { "epoch": 2.4377402394662817, "grad_norm": 1.0778312683105469, "learning_rate": 1.1392356179267822e-05, "loss": 0.0604, "step": 165525 }, { "epoch": 2.4381084225563687, "grad_norm": 1.2553143501281738, "learning_rate": 1.1390719807300839e-05, "loss": 0.0574, "step": 165550 }, { "epoch": 2.4384766056464557, "grad_norm": 1.6024491786956787, "learning_rate": 1.1389083435333855e-05, "loss": 0.0612, "step": 165575 }, { "epoch": 2.4388447887365428, "grad_norm": 1.249430775642395, "learning_rate": 1.138744706336687e-05, "loss": 0.0572, "step": 165600 }, { "epoch": 2.4392129718266298, "grad_norm": 1.5314658880233765, "learning_rate": 1.1385810691399884e-05, "loss": 0.0632, "step": 165625 }, { "epoch": 2.4395811549167172, "grad_norm": 1.287026286125183, "learning_rate": 1.13841743194329e-05, "loss": 0.0534, "step": 165650 }, { "epoch": 2.4399493380068042, "grad_norm": 1.6620677709579468, "learning_rate": 1.1382537947465914e-05, "loss": 0.0626, "step": 165675 }, { "epoch": 2.4403175210968913, "grad_norm": 1.3048840761184692, "learning_rate": 1.138090157549893e-05, "loss": 0.0637, "step": 165700 }, { "epoch": 2.4406857041869783, "grad_norm": 1.2793484926223755, "learning_rate": 1.1379265203531947e-05, "loss": 0.0563, "step": 165725 }, { "epoch": 2.4410538872770653, "grad_norm": 1.4037551879882812, "learning_rate": 1.1377628831564963e-05, "loss": 0.0549, "step": 165750 }, { "epoch": 2.4414220703671523, "grad_norm": 1.3582558631896973, "learning_rate": 1.1375992459597977e-05, "loss": 0.056, "step": 165775 }, { "epoch": 2.4417902534572393, "grad_norm": 1.780892014503479, "learning_rate": 1.1374356087630993e-05, "loss": 0.0642, "step": 165800 }, { "epoch": 2.4421584365473263, "grad_norm": 1.6464052200317383, "learning_rate": 1.137271971566401e-05, "loss": 0.0579, "step": 165825 }, { "epoch": 2.4425266196374134, "grad_norm": 1.0196897983551025, "learning_rate": 1.1371083343697022e-05, "loss": 0.0546, "step": 165850 }, { "epoch": 2.4428948027275004, "grad_norm": 1.2940099239349365, "learning_rate": 1.1369446971730038e-05, "loss": 0.0604, "step": 165875 }, { "epoch": 2.4432629858175874, "grad_norm": 1.2686991691589355, "learning_rate": 1.1367810599763055e-05, "loss": 0.0661, "step": 165900 }, { "epoch": 2.4436311689076744, "grad_norm": 0.5272985696792603, "learning_rate": 1.1366174227796069e-05, "loss": 0.0554, "step": 165925 }, { "epoch": 2.4439993519977614, "grad_norm": 1.1308406591415405, "learning_rate": 1.1364537855829085e-05, "loss": 0.0534, "step": 165950 }, { "epoch": 2.4443675350878484, "grad_norm": 1.2716814279556274, "learning_rate": 1.1362901483862101e-05, "loss": 0.0632, "step": 165975 }, { "epoch": 2.4447357181779354, "grad_norm": 1.2231541872024536, "learning_rate": 1.1361265111895117e-05, "loss": 0.0507, "step": 166000 }, { "epoch": 2.4451039012680225, "grad_norm": 1.2079716920852661, "learning_rate": 1.1359628739928132e-05, "loss": 0.0593, "step": 166025 }, { "epoch": 2.4454720843581095, "grad_norm": 1.2885600328445435, "learning_rate": 1.1357992367961146e-05, "loss": 0.0641, "step": 166050 }, { "epoch": 2.4458402674481965, "grad_norm": 1.2220584154129028, "learning_rate": 1.135635599599416e-05, "loss": 0.0544, "step": 166075 }, { "epoch": 2.4462084505382835, "grad_norm": 1.1342827081680298, "learning_rate": 1.1354719624027177e-05, "loss": 0.06, "step": 166100 }, { "epoch": 2.4465766336283705, "grad_norm": 1.4868804216384888, "learning_rate": 1.1353083252060193e-05, "loss": 0.0654, "step": 166125 }, { "epoch": 2.4469448167184575, "grad_norm": 1.4469658136367798, "learning_rate": 1.135144688009321e-05, "loss": 0.0593, "step": 166150 }, { "epoch": 2.4473129998085446, "grad_norm": 1.7173411846160889, "learning_rate": 1.1349810508126224e-05, "loss": 0.0649, "step": 166175 }, { "epoch": 2.447681182898632, "grad_norm": 1.4442107677459717, "learning_rate": 1.134817413615924e-05, "loss": 0.0578, "step": 166200 }, { "epoch": 2.448049365988719, "grad_norm": 1.5394749641418457, "learning_rate": 1.1346537764192256e-05, "loss": 0.0544, "step": 166225 }, { "epoch": 2.448417549078806, "grad_norm": 1.4512736797332764, "learning_rate": 1.1344901392225272e-05, "loss": 0.0584, "step": 166250 }, { "epoch": 2.448785732168893, "grad_norm": 1.2789298295974731, "learning_rate": 1.1343265020258285e-05, "loss": 0.0526, "step": 166275 }, { "epoch": 2.44915391525898, "grad_norm": 0.9591279625892639, "learning_rate": 1.1341628648291301e-05, "loss": 0.0586, "step": 166300 }, { "epoch": 2.449522098349067, "grad_norm": 1.3781404495239258, "learning_rate": 1.1339992276324317e-05, "loss": 0.0604, "step": 166325 }, { "epoch": 2.449890281439154, "grad_norm": 1.503045678138733, "learning_rate": 1.1338355904357332e-05, "loss": 0.0593, "step": 166350 }, { "epoch": 2.450258464529241, "grad_norm": 1.400524616241455, "learning_rate": 1.1336719532390348e-05, "loss": 0.065, "step": 166375 }, { "epoch": 2.450626647619328, "grad_norm": 1.530967354774475, "learning_rate": 1.1335083160423364e-05, "loss": 0.0679, "step": 166400 }, { "epoch": 2.450994830709415, "grad_norm": 0.9463401436805725, "learning_rate": 1.1333446788456378e-05, "loss": 0.0574, "step": 166425 }, { "epoch": 2.451363013799502, "grad_norm": 1.41310715675354, "learning_rate": 1.1331810416489395e-05, "loss": 0.057, "step": 166450 }, { "epoch": 2.451731196889589, "grad_norm": 1.444229006767273, "learning_rate": 1.1330174044522409e-05, "loss": 0.061, "step": 166475 }, { "epoch": 2.452099379979676, "grad_norm": 2.0020642280578613, "learning_rate": 1.1328537672555424e-05, "loss": 0.065, "step": 166500 }, { "epoch": 2.452467563069763, "grad_norm": 1.286094069480896, "learning_rate": 1.132690130058844e-05, "loss": 0.0589, "step": 166525 }, { "epoch": 2.4528357461598502, "grad_norm": 1.7219420671463013, "learning_rate": 1.1325330383500135e-05, "loss": 0.0585, "step": 166550 }, { "epoch": 2.4532039292499372, "grad_norm": 1.1732163429260254, "learning_rate": 1.1323694011533151e-05, "loss": 0.0568, "step": 166575 }, { "epoch": 2.4535721123400243, "grad_norm": 1.4670276641845703, "learning_rate": 1.1322057639566166e-05, "loss": 0.063, "step": 166600 }, { "epoch": 2.4539402954301117, "grad_norm": 1.4043080806732178, "learning_rate": 1.1320421267599182e-05, "loss": 0.0589, "step": 166625 }, { "epoch": 2.4543084785201987, "grad_norm": 0.8559965491294861, "learning_rate": 1.1318784895632198e-05, "loss": 0.0612, "step": 166650 }, { "epoch": 2.4546766616102857, "grad_norm": 1.137347936630249, "learning_rate": 1.1317148523665211e-05, "loss": 0.0626, "step": 166675 }, { "epoch": 2.4550448447003728, "grad_norm": 0.715144157409668, "learning_rate": 1.1315512151698227e-05, "loss": 0.0508, "step": 166700 }, { "epoch": 2.45541302779046, "grad_norm": 1.409767746925354, "learning_rate": 1.1313875779731243e-05, "loss": 0.0688, "step": 166725 }, { "epoch": 2.455781210880547, "grad_norm": 1.1211822032928467, "learning_rate": 1.1312239407764258e-05, "loss": 0.0679, "step": 166750 }, { "epoch": 2.456149393970634, "grad_norm": 1.1836835145950317, "learning_rate": 1.1310603035797274e-05, "loss": 0.0549, "step": 166775 }, { "epoch": 2.456517577060721, "grad_norm": 0.9444347023963928, "learning_rate": 1.130896666383029e-05, "loss": 0.0564, "step": 166800 }, { "epoch": 2.456885760150808, "grad_norm": 1.400810956954956, "learning_rate": 1.1307330291863306e-05, "loss": 0.0552, "step": 166825 }, { "epoch": 2.457253943240895, "grad_norm": 1.3926701545715332, "learning_rate": 1.130569391989632e-05, "loss": 0.0646, "step": 166850 }, { "epoch": 2.457622126330982, "grad_norm": 1.5172147750854492, "learning_rate": 1.1304057547929337e-05, "loss": 0.0562, "step": 166875 }, { "epoch": 2.457990309421069, "grad_norm": 1.601259708404541, "learning_rate": 1.130242117596235e-05, "loss": 0.0536, "step": 166900 }, { "epoch": 2.458358492511156, "grad_norm": 1.2095301151275635, "learning_rate": 1.1300784803995366e-05, "loss": 0.0563, "step": 166925 }, { "epoch": 2.458726675601243, "grad_norm": 1.4847033023834229, "learning_rate": 1.1299148432028382e-05, "loss": 0.0651, "step": 166950 }, { "epoch": 2.45909485869133, "grad_norm": 1.2221888303756714, "learning_rate": 1.1297512060061398e-05, "loss": 0.0614, "step": 166975 }, { "epoch": 2.459463041781417, "grad_norm": 1.0640538930892944, "learning_rate": 1.1295875688094412e-05, "loss": 0.0684, "step": 167000 }, { "epoch": 2.459831224871504, "grad_norm": 1.0552587509155273, "learning_rate": 1.1294239316127429e-05, "loss": 0.065, "step": 167025 }, { "epoch": 2.460199407961591, "grad_norm": 1.2368358373641968, "learning_rate": 1.1292602944160445e-05, "loss": 0.063, "step": 167050 }, { "epoch": 2.460567591051678, "grad_norm": 1.5190926790237427, "learning_rate": 1.1290966572193461e-05, "loss": 0.0612, "step": 167075 }, { "epoch": 2.460935774141765, "grad_norm": 1.2478116750717163, "learning_rate": 1.1289330200226474e-05, "loss": 0.0551, "step": 167100 }, { "epoch": 2.461303957231852, "grad_norm": 1.3048303127288818, "learning_rate": 1.128769382825949e-05, "loss": 0.0613, "step": 167125 }, { "epoch": 2.4616721403219395, "grad_norm": 2.1593852043151855, "learning_rate": 1.1286057456292506e-05, "loss": 0.0627, "step": 167150 }, { "epoch": 2.4620403234120265, "grad_norm": 1.3679306507110596, "learning_rate": 1.128442108432552e-05, "loss": 0.0598, "step": 167175 }, { "epoch": 2.4624085065021135, "grad_norm": 1.7907465696334839, "learning_rate": 1.1282784712358537e-05, "loss": 0.0606, "step": 167200 }, { "epoch": 2.4627766895922005, "grad_norm": 1.0296827554702759, "learning_rate": 1.1281148340391553e-05, "loss": 0.0631, "step": 167225 }, { "epoch": 2.4631448726822875, "grad_norm": 1.2811992168426514, "learning_rate": 1.1279511968424567e-05, "loss": 0.059, "step": 167250 }, { "epoch": 2.4635130557723746, "grad_norm": 1.2746224403381348, "learning_rate": 1.1277875596457583e-05, "loss": 0.0709, "step": 167275 }, { "epoch": 2.4638812388624616, "grad_norm": 1.7382864952087402, "learning_rate": 1.12762392244906e-05, "loss": 0.06, "step": 167300 }, { "epoch": 2.4642494219525486, "grad_norm": 1.1114996671676636, "learning_rate": 1.1274602852523612e-05, "loss": 0.0586, "step": 167325 }, { "epoch": 2.4646176050426356, "grad_norm": 1.0237200260162354, "learning_rate": 1.1272966480556628e-05, "loss": 0.0596, "step": 167350 }, { "epoch": 2.4649857881327226, "grad_norm": 1.5585252046585083, "learning_rate": 1.1271330108589645e-05, "loss": 0.0589, "step": 167375 }, { "epoch": 2.4653539712228096, "grad_norm": 1.7459185123443604, "learning_rate": 1.126969373662266e-05, "loss": 0.0593, "step": 167400 }, { "epoch": 2.4657221543128967, "grad_norm": 1.231163501739502, "learning_rate": 1.1268057364655675e-05, "loss": 0.0489, "step": 167425 }, { "epoch": 2.4660903374029837, "grad_norm": 1.2625170946121216, "learning_rate": 1.1266420992688691e-05, "loss": 0.0606, "step": 167450 }, { "epoch": 2.4664585204930707, "grad_norm": 1.569287657737732, "learning_rate": 1.1264784620721707e-05, "loss": 0.0607, "step": 167475 }, { "epoch": 2.4668267035831577, "grad_norm": 1.0937808752059937, "learning_rate": 1.1263148248754724e-05, "loss": 0.0587, "step": 167500 }, { "epoch": 2.4671948866732447, "grad_norm": 1.7669326066970825, "learning_rate": 1.1261511876787736e-05, "loss": 0.0674, "step": 167525 }, { "epoch": 2.4675630697633317, "grad_norm": 1.7553832530975342, "learning_rate": 1.1259875504820753e-05, "loss": 0.0643, "step": 167550 }, { "epoch": 2.467931252853419, "grad_norm": 1.7348791360855103, "learning_rate": 1.1258239132853767e-05, "loss": 0.0598, "step": 167575 }, { "epoch": 2.468299435943506, "grad_norm": 1.446185827255249, "learning_rate": 1.1256602760886783e-05, "loss": 0.0569, "step": 167600 }, { "epoch": 2.468667619033593, "grad_norm": 1.5019747018814087, "learning_rate": 1.12549663889198e-05, "loss": 0.0595, "step": 167625 }, { "epoch": 2.4690358021236802, "grad_norm": 1.207907795906067, "learning_rate": 1.1253330016952815e-05, "loss": 0.0568, "step": 167650 }, { "epoch": 2.4694039852137672, "grad_norm": 1.4958751201629639, "learning_rate": 1.125169364498583e-05, "loss": 0.0669, "step": 167675 }, { "epoch": 2.4697721683038543, "grad_norm": 1.3696891069412231, "learning_rate": 1.1250057273018846e-05, "loss": 0.0614, "step": 167700 }, { "epoch": 2.4701403513939413, "grad_norm": 1.3949445486068726, "learning_rate": 1.1248420901051862e-05, "loss": 0.0557, "step": 167725 }, { "epoch": 2.4705085344840283, "grad_norm": 1.7682507038116455, "learning_rate": 1.1246784529084875e-05, "loss": 0.0569, "step": 167750 }, { "epoch": 2.4708767175741153, "grad_norm": 1.3156243562698364, "learning_rate": 1.1245148157117891e-05, "loss": 0.0528, "step": 167775 }, { "epoch": 2.4712449006642023, "grad_norm": 1.1397193670272827, "learning_rate": 1.1243511785150907e-05, "loss": 0.0695, "step": 167800 }, { "epoch": 2.4716130837542893, "grad_norm": 1.5308188199996948, "learning_rate": 1.1241875413183922e-05, "loss": 0.0604, "step": 167825 }, { "epoch": 2.4719812668443764, "grad_norm": 1.0728814601898193, "learning_rate": 1.1240239041216938e-05, "loss": 0.0652, "step": 167850 }, { "epoch": 2.4723494499344634, "grad_norm": 1.5396006107330322, "learning_rate": 1.1238602669249954e-05, "loss": 0.0592, "step": 167875 }, { "epoch": 2.4727176330245504, "grad_norm": 1.8049131631851196, "learning_rate": 1.123696629728297e-05, "loss": 0.0626, "step": 167900 }, { "epoch": 2.4730858161146374, "grad_norm": 1.6638224124908447, "learning_rate": 1.1235329925315985e-05, "loss": 0.0599, "step": 167925 }, { "epoch": 2.4734539992047244, "grad_norm": 1.2251381874084473, "learning_rate": 1.1233693553348999e-05, "loss": 0.0556, "step": 167950 }, { "epoch": 2.4738221822948114, "grad_norm": 1.7455753087997437, "learning_rate": 1.1232057181382015e-05, "loss": 0.0598, "step": 167975 }, { "epoch": 2.4741903653848984, "grad_norm": 1.4482413530349731, "learning_rate": 1.123042080941503e-05, "loss": 0.059, "step": 168000 }, { "epoch": 2.4745585484749855, "grad_norm": 1.4696309566497803, "learning_rate": 1.1228784437448046e-05, "loss": 0.0654, "step": 168025 }, { "epoch": 2.4749267315650725, "grad_norm": 1.1336841583251953, "learning_rate": 1.1227148065481062e-05, "loss": 0.0552, "step": 168050 }, { "epoch": 2.4752949146551595, "grad_norm": 1.566132664680481, "learning_rate": 1.1225511693514078e-05, "loss": 0.0559, "step": 168075 }, { "epoch": 2.4756630977452465, "grad_norm": 1.8225576877593994, "learning_rate": 1.1223875321547093e-05, "loss": 0.058, "step": 168100 }, { "epoch": 2.476031280835334, "grad_norm": 1.896125078201294, "learning_rate": 1.1222238949580109e-05, "loss": 0.0632, "step": 168125 }, { "epoch": 2.476399463925421, "grad_norm": 1.2454347610473633, "learning_rate": 1.1220602577613122e-05, "loss": 0.0575, "step": 168150 }, { "epoch": 2.476767647015508, "grad_norm": 1.1391030550003052, "learning_rate": 1.1218966205646138e-05, "loss": 0.0587, "step": 168175 }, { "epoch": 2.477135830105595, "grad_norm": 1.3579012155532837, "learning_rate": 1.1217329833679154e-05, "loss": 0.0595, "step": 168200 }, { "epoch": 2.477504013195682, "grad_norm": 1.5360424518585205, "learning_rate": 1.121569346171217e-05, "loss": 0.0605, "step": 168225 }, { "epoch": 2.477872196285769, "grad_norm": 1.5696964263916016, "learning_rate": 1.1214057089745184e-05, "loss": 0.0562, "step": 168250 }, { "epoch": 2.478240379375856, "grad_norm": 0.9135528802871704, "learning_rate": 1.12124207177782e-05, "loss": 0.056, "step": 168275 }, { "epoch": 2.478608562465943, "grad_norm": 1.3621424436569214, "learning_rate": 1.1210784345811217e-05, "loss": 0.0658, "step": 168300 }, { "epoch": 2.47897674555603, "grad_norm": 1.819261908531189, "learning_rate": 1.1209147973844233e-05, "loss": 0.0595, "step": 168325 }, { "epoch": 2.479344928646117, "grad_norm": 1.3147543668746948, "learning_rate": 1.1207511601877247e-05, "loss": 0.0565, "step": 168350 }, { "epoch": 2.479713111736204, "grad_norm": 1.3996645212173462, "learning_rate": 1.1205875229910262e-05, "loss": 0.0694, "step": 168375 }, { "epoch": 2.480081294826291, "grad_norm": 1.4199289083480835, "learning_rate": 1.1204238857943276e-05, "loss": 0.0585, "step": 168400 }, { "epoch": 2.480449477916378, "grad_norm": 1.6981226205825806, "learning_rate": 1.1202602485976292e-05, "loss": 0.0599, "step": 168425 }, { "epoch": 2.480817661006465, "grad_norm": 1.2532683610916138, "learning_rate": 1.1200966114009309e-05, "loss": 0.0592, "step": 168450 }, { "epoch": 2.481185844096552, "grad_norm": 1.2565698623657227, "learning_rate": 1.1199329742042325e-05, "loss": 0.0608, "step": 168475 }, { "epoch": 2.481554027186639, "grad_norm": 1.2574912309646606, "learning_rate": 1.119769337007534e-05, "loss": 0.0611, "step": 168500 }, { "epoch": 2.4819222102767267, "grad_norm": 1.2927576303482056, "learning_rate": 1.1196056998108355e-05, "loss": 0.0591, "step": 168525 }, { "epoch": 2.4822903933668137, "grad_norm": 1.5040113925933838, "learning_rate": 1.1194420626141371e-05, "loss": 0.0655, "step": 168550 }, { "epoch": 2.4826585764569007, "grad_norm": 1.3379175662994385, "learning_rate": 1.1192784254174384e-05, "loss": 0.0668, "step": 168575 }, { "epoch": 2.4830267595469877, "grad_norm": 1.7368855476379395, "learning_rate": 1.11911478822074e-05, "loss": 0.0651, "step": 168600 }, { "epoch": 2.4833949426370747, "grad_norm": 0.9847592711448669, "learning_rate": 1.1189511510240417e-05, "loss": 0.0595, "step": 168625 }, { "epoch": 2.4837631257271617, "grad_norm": 1.8552279472351074, "learning_rate": 1.1187875138273433e-05, "loss": 0.0628, "step": 168650 }, { "epoch": 2.4841313088172488, "grad_norm": 0.9592670798301697, "learning_rate": 1.1186238766306447e-05, "loss": 0.0563, "step": 168675 }, { "epoch": 2.4844994919073358, "grad_norm": 1.369267463684082, "learning_rate": 1.1184602394339463e-05, "loss": 0.0614, "step": 168700 }, { "epoch": 2.484867674997423, "grad_norm": 1.1809782981872559, "learning_rate": 1.118296602237248e-05, "loss": 0.0536, "step": 168725 }, { "epoch": 2.48523585808751, "grad_norm": 1.6489936113357544, "learning_rate": 1.1181329650405494e-05, "loss": 0.0556, "step": 168750 }, { "epoch": 2.485604041177597, "grad_norm": 1.207336664199829, "learning_rate": 1.117969327843851e-05, "loss": 0.0591, "step": 168775 }, { "epoch": 2.485972224267684, "grad_norm": 1.1333446502685547, "learning_rate": 1.1178056906471525e-05, "loss": 0.0529, "step": 168800 }, { "epoch": 2.486340407357771, "grad_norm": 0.8984009027481079, "learning_rate": 1.1176420534504539e-05, "loss": 0.0616, "step": 168825 }, { "epoch": 2.486708590447858, "grad_norm": 1.1152271032333374, "learning_rate": 1.1174784162537555e-05, "loss": 0.0648, "step": 168850 }, { "epoch": 2.487076773537945, "grad_norm": 1.551584243774414, "learning_rate": 1.1173147790570571e-05, "loss": 0.0607, "step": 168875 }, { "epoch": 2.487444956628032, "grad_norm": 1.3065776824951172, "learning_rate": 1.1171511418603587e-05, "loss": 0.0533, "step": 168900 }, { "epoch": 2.487813139718119, "grad_norm": 1.4792369604110718, "learning_rate": 1.1169875046636602e-05, "loss": 0.066, "step": 168925 }, { "epoch": 2.488181322808206, "grad_norm": 1.4735231399536133, "learning_rate": 1.1168238674669618e-05, "loss": 0.06, "step": 168950 }, { "epoch": 2.488549505898293, "grad_norm": 0.8678383827209473, "learning_rate": 1.1166602302702634e-05, "loss": 0.0623, "step": 168975 }, { "epoch": 2.48891768898838, "grad_norm": 1.2267992496490479, "learning_rate": 1.1164965930735647e-05, "loss": 0.0543, "step": 169000 }, { "epoch": 2.489285872078467, "grad_norm": 1.070836067199707, "learning_rate": 1.1163329558768663e-05, "loss": 0.0582, "step": 169025 }, { "epoch": 2.489654055168554, "grad_norm": 1.0685932636260986, "learning_rate": 1.116169318680168e-05, "loss": 0.0554, "step": 169050 }, { "epoch": 2.4900222382586414, "grad_norm": 1.4122798442840576, "learning_rate": 1.1160056814834694e-05, "loss": 0.0612, "step": 169075 }, { "epoch": 2.4903904213487285, "grad_norm": 1.4628891944885254, "learning_rate": 1.115842044286771e-05, "loss": 0.0583, "step": 169100 }, { "epoch": 2.4907586044388155, "grad_norm": 1.6943978071212769, "learning_rate": 1.1156784070900726e-05, "loss": 0.0501, "step": 169125 }, { "epoch": 2.4911267875289025, "grad_norm": 1.1437104940414429, "learning_rate": 1.1155147698933742e-05, "loss": 0.0673, "step": 169150 }, { "epoch": 2.4914949706189895, "grad_norm": 1.5388097763061523, "learning_rate": 1.1153511326966757e-05, "loss": 0.0578, "step": 169175 }, { "epoch": 2.4918631537090765, "grad_norm": 1.300085186958313, "learning_rate": 1.1151874954999773e-05, "loss": 0.0583, "step": 169200 }, { "epoch": 2.4922313367991635, "grad_norm": 1.4053467512130737, "learning_rate": 1.1150238583032786e-05, "loss": 0.0626, "step": 169225 }, { "epoch": 2.4925995198892505, "grad_norm": 1.4101793766021729, "learning_rate": 1.1148602211065802e-05, "loss": 0.0663, "step": 169250 }, { "epoch": 2.4929677029793376, "grad_norm": 1.3437551259994507, "learning_rate": 1.1146965839098818e-05, "loss": 0.06, "step": 169275 }, { "epoch": 2.4933358860694246, "grad_norm": 1.3879871368408203, "learning_rate": 1.1145329467131834e-05, "loss": 0.0576, "step": 169300 }, { "epoch": 2.4937040691595116, "grad_norm": 1.4387868642807007, "learning_rate": 1.1143693095164848e-05, "loss": 0.0584, "step": 169325 }, { "epoch": 2.4940722522495986, "grad_norm": 1.3832757472991943, "learning_rate": 1.1142056723197865e-05, "loss": 0.0656, "step": 169350 }, { "epoch": 2.4944404353396856, "grad_norm": 1.2323750257492065, "learning_rate": 1.114042035123088e-05, "loss": 0.0608, "step": 169375 }, { "epoch": 2.4948086184297726, "grad_norm": 1.284777045249939, "learning_rate": 1.1138783979263897e-05, "loss": 0.0528, "step": 169400 }, { "epoch": 2.4951768015198597, "grad_norm": 1.9294309616088867, "learning_rate": 1.113714760729691e-05, "loss": 0.0682, "step": 169425 }, { "epoch": 2.4955449846099467, "grad_norm": 1.6158751249313354, "learning_rate": 1.1135511235329926e-05, "loss": 0.0629, "step": 169450 }, { "epoch": 2.4959131677000337, "grad_norm": 1.494902491569519, "learning_rate": 1.1133874863362942e-05, "loss": 0.0576, "step": 169475 }, { "epoch": 2.496281350790121, "grad_norm": 1.114361047744751, "learning_rate": 1.1132238491395956e-05, "loss": 0.0642, "step": 169500 }, { "epoch": 2.496649533880208, "grad_norm": 1.5320008993148804, "learning_rate": 1.1130602119428973e-05, "loss": 0.0625, "step": 169525 }, { "epoch": 2.497017716970295, "grad_norm": 1.7157726287841797, "learning_rate": 1.1128965747461989e-05, "loss": 0.0605, "step": 169550 }, { "epoch": 2.497385900060382, "grad_norm": 1.2340632677078247, "learning_rate": 1.1127329375495003e-05, "loss": 0.0582, "step": 169575 }, { "epoch": 2.497754083150469, "grad_norm": 0.5398813486099243, "learning_rate": 1.112569300352802e-05, "loss": 0.0602, "step": 169600 }, { "epoch": 2.498122266240556, "grad_norm": 1.3092273473739624, "learning_rate": 1.1124056631561034e-05, "loss": 0.0641, "step": 169625 }, { "epoch": 2.4984904493306432, "grad_norm": 1.3375014066696167, "learning_rate": 1.1122420259594048e-05, "loss": 0.0639, "step": 169650 }, { "epoch": 2.4988586324207303, "grad_norm": 1.0961357355117798, "learning_rate": 1.1120783887627064e-05, "loss": 0.0557, "step": 169675 }, { "epoch": 2.4992268155108173, "grad_norm": 1.2570444345474243, "learning_rate": 1.111914751566008e-05, "loss": 0.0522, "step": 169700 }, { "epoch": 2.4995949986009043, "grad_norm": 0.7900687456130981, "learning_rate": 1.1117511143693097e-05, "loss": 0.0534, "step": 169725 }, { "epoch": 2.4999631816909913, "grad_norm": 1.4797840118408203, "learning_rate": 1.1115874771726111e-05, "loss": 0.0582, "step": 169750 }, { "epoch": 2.5003313647810783, "grad_norm": 1.2982577085494995, "learning_rate": 1.1114238399759127e-05, "loss": 0.0588, "step": 169775 }, { "epoch": 2.5006995478711653, "grad_norm": 1.4716572761535645, "learning_rate": 1.1112602027792143e-05, "loss": 0.0634, "step": 169800 }, { "epoch": 2.5010677309612523, "grad_norm": 1.3682631254196167, "learning_rate": 1.111096565582516e-05, "loss": 0.0577, "step": 169825 }, { "epoch": 2.5014359140513394, "grad_norm": 1.2293293476104736, "learning_rate": 1.1109329283858172e-05, "loss": 0.0544, "step": 169850 }, { "epoch": 2.5018040971414264, "grad_norm": 0.9996431469917297, "learning_rate": 1.1107692911891189e-05, "loss": 0.0644, "step": 169875 }, { "epoch": 2.5021722802315134, "grad_norm": 1.39128839969635, "learning_rate": 1.1106056539924203e-05, "loss": 0.0578, "step": 169900 }, { "epoch": 2.5025404633216004, "grad_norm": 1.3627259731292725, "learning_rate": 1.1104485622835899e-05, "loss": 0.0621, "step": 169925 }, { "epoch": 2.5029086464116874, "grad_norm": 1.1350576877593994, "learning_rate": 1.1102849250868915e-05, "loss": 0.0619, "step": 169950 }, { "epoch": 2.5032768295017744, "grad_norm": 1.7639429569244385, "learning_rate": 1.1101212878901931e-05, "loss": 0.0592, "step": 169975 }, { "epoch": 2.5036450125918615, "grad_norm": 1.236167311668396, "learning_rate": 1.1099576506934945e-05, "loss": 0.0545, "step": 170000 }, { "epoch": 2.5040131956819485, "grad_norm": 0.9044005274772644, "learning_rate": 1.1097940134967961e-05, "loss": 0.0596, "step": 170025 }, { "epoch": 2.5043813787720355, "grad_norm": 1.596166729927063, "learning_rate": 1.1096303763000976e-05, "loss": 0.0638, "step": 170050 }, { "epoch": 2.504749561862123, "grad_norm": 1.2062618732452393, "learning_rate": 1.109466739103399e-05, "loss": 0.0594, "step": 170075 }, { "epoch": 2.50511774495221, "grad_norm": 1.2923988103866577, "learning_rate": 1.1093031019067007e-05, "loss": 0.0563, "step": 170100 }, { "epoch": 2.505485928042297, "grad_norm": 0.9151227474212646, "learning_rate": 1.1091394647100023e-05, "loss": 0.0602, "step": 170125 }, { "epoch": 2.505854111132384, "grad_norm": 1.2792390584945679, "learning_rate": 1.1089758275133037e-05, "loss": 0.0499, "step": 170150 }, { "epoch": 2.506222294222471, "grad_norm": 0.8770138025283813, "learning_rate": 1.1088121903166053e-05, "loss": 0.0576, "step": 170175 }, { "epoch": 2.506590477312558, "grad_norm": 1.378714680671692, "learning_rate": 1.108648553119907e-05, "loss": 0.0598, "step": 170200 }, { "epoch": 2.506958660402645, "grad_norm": 1.3941881656646729, "learning_rate": 1.1084849159232086e-05, "loss": 0.0584, "step": 170225 }, { "epoch": 2.507326843492732, "grad_norm": 1.348970651626587, "learning_rate": 1.10832127872651e-05, "loss": 0.0593, "step": 170250 }, { "epoch": 2.507695026582819, "grad_norm": 1.1845784187316895, "learning_rate": 1.1081576415298115e-05, "loss": 0.0588, "step": 170275 }, { "epoch": 2.508063209672906, "grad_norm": 1.4029450416564941, "learning_rate": 1.107994004333113e-05, "loss": 0.0561, "step": 170300 }, { "epoch": 2.508431392762993, "grad_norm": 0.9983207583427429, "learning_rate": 1.1078303671364145e-05, "loss": 0.0593, "step": 170325 }, { "epoch": 2.50879957585308, "grad_norm": 1.1089082956314087, "learning_rate": 1.1076667299397161e-05, "loss": 0.0662, "step": 170350 }, { "epoch": 2.509167758943167, "grad_norm": 1.2326053380966187, "learning_rate": 1.1075030927430177e-05, "loss": 0.0642, "step": 170375 }, { "epoch": 2.509535942033254, "grad_norm": 1.4079872369766235, "learning_rate": 1.1073394555463194e-05, "loss": 0.0551, "step": 170400 }, { "epoch": 2.5099041251233416, "grad_norm": 1.554484486579895, "learning_rate": 1.1071758183496208e-05, "loss": 0.0622, "step": 170425 }, { "epoch": 2.5102723082134286, "grad_norm": 1.2747653722763062, "learning_rate": 1.1070121811529224e-05, "loss": 0.0631, "step": 170450 }, { "epoch": 2.5106404913035156, "grad_norm": 2.409205913543701, "learning_rate": 1.1068485439562237e-05, "loss": 0.0681, "step": 170475 }, { "epoch": 2.5110086743936026, "grad_norm": 0.8650424480438232, "learning_rate": 1.1066849067595253e-05, "loss": 0.0619, "step": 170500 }, { "epoch": 2.5113768574836897, "grad_norm": 1.2267897129058838, "learning_rate": 1.106521269562827e-05, "loss": 0.0557, "step": 170525 }, { "epoch": 2.5117450405737767, "grad_norm": 1.2166086435317993, "learning_rate": 1.1063576323661285e-05, "loss": 0.0664, "step": 170550 }, { "epoch": 2.5121132236638637, "grad_norm": 0.9450547099113464, "learning_rate": 1.10619399516943e-05, "loss": 0.0574, "step": 170575 }, { "epoch": 2.5124814067539507, "grad_norm": 1.2446368932724, "learning_rate": 1.1060303579727316e-05, "loss": 0.0671, "step": 170600 }, { "epoch": 2.5128495898440377, "grad_norm": 1.5243786573410034, "learning_rate": 1.1058667207760332e-05, "loss": 0.0647, "step": 170625 }, { "epoch": 2.5132177729341247, "grad_norm": 1.6719969511032104, "learning_rate": 1.1057030835793348e-05, "loss": 0.0602, "step": 170650 }, { "epoch": 2.5135859560242118, "grad_norm": 1.5910475254058838, "learning_rate": 1.1055394463826363e-05, "loss": 0.0579, "step": 170675 }, { "epoch": 2.5139541391142988, "grad_norm": 1.9802005290985107, "learning_rate": 1.1053758091859377e-05, "loss": 0.0683, "step": 170700 }, { "epoch": 2.514322322204386, "grad_norm": 1.0029993057250977, "learning_rate": 1.1052121719892392e-05, "loss": 0.0549, "step": 170725 }, { "epoch": 2.514690505294473, "grad_norm": 1.3782914876937866, "learning_rate": 1.1050485347925408e-05, "loss": 0.0572, "step": 170750 }, { "epoch": 2.51505868838456, "grad_norm": 1.233562707901001, "learning_rate": 1.1048848975958424e-05, "loss": 0.0572, "step": 170775 }, { "epoch": 2.515426871474647, "grad_norm": 1.3055835962295532, "learning_rate": 1.104721260399144e-05, "loss": 0.0587, "step": 170800 }, { "epoch": 2.515795054564734, "grad_norm": 0.9142024517059326, "learning_rate": 1.1045576232024455e-05, "loss": 0.0573, "step": 170825 }, { "epoch": 2.516163237654821, "grad_norm": 1.1494581699371338, "learning_rate": 1.104393986005747e-05, "loss": 0.0607, "step": 170850 }, { "epoch": 2.516531420744908, "grad_norm": 1.1684163808822632, "learning_rate": 1.1042303488090487e-05, "loss": 0.0575, "step": 170875 }, { "epoch": 2.516899603834995, "grad_norm": 1.2969334125518799, "learning_rate": 1.10406671161235e-05, "loss": 0.0613, "step": 170900 }, { "epoch": 2.517267786925082, "grad_norm": 1.63365638256073, "learning_rate": 1.1039030744156516e-05, "loss": 0.0608, "step": 170925 }, { "epoch": 2.517635970015169, "grad_norm": 1.5004760026931763, "learning_rate": 1.1037394372189532e-05, "loss": 0.0576, "step": 170950 }, { "epoch": 2.518004153105256, "grad_norm": 1.3111398220062256, "learning_rate": 1.1035758000222546e-05, "loss": 0.0582, "step": 170975 }, { "epoch": 2.518372336195343, "grad_norm": 1.0577459335327148, "learning_rate": 1.1034121628255563e-05, "loss": 0.0601, "step": 171000 }, { "epoch": 2.5187405192854304, "grad_norm": 1.2442892789840698, "learning_rate": 1.1032485256288579e-05, "loss": 0.0623, "step": 171025 }, { "epoch": 2.5191087023755174, "grad_norm": 0.9899790287017822, "learning_rate": 1.1030848884321595e-05, "loss": 0.0555, "step": 171050 }, { "epoch": 2.5194768854656044, "grad_norm": 1.1004408597946167, "learning_rate": 1.102921251235461e-05, "loss": 0.0639, "step": 171075 }, { "epoch": 2.5198450685556915, "grad_norm": 1.7649691104888916, "learning_rate": 1.1027576140387624e-05, "loss": 0.0641, "step": 171100 }, { "epoch": 2.5202132516457785, "grad_norm": 1.2538145780563354, "learning_rate": 1.102593976842064e-05, "loss": 0.0651, "step": 171125 }, { "epoch": 2.5205814347358655, "grad_norm": 1.5799659490585327, "learning_rate": 1.1024303396453654e-05, "loss": 0.0566, "step": 171150 }, { "epoch": 2.5209496178259525, "grad_norm": 1.5274546146392822, "learning_rate": 1.102266702448667e-05, "loss": 0.0529, "step": 171175 }, { "epoch": 2.5213178009160395, "grad_norm": 1.1489367485046387, "learning_rate": 1.1021030652519687e-05, "loss": 0.0608, "step": 171200 }, { "epoch": 2.5216859840061265, "grad_norm": 1.1045472621917725, "learning_rate": 1.1019394280552703e-05, "loss": 0.0709, "step": 171225 }, { "epoch": 2.5220541670962136, "grad_norm": 0.944532573223114, "learning_rate": 1.1017757908585717e-05, "loss": 0.0585, "step": 171250 }, { "epoch": 2.5224223501863006, "grad_norm": 1.4226242303848267, "learning_rate": 1.1016121536618733e-05, "loss": 0.0618, "step": 171275 }, { "epoch": 2.5227905332763876, "grad_norm": 1.2734806537628174, "learning_rate": 1.101448516465175e-05, "loss": 0.0634, "step": 171300 }, { "epoch": 2.5231587163664746, "grad_norm": 1.162075161933899, "learning_rate": 1.1012848792684762e-05, "loss": 0.0565, "step": 171325 }, { "epoch": 2.5235268994565616, "grad_norm": 1.5107145309448242, "learning_rate": 1.1011212420717779e-05, "loss": 0.0586, "step": 171350 }, { "epoch": 2.523895082546649, "grad_norm": 1.5202807188034058, "learning_rate": 1.1009576048750795e-05, "loss": 0.0656, "step": 171375 }, { "epoch": 2.524263265636736, "grad_norm": 1.1547309160232544, "learning_rate": 1.100793967678381e-05, "loss": 0.0579, "step": 171400 }, { "epoch": 2.524631448726823, "grad_norm": 1.107877254486084, "learning_rate": 1.1006303304816825e-05, "loss": 0.0572, "step": 171425 }, { "epoch": 2.52499963181691, "grad_norm": 1.3621753454208374, "learning_rate": 1.1004666932849841e-05, "loss": 0.0583, "step": 171450 }, { "epoch": 2.525367814906997, "grad_norm": 1.2165898084640503, "learning_rate": 1.1003030560882858e-05, "loss": 0.0681, "step": 171475 }, { "epoch": 2.525735997997084, "grad_norm": 0.8010907173156738, "learning_rate": 1.1001394188915872e-05, "loss": 0.0596, "step": 171500 }, { "epoch": 2.526104181087171, "grad_norm": 1.4865715503692627, "learning_rate": 1.0999757816948887e-05, "loss": 0.0603, "step": 171525 }, { "epoch": 2.526472364177258, "grad_norm": 1.533263921737671, "learning_rate": 1.0998121444981901e-05, "loss": 0.061, "step": 171550 }, { "epoch": 2.526840547267345, "grad_norm": 1.4903484582901, "learning_rate": 1.0996485073014917e-05, "loss": 0.0564, "step": 171575 }, { "epoch": 2.527208730357432, "grad_norm": 0.9554063081741333, "learning_rate": 1.0994848701047933e-05, "loss": 0.0644, "step": 171600 }, { "epoch": 2.5275769134475192, "grad_norm": 1.4139825105667114, "learning_rate": 1.099321232908095e-05, "loss": 0.0728, "step": 171625 }, { "epoch": 2.5279450965376062, "grad_norm": 1.3340976238250732, "learning_rate": 1.0991575957113964e-05, "loss": 0.0621, "step": 171650 }, { "epoch": 2.5283132796276933, "grad_norm": 1.2663319110870361, "learning_rate": 1.098993958514698e-05, "loss": 0.0575, "step": 171675 }, { "epoch": 2.5286814627177803, "grad_norm": 1.643074631690979, "learning_rate": 1.0988303213179996e-05, "loss": 0.0551, "step": 171700 }, { "epoch": 2.5290496458078673, "grad_norm": 1.7283785343170166, "learning_rate": 1.0986666841213012e-05, "loss": 0.0674, "step": 171725 }, { "epoch": 2.5294178288979543, "grad_norm": 1.2985864877700806, "learning_rate": 1.0985030469246025e-05, "loss": 0.0588, "step": 171750 }, { "epoch": 2.5297860119880413, "grad_norm": 1.1789016723632812, "learning_rate": 1.0983394097279041e-05, "loss": 0.0598, "step": 171775 }, { "epoch": 2.5301541950781283, "grad_norm": 0.9327289462089539, "learning_rate": 1.0981757725312057e-05, "loss": 0.0624, "step": 171800 }, { "epoch": 2.5305223781682153, "grad_norm": 1.516343355178833, "learning_rate": 1.0980121353345072e-05, "loss": 0.0547, "step": 171825 }, { "epoch": 2.5308905612583024, "grad_norm": 1.5967307090759277, "learning_rate": 1.0978484981378088e-05, "loss": 0.0579, "step": 171850 }, { "epoch": 2.5312587443483894, "grad_norm": 1.1730389595031738, "learning_rate": 1.0976848609411104e-05, "loss": 0.0571, "step": 171875 }, { "epoch": 2.5316269274384764, "grad_norm": 1.4589629173278809, "learning_rate": 1.0975212237444119e-05, "loss": 0.0624, "step": 171900 }, { "epoch": 2.5319951105285634, "grad_norm": 1.6210570335388184, "learning_rate": 1.0973575865477135e-05, "loss": 0.0543, "step": 171925 }, { "epoch": 2.5323632936186504, "grad_norm": 1.1741224527359009, "learning_rate": 1.097193949351015e-05, "loss": 0.0584, "step": 171950 }, { "epoch": 2.532731476708738, "grad_norm": 1.2996466159820557, "learning_rate": 1.0970303121543164e-05, "loss": 0.0611, "step": 171975 }, { "epoch": 2.533099659798825, "grad_norm": 1.371118426322937, "learning_rate": 1.096866674957618e-05, "loss": 0.0574, "step": 172000 }, { "epoch": 2.533467842888912, "grad_norm": 1.527109980583191, "learning_rate": 1.0967030377609196e-05, "loss": 0.0606, "step": 172025 }, { "epoch": 2.533836025978999, "grad_norm": 0.7290975451469421, "learning_rate": 1.0965394005642212e-05, "loss": 0.0625, "step": 172050 }, { "epoch": 2.534204209069086, "grad_norm": 1.2977904081344604, "learning_rate": 1.0963757633675227e-05, "loss": 0.0614, "step": 172075 }, { "epoch": 2.534572392159173, "grad_norm": 1.3101640939712524, "learning_rate": 1.0962121261708243e-05, "loss": 0.0562, "step": 172100 }, { "epoch": 2.53494057524926, "grad_norm": 1.228004813194275, "learning_rate": 1.0960484889741259e-05, "loss": 0.0626, "step": 172125 }, { "epoch": 2.535308758339347, "grad_norm": 1.4276083707809448, "learning_rate": 1.0958848517774275e-05, "loss": 0.0571, "step": 172150 }, { "epoch": 2.535676941429434, "grad_norm": 1.3443721532821655, "learning_rate": 1.0957212145807288e-05, "loss": 0.062, "step": 172175 }, { "epoch": 2.536045124519521, "grad_norm": 0.9576521515846252, "learning_rate": 1.0955575773840304e-05, "loss": 0.0548, "step": 172200 }, { "epoch": 2.536413307609608, "grad_norm": 1.0474224090576172, "learning_rate": 1.0953939401873318e-05, "loss": 0.0593, "step": 172225 }, { "epoch": 2.536781490699695, "grad_norm": 1.3131436109542847, "learning_rate": 1.0952303029906335e-05, "loss": 0.0601, "step": 172250 }, { "epoch": 2.537149673789782, "grad_norm": 1.1439170837402344, "learning_rate": 1.095066665793935e-05, "loss": 0.0612, "step": 172275 }, { "epoch": 2.537517856879869, "grad_norm": 1.5104656219482422, "learning_rate": 1.0949030285972367e-05, "loss": 0.0612, "step": 172300 }, { "epoch": 2.537886039969956, "grad_norm": 0.8834033012390137, "learning_rate": 1.0947393914005381e-05, "loss": 0.0595, "step": 172325 }, { "epoch": 2.5382542230600436, "grad_norm": 1.4126570224761963, "learning_rate": 1.0945757542038398e-05, "loss": 0.0645, "step": 172350 }, { "epoch": 2.5386224061501306, "grad_norm": 1.0129644870758057, "learning_rate": 1.0944121170071412e-05, "loss": 0.0574, "step": 172375 }, { "epoch": 2.5389905892402176, "grad_norm": 1.1276706457138062, "learning_rate": 1.0942484798104426e-05, "loss": 0.0569, "step": 172400 }, { "epoch": 2.5393587723303046, "grad_norm": 0.7559683918952942, "learning_rate": 1.0940848426137443e-05, "loss": 0.0558, "step": 172425 }, { "epoch": 2.5397269554203916, "grad_norm": 0.7431871891021729, "learning_rate": 1.0939212054170459e-05, "loss": 0.0549, "step": 172450 }, { "epoch": 2.5400951385104786, "grad_norm": 1.4502934217453003, "learning_rate": 1.0937575682203473e-05, "loss": 0.0563, "step": 172475 }, { "epoch": 2.5404633216005656, "grad_norm": 1.8606306314468384, "learning_rate": 1.093593931023649e-05, "loss": 0.0565, "step": 172500 }, { "epoch": 2.5408315046906527, "grad_norm": 1.6751878261566162, "learning_rate": 1.0934302938269505e-05, "loss": 0.062, "step": 172525 }, { "epoch": 2.5411996877807397, "grad_norm": 1.163288950920105, "learning_rate": 1.0932666566302522e-05, "loss": 0.0578, "step": 172550 }, { "epoch": 2.5415678708708267, "grad_norm": 1.3800410032272339, "learning_rate": 1.0931030194335534e-05, "loss": 0.0639, "step": 172575 }, { "epoch": 2.5419360539609137, "grad_norm": 1.4605334997177124, "learning_rate": 1.092939382236855e-05, "loss": 0.0645, "step": 172600 }, { "epoch": 2.5423042370510007, "grad_norm": 1.5603034496307373, "learning_rate": 1.0927757450401567e-05, "loss": 0.0629, "step": 172625 }, { "epoch": 2.5426724201410877, "grad_norm": 1.1956393718719482, "learning_rate": 1.0926121078434581e-05, "loss": 0.0638, "step": 172650 }, { "epoch": 2.5430406032311748, "grad_norm": 1.3152358531951904, "learning_rate": 1.0924484706467597e-05, "loss": 0.0564, "step": 172675 }, { "epoch": 2.5434087863212618, "grad_norm": 1.3094853162765503, "learning_rate": 1.0922848334500613e-05, "loss": 0.0609, "step": 172700 }, { "epoch": 2.543776969411349, "grad_norm": 1.2942171096801758, "learning_rate": 1.092121196253363e-05, "loss": 0.0546, "step": 172725 }, { "epoch": 2.544145152501436, "grad_norm": 1.4587411880493164, "learning_rate": 1.0919575590566644e-05, "loss": 0.0564, "step": 172750 }, { "epoch": 2.544513335591523, "grad_norm": 1.1731085777282715, "learning_rate": 1.091793921859966e-05, "loss": 0.0624, "step": 172775 }, { "epoch": 2.54488151868161, "grad_norm": 1.484205722808838, "learning_rate": 1.0916302846632673e-05, "loss": 0.0599, "step": 172800 }, { "epoch": 2.545249701771697, "grad_norm": 1.0528786182403564, "learning_rate": 1.0914666474665689e-05, "loss": 0.0589, "step": 172825 }, { "epoch": 2.545617884861784, "grad_norm": 1.2374013662338257, "learning_rate": 1.0913030102698705e-05, "loss": 0.0566, "step": 172850 }, { "epoch": 2.545986067951871, "grad_norm": 1.175444483757019, "learning_rate": 1.0911393730731721e-05, "loss": 0.0592, "step": 172875 }, { "epoch": 2.546354251041958, "grad_norm": 1.2255836725234985, "learning_rate": 1.0909757358764736e-05, "loss": 0.0564, "step": 172900 }, { "epoch": 2.546722434132045, "grad_norm": 2.067763328552246, "learning_rate": 1.0908120986797752e-05, "loss": 0.0662, "step": 172925 }, { "epoch": 2.5470906172221324, "grad_norm": 1.4129890203475952, "learning_rate": 1.0906484614830768e-05, "loss": 0.0515, "step": 172950 }, { "epoch": 2.5474588003122194, "grad_norm": 1.695087194442749, "learning_rate": 1.0904848242863784e-05, "loss": 0.0631, "step": 172975 }, { "epoch": 2.5478269834023064, "grad_norm": 1.375633716583252, "learning_rate": 1.0903211870896797e-05, "loss": 0.0611, "step": 173000 }, { "epoch": 2.5481951664923934, "grad_norm": 1.4690459966659546, "learning_rate": 1.0901575498929813e-05, "loss": 0.0593, "step": 173025 }, { "epoch": 2.5485633495824804, "grad_norm": 2.098743438720703, "learning_rate": 1.0899939126962828e-05, "loss": 0.0638, "step": 173050 }, { "epoch": 2.5489315326725674, "grad_norm": 1.653788685798645, "learning_rate": 1.0898302754995844e-05, "loss": 0.0625, "step": 173075 }, { "epoch": 2.5492997157626545, "grad_norm": 1.2757081985473633, "learning_rate": 1.089666638302886e-05, "loss": 0.0621, "step": 173100 }, { "epoch": 2.5496678988527415, "grad_norm": 1.3836207389831543, "learning_rate": 1.0895030011061876e-05, "loss": 0.0575, "step": 173125 }, { "epoch": 2.5500360819428285, "grad_norm": 0.8787174224853516, "learning_rate": 1.089339363909489e-05, "loss": 0.0509, "step": 173150 }, { "epoch": 2.5504042650329155, "grad_norm": 1.0540931224822998, "learning_rate": 1.0891757267127907e-05, "loss": 0.0606, "step": 173175 }, { "epoch": 2.5507724481230025, "grad_norm": 0.9889383912086487, "learning_rate": 1.0890120895160923e-05, "loss": 0.055, "step": 173200 }, { "epoch": 2.5511406312130895, "grad_norm": 1.5400831699371338, "learning_rate": 1.0888484523193936e-05, "loss": 0.0574, "step": 173225 }, { "epoch": 2.5515088143031766, "grad_norm": 1.1952531337738037, "learning_rate": 1.0886848151226952e-05, "loss": 0.0573, "step": 173250 }, { "epoch": 2.5518769973932636, "grad_norm": 1.44627845287323, "learning_rate": 1.0885211779259968e-05, "loss": 0.0586, "step": 173275 }, { "epoch": 2.552245180483351, "grad_norm": 1.1690912246704102, "learning_rate": 1.0883575407292984e-05, "loss": 0.0588, "step": 173300 }, { "epoch": 2.552613363573438, "grad_norm": 1.0434222221374512, "learning_rate": 1.0882004490204678e-05, "loss": 0.059, "step": 173325 }, { "epoch": 2.552981546663525, "grad_norm": 1.2067543268203735, "learning_rate": 1.0880368118237694e-05, "loss": 0.0609, "step": 173350 }, { "epoch": 2.553349729753612, "grad_norm": 1.1828426122665405, "learning_rate": 1.087873174627071e-05, "loss": 0.0577, "step": 173375 }, { "epoch": 2.553717912843699, "grad_norm": 1.1234699487686157, "learning_rate": 1.0877095374303725e-05, "loss": 0.0528, "step": 173400 }, { "epoch": 2.554086095933786, "grad_norm": 1.2688320875167847, "learning_rate": 1.087545900233674e-05, "loss": 0.0558, "step": 173425 }, { "epoch": 2.554454279023873, "grad_norm": 1.5327723026275635, "learning_rate": 1.0873822630369755e-05, "loss": 0.06, "step": 173450 }, { "epoch": 2.55482246211396, "grad_norm": 1.2880806922912598, "learning_rate": 1.087218625840277e-05, "loss": 0.0557, "step": 173475 }, { "epoch": 2.555190645204047, "grad_norm": 1.1509093046188354, "learning_rate": 1.0870549886435786e-05, "loss": 0.0611, "step": 173500 }, { "epoch": 2.555558828294134, "grad_norm": 1.282781958580017, "learning_rate": 1.0868913514468802e-05, "loss": 0.0599, "step": 173525 }, { "epoch": 2.555927011384221, "grad_norm": 1.4733446836471558, "learning_rate": 1.0867277142501818e-05, "loss": 0.0553, "step": 173550 }, { "epoch": 2.556295194474308, "grad_norm": 1.279699444770813, "learning_rate": 1.0865640770534833e-05, "loss": 0.0565, "step": 173575 }, { "epoch": 2.556663377564395, "grad_norm": 1.5490727424621582, "learning_rate": 1.0864004398567849e-05, "loss": 0.063, "step": 173600 }, { "epoch": 2.5570315606544822, "grad_norm": 1.3272525072097778, "learning_rate": 1.0862368026600865e-05, "loss": 0.0592, "step": 173625 }, { "epoch": 2.5573997437445692, "grad_norm": 1.7039073705673218, "learning_rate": 1.0860731654633878e-05, "loss": 0.0627, "step": 173650 }, { "epoch": 2.5577679268346563, "grad_norm": 1.239800214767456, "learning_rate": 1.0859095282666894e-05, "loss": 0.0652, "step": 173675 }, { "epoch": 2.5581361099247433, "grad_norm": 1.62569260597229, "learning_rate": 1.085745891069991e-05, "loss": 0.065, "step": 173700 }, { "epoch": 2.5585042930148303, "grad_norm": 1.069297194480896, "learning_rate": 1.0855822538732925e-05, "loss": 0.058, "step": 173725 }, { "epoch": 2.5588724761049173, "grad_norm": 1.8260818719863892, "learning_rate": 1.085418616676594e-05, "loss": 0.0539, "step": 173750 }, { "epoch": 2.5592406591950043, "grad_norm": 1.5096160173416138, "learning_rate": 1.0852549794798957e-05, "loss": 0.0625, "step": 173775 }, { "epoch": 2.5596088422850913, "grad_norm": 1.12891685962677, "learning_rate": 1.0850913422831973e-05, "loss": 0.0599, "step": 173800 }, { "epoch": 2.5599770253751783, "grad_norm": 1.8938446044921875, "learning_rate": 1.0849277050864988e-05, "loss": 0.0677, "step": 173825 }, { "epoch": 2.5603452084652654, "grad_norm": 0.8113965392112732, "learning_rate": 1.0847640678898002e-05, "loss": 0.0572, "step": 173850 }, { "epoch": 2.5607133915553524, "grad_norm": 1.1232788562774658, "learning_rate": 1.0846004306931016e-05, "loss": 0.061, "step": 173875 }, { "epoch": 2.56108157464544, "grad_norm": 1.623877763748169, "learning_rate": 1.0844367934964033e-05, "loss": 0.0707, "step": 173900 }, { "epoch": 2.561449757735527, "grad_norm": 1.5341931581497192, "learning_rate": 1.0842731562997049e-05, "loss": 0.0569, "step": 173925 }, { "epoch": 2.561817940825614, "grad_norm": 1.5959268808364868, "learning_rate": 1.0841095191030065e-05, "loss": 0.0597, "step": 173950 }, { "epoch": 2.562186123915701, "grad_norm": 1.1079158782958984, "learning_rate": 1.083945881906308e-05, "loss": 0.0651, "step": 173975 }, { "epoch": 2.562554307005788, "grad_norm": 1.3437291383743286, "learning_rate": 1.0837822447096095e-05, "loss": 0.0605, "step": 174000 }, { "epoch": 2.562922490095875, "grad_norm": 1.8788952827453613, "learning_rate": 1.0836186075129112e-05, "loss": 0.0646, "step": 174025 }, { "epoch": 2.563290673185962, "grad_norm": 1.4978293180465698, "learning_rate": 1.0834549703162128e-05, "loss": 0.0648, "step": 174050 }, { "epoch": 2.563658856276049, "grad_norm": 1.351623773574829, "learning_rate": 1.083291333119514e-05, "loss": 0.0655, "step": 174075 }, { "epoch": 2.564027039366136, "grad_norm": 1.4173803329467773, "learning_rate": 1.0831276959228157e-05, "loss": 0.0547, "step": 174100 }, { "epoch": 2.564395222456223, "grad_norm": 0.7470574975013733, "learning_rate": 1.0829640587261173e-05, "loss": 0.0613, "step": 174125 }, { "epoch": 2.56476340554631, "grad_norm": 1.2393810749053955, "learning_rate": 1.0828004215294187e-05, "loss": 0.0655, "step": 174150 }, { "epoch": 2.565131588636397, "grad_norm": 1.1187108755111694, "learning_rate": 1.0826367843327203e-05, "loss": 0.058, "step": 174175 }, { "epoch": 2.565499771726484, "grad_norm": 1.1925551891326904, "learning_rate": 1.082473147136022e-05, "loss": 0.0612, "step": 174200 }, { "epoch": 2.565867954816571, "grad_norm": 1.777707815170288, "learning_rate": 1.0823095099393234e-05, "loss": 0.0609, "step": 174225 }, { "epoch": 2.5662361379066585, "grad_norm": 1.914480209350586, "learning_rate": 1.082145872742625e-05, "loss": 0.0545, "step": 174250 }, { "epoch": 2.5666043209967455, "grad_norm": 1.142600655555725, "learning_rate": 1.0819822355459265e-05, "loss": 0.0564, "step": 174275 }, { "epoch": 2.5669725040868325, "grad_norm": 1.641832947731018, "learning_rate": 1.0818185983492279e-05, "loss": 0.0694, "step": 174300 }, { "epoch": 2.5673406871769195, "grad_norm": 1.7670449018478394, "learning_rate": 1.0816549611525295e-05, "loss": 0.0577, "step": 174325 }, { "epoch": 2.5677088702670066, "grad_norm": 1.3882704973220825, "learning_rate": 1.0814913239558311e-05, "loss": 0.0603, "step": 174350 }, { "epoch": 2.5680770533570936, "grad_norm": 1.5545358657836914, "learning_rate": 1.0813276867591328e-05, "loss": 0.0563, "step": 174375 }, { "epoch": 2.5684452364471806, "grad_norm": 1.2787203788757324, "learning_rate": 1.0811640495624342e-05, "loss": 0.0528, "step": 174400 }, { "epoch": 2.5688134195372676, "grad_norm": 1.0915101766586304, "learning_rate": 1.0810004123657358e-05, "loss": 0.0554, "step": 174425 }, { "epoch": 2.5691816026273546, "grad_norm": 1.2762171030044556, "learning_rate": 1.0808367751690374e-05, "loss": 0.0593, "step": 174450 }, { "epoch": 2.5695497857174416, "grad_norm": 1.1823515892028809, "learning_rate": 1.0806731379723387e-05, "loss": 0.0623, "step": 174475 }, { "epoch": 2.5699179688075287, "grad_norm": 1.8453294038772583, "learning_rate": 1.0805095007756403e-05, "loss": 0.0611, "step": 174500 }, { "epoch": 2.5702861518976157, "grad_norm": 1.1715410947799683, "learning_rate": 1.080345863578942e-05, "loss": 0.0582, "step": 174525 }, { "epoch": 2.5706543349877027, "grad_norm": 1.1181731224060059, "learning_rate": 1.0801822263822434e-05, "loss": 0.0557, "step": 174550 }, { "epoch": 2.5710225180777897, "grad_norm": 1.0414645671844482, "learning_rate": 1.080018589185545e-05, "loss": 0.0575, "step": 174575 }, { "epoch": 2.5713907011678767, "grad_norm": 1.3872711658477783, "learning_rate": 1.0798549519888466e-05, "loss": 0.0664, "step": 174600 }, { "epoch": 2.5717588842579637, "grad_norm": 1.711892008781433, "learning_rate": 1.0796913147921482e-05, "loss": 0.0582, "step": 174625 }, { "epoch": 2.5721270673480507, "grad_norm": 1.81852388381958, "learning_rate": 1.0795276775954497e-05, "loss": 0.0632, "step": 174650 }, { "epoch": 2.5724952504381378, "grad_norm": 1.6783087253570557, "learning_rate": 1.0793640403987513e-05, "loss": 0.0642, "step": 174675 }, { "epoch": 2.5728634335282248, "grad_norm": 1.4971789121627808, "learning_rate": 1.0792004032020527e-05, "loss": 0.0624, "step": 174700 }, { "epoch": 2.573231616618312, "grad_norm": 1.3275350332260132, "learning_rate": 1.0790367660053542e-05, "loss": 0.0637, "step": 174725 }, { "epoch": 2.573599799708399, "grad_norm": 1.269001841545105, "learning_rate": 1.0788731288086558e-05, "loss": 0.0545, "step": 174750 }, { "epoch": 2.573967982798486, "grad_norm": 1.5252752304077148, "learning_rate": 1.0787094916119574e-05, "loss": 0.0596, "step": 174775 }, { "epoch": 2.574336165888573, "grad_norm": 1.285728096961975, "learning_rate": 1.0785458544152589e-05, "loss": 0.0595, "step": 174800 }, { "epoch": 2.57470434897866, "grad_norm": 1.3012200593948364, "learning_rate": 1.0783822172185605e-05, "loss": 0.0563, "step": 174825 }, { "epoch": 2.5750725320687473, "grad_norm": 1.4969555139541626, "learning_rate": 1.0782185800218621e-05, "loss": 0.0619, "step": 174850 }, { "epoch": 2.5754407151588343, "grad_norm": 1.3967996835708618, "learning_rate": 1.0780549428251637e-05, "loss": 0.0596, "step": 174875 }, { "epoch": 2.5758088982489213, "grad_norm": 1.6889322996139526, "learning_rate": 1.077891305628465e-05, "loss": 0.0588, "step": 174900 }, { "epoch": 2.5761770813390084, "grad_norm": 1.378534197807312, "learning_rate": 1.0777276684317666e-05, "loss": 0.0602, "step": 174925 }, { "epoch": 2.5765452644290954, "grad_norm": 1.597387433052063, "learning_rate": 1.0775640312350682e-05, "loss": 0.0559, "step": 174950 }, { "epoch": 2.5769134475191824, "grad_norm": 1.8913077116012573, "learning_rate": 1.0774003940383697e-05, "loss": 0.068, "step": 174975 }, { "epoch": 2.5772816306092694, "grad_norm": 1.259865403175354, "learning_rate": 1.0772367568416713e-05, "loss": 0.0538, "step": 175000 }, { "epoch": 2.5776498136993564, "grad_norm": 0.9669091105461121, "learning_rate": 1.0770731196449729e-05, "loss": 0.0602, "step": 175025 }, { "epoch": 2.5780179967894434, "grad_norm": 1.1503703594207764, "learning_rate": 1.0769094824482745e-05, "loss": 0.0486, "step": 175050 }, { "epoch": 2.5783861798795304, "grad_norm": 1.5549144744873047, "learning_rate": 1.076745845251576e-05, "loss": 0.0566, "step": 175075 }, { "epoch": 2.5787543629696175, "grad_norm": 0.9050664901733398, "learning_rate": 1.0765822080548776e-05, "loss": 0.0601, "step": 175100 }, { "epoch": 2.5791225460597045, "grad_norm": 1.4891036748886108, "learning_rate": 1.0764185708581788e-05, "loss": 0.0568, "step": 175125 }, { "epoch": 2.5794907291497915, "grad_norm": 1.5478847026824951, "learning_rate": 1.0762549336614805e-05, "loss": 0.0596, "step": 175150 }, { "epoch": 2.5798589122398785, "grad_norm": 1.3112157583236694, "learning_rate": 1.076091296464782e-05, "loss": 0.0574, "step": 175175 }, { "epoch": 2.5802270953299655, "grad_norm": 1.233964443206787, "learning_rate": 1.0759276592680837e-05, "loss": 0.053, "step": 175200 }, { "epoch": 2.580595278420053, "grad_norm": 0.8812665343284607, "learning_rate": 1.0757640220713851e-05, "loss": 0.0581, "step": 175225 }, { "epoch": 2.58096346151014, "grad_norm": 1.3952767848968506, "learning_rate": 1.0756003848746867e-05, "loss": 0.0504, "step": 175250 }, { "epoch": 2.581331644600227, "grad_norm": 1.034899115562439, "learning_rate": 1.0754367476779884e-05, "loss": 0.0546, "step": 175275 }, { "epoch": 2.581699827690314, "grad_norm": 2.012953996658325, "learning_rate": 1.07527311048129e-05, "loss": 0.0577, "step": 175300 }, { "epoch": 2.582068010780401, "grad_norm": 1.2987000942230225, "learning_rate": 1.0751094732845913e-05, "loss": 0.0645, "step": 175325 }, { "epoch": 2.582436193870488, "grad_norm": 1.3182604312896729, "learning_rate": 1.0749458360878929e-05, "loss": 0.057, "step": 175350 }, { "epoch": 2.582804376960575, "grad_norm": 1.5478579998016357, "learning_rate": 1.0747821988911943e-05, "loss": 0.0605, "step": 175375 }, { "epoch": 2.583172560050662, "grad_norm": 1.0221810340881348, "learning_rate": 1.074618561694496e-05, "loss": 0.0603, "step": 175400 }, { "epoch": 2.583540743140749, "grad_norm": 1.7871437072753906, "learning_rate": 1.0744549244977975e-05, "loss": 0.0553, "step": 175425 }, { "epoch": 2.583908926230836, "grad_norm": 1.3152027130126953, "learning_rate": 1.0742912873010992e-05, "loss": 0.0568, "step": 175450 }, { "epoch": 2.584277109320923, "grad_norm": 1.2218319177627563, "learning_rate": 1.0741276501044006e-05, "loss": 0.0606, "step": 175475 }, { "epoch": 2.58464529241101, "grad_norm": 1.649019479751587, "learning_rate": 1.0739640129077022e-05, "loss": 0.0582, "step": 175500 }, { "epoch": 2.585013475501097, "grad_norm": 1.5051950216293335, "learning_rate": 1.0738003757110038e-05, "loss": 0.0602, "step": 175525 }, { "epoch": 2.585381658591184, "grad_norm": 1.6265712976455688, "learning_rate": 1.0736367385143051e-05, "loss": 0.0691, "step": 175550 }, { "epoch": 2.585749841681271, "grad_norm": 1.2473348379135132, "learning_rate": 1.0734731013176067e-05, "loss": 0.0581, "step": 175575 }, { "epoch": 2.586118024771358, "grad_norm": 1.1284838914871216, "learning_rate": 1.0733094641209083e-05, "loss": 0.058, "step": 175600 }, { "epoch": 2.5864862078614452, "grad_norm": 1.1544748544692993, "learning_rate": 1.07314582692421e-05, "loss": 0.0625, "step": 175625 }, { "epoch": 2.5868543909515322, "grad_norm": 1.856658935546875, "learning_rate": 1.0729821897275114e-05, "loss": 0.0602, "step": 175650 }, { "epoch": 2.5872225740416193, "grad_norm": 1.3996042013168335, "learning_rate": 1.072818552530813e-05, "loss": 0.0626, "step": 175675 }, { "epoch": 2.5875907571317063, "grad_norm": 1.4461251497268677, "learning_rate": 1.0726549153341146e-05, "loss": 0.0661, "step": 175700 }, { "epoch": 2.5879589402217933, "grad_norm": 1.5906156301498413, "learning_rate": 1.072491278137416e-05, "loss": 0.0666, "step": 175725 }, { "epoch": 2.5883271233118803, "grad_norm": 1.1601279973983765, "learning_rate": 1.0723276409407175e-05, "loss": 0.0535, "step": 175750 }, { "epoch": 2.5886953064019673, "grad_norm": 1.4376877546310425, "learning_rate": 1.0721640037440191e-05, "loss": 0.0573, "step": 175775 }, { "epoch": 2.5890634894920543, "grad_norm": 1.4215503931045532, "learning_rate": 1.0720003665473206e-05, "loss": 0.0592, "step": 175800 }, { "epoch": 2.589431672582142, "grad_norm": 1.3166005611419678, "learning_rate": 1.0718367293506222e-05, "loss": 0.0643, "step": 175825 }, { "epoch": 2.589799855672229, "grad_norm": 1.6120201349258423, "learning_rate": 1.0716730921539238e-05, "loss": 0.0646, "step": 175850 }, { "epoch": 2.590168038762316, "grad_norm": 1.91206693649292, "learning_rate": 1.0715094549572254e-05, "loss": 0.0594, "step": 175875 }, { "epoch": 2.590536221852403, "grad_norm": 1.233803629875183, "learning_rate": 1.0713458177605269e-05, "loss": 0.0539, "step": 175900 }, { "epoch": 2.59090440494249, "grad_norm": 1.8472914695739746, "learning_rate": 1.0711821805638285e-05, "loss": 0.0652, "step": 175925 }, { "epoch": 2.591272588032577, "grad_norm": 1.537962555885315, "learning_rate": 1.0710185433671298e-05, "loss": 0.0577, "step": 175950 }, { "epoch": 2.591640771122664, "grad_norm": 0.8386574387550354, "learning_rate": 1.0708549061704314e-05, "loss": 0.0536, "step": 175975 }, { "epoch": 2.592008954212751, "grad_norm": 1.2876434326171875, "learning_rate": 1.070691268973733e-05, "loss": 0.061, "step": 176000 }, { "epoch": 2.592377137302838, "grad_norm": 1.6931270360946655, "learning_rate": 1.0705276317770346e-05, "loss": 0.0539, "step": 176025 }, { "epoch": 2.592745320392925, "grad_norm": 1.1909981966018677, "learning_rate": 1.070363994580336e-05, "loss": 0.0619, "step": 176050 }, { "epoch": 2.593113503483012, "grad_norm": 1.2086323499679565, "learning_rate": 1.0702003573836377e-05, "loss": 0.0532, "step": 176075 }, { "epoch": 2.593481686573099, "grad_norm": 1.4572724103927612, "learning_rate": 1.0700367201869393e-05, "loss": 0.056, "step": 176100 }, { "epoch": 2.593849869663186, "grad_norm": 1.4585648775100708, "learning_rate": 1.0698730829902409e-05, "loss": 0.0677, "step": 176125 }, { "epoch": 2.594218052753273, "grad_norm": 1.7802493572235107, "learning_rate": 1.0697094457935424e-05, "loss": 0.0601, "step": 176150 }, { "epoch": 2.5945862358433605, "grad_norm": 1.6509461402893066, "learning_rate": 1.0695458085968438e-05, "loss": 0.0589, "step": 176175 }, { "epoch": 2.5949544189334475, "grad_norm": 1.4764903783798218, "learning_rate": 1.0693821714001454e-05, "loss": 0.0637, "step": 176200 }, { "epoch": 2.5953226020235345, "grad_norm": 1.2520935535430908, "learning_rate": 1.0692185342034469e-05, "loss": 0.0604, "step": 176225 }, { "epoch": 2.5956907851136215, "grad_norm": 1.5063178539276123, "learning_rate": 1.0690548970067485e-05, "loss": 0.0556, "step": 176250 }, { "epoch": 2.5960589682037085, "grad_norm": 1.5612396001815796, "learning_rate": 1.0688912598100501e-05, "loss": 0.0626, "step": 176275 }, { "epoch": 2.5964271512937955, "grad_norm": 1.3243896961212158, "learning_rate": 1.0687276226133515e-05, "loss": 0.065, "step": 176300 }, { "epoch": 2.5967953343838825, "grad_norm": 0.7383653521537781, "learning_rate": 1.0685639854166532e-05, "loss": 0.0543, "step": 176325 }, { "epoch": 2.5971635174739696, "grad_norm": 1.701586365699768, "learning_rate": 1.0684003482199548e-05, "loss": 0.0538, "step": 176350 }, { "epoch": 2.5975317005640566, "grad_norm": 1.2080687284469604, "learning_rate": 1.068236711023256e-05, "loss": 0.054, "step": 176375 }, { "epoch": 2.5978998836541436, "grad_norm": 0.8733327984809875, "learning_rate": 1.0680730738265577e-05, "loss": 0.0581, "step": 176400 }, { "epoch": 2.5982680667442306, "grad_norm": 1.3134326934814453, "learning_rate": 1.0679094366298593e-05, "loss": 0.0651, "step": 176425 }, { "epoch": 2.5986362498343176, "grad_norm": 1.738545298576355, "learning_rate": 1.0677457994331609e-05, "loss": 0.0623, "step": 176450 }, { "epoch": 2.5990044329244046, "grad_norm": 1.4502187967300415, "learning_rate": 1.0675821622364623e-05, "loss": 0.0648, "step": 176475 }, { "epoch": 2.5993726160144917, "grad_norm": 1.1931694746017456, "learning_rate": 1.067418525039764e-05, "loss": 0.0623, "step": 176500 }, { "epoch": 2.5997407991045787, "grad_norm": 2.380855083465576, "learning_rate": 1.0672548878430656e-05, "loss": 0.0677, "step": 176525 }, { "epoch": 2.6001089821946657, "grad_norm": 1.5158504247665405, "learning_rate": 1.067091250646367e-05, "loss": 0.058, "step": 176550 }, { "epoch": 2.6004771652847527, "grad_norm": 1.609561800956726, "learning_rate": 1.0669276134496686e-05, "loss": 0.0576, "step": 176575 }, { "epoch": 2.6008453483748397, "grad_norm": 1.1952263116836548, "learning_rate": 1.06676397625297e-05, "loss": 0.0568, "step": 176600 }, { "epoch": 2.6012135314649267, "grad_norm": 0.928909182548523, "learning_rate": 1.0666003390562715e-05, "loss": 0.0591, "step": 176625 }, { "epoch": 2.6015817145550137, "grad_norm": 1.8413697481155396, "learning_rate": 1.0664367018595731e-05, "loss": 0.0657, "step": 176650 }, { "epoch": 2.6019498976451008, "grad_norm": 1.7064995765686035, "learning_rate": 1.0662730646628747e-05, "loss": 0.0651, "step": 176675 }, { "epoch": 2.6023180807351878, "grad_norm": 1.1123034954071045, "learning_rate": 1.0661094274661764e-05, "loss": 0.056, "step": 176700 }, { "epoch": 2.602686263825275, "grad_norm": 1.1000996828079224, "learning_rate": 1.0659457902694778e-05, "loss": 0.0579, "step": 176725 }, { "epoch": 2.603054446915362, "grad_norm": 1.422594428062439, "learning_rate": 1.0657821530727794e-05, "loss": 0.0564, "step": 176750 }, { "epoch": 2.6034226300054493, "grad_norm": 1.125012755393982, "learning_rate": 1.065618515876081e-05, "loss": 0.0528, "step": 176775 }, { "epoch": 2.6037908130955363, "grad_norm": 1.297001838684082, "learning_rate": 1.0654548786793823e-05, "loss": 0.0614, "step": 176800 }, { "epoch": 2.6041589961856233, "grad_norm": 1.121140480041504, "learning_rate": 1.065291241482684e-05, "loss": 0.0554, "step": 176825 }, { "epoch": 2.6045271792757103, "grad_norm": 1.47142493724823, "learning_rate": 1.0651276042859855e-05, "loss": 0.0611, "step": 176850 }, { "epoch": 2.6048953623657973, "grad_norm": 1.4502065181732178, "learning_rate": 1.064963967089287e-05, "loss": 0.0663, "step": 176875 }, { "epoch": 2.6052635454558843, "grad_norm": 1.3370851278305054, "learning_rate": 1.0648003298925886e-05, "loss": 0.0592, "step": 176900 }, { "epoch": 2.6056317285459714, "grad_norm": 0.9253253936767578, "learning_rate": 1.0646366926958902e-05, "loss": 0.0599, "step": 176925 }, { "epoch": 2.6059999116360584, "grad_norm": 0.8026204705238342, "learning_rate": 1.0644730554991918e-05, "loss": 0.0581, "step": 176950 }, { "epoch": 2.6063680947261454, "grad_norm": 1.0097384452819824, "learning_rate": 1.0643094183024933e-05, "loss": 0.0579, "step": 176975 }, { "epoch": 2.6067362778162324, "grad_norm": 1.2711149454116821, "learning_rate": 1.0641457811057949e-05, "loss": 0.0634, "step": 177000 }, { "epoch": 2.6071044609063194, "grad_norm": 1.2087106704711914, "learning_rate": 1.0639821439090963e-05, "loss": 0.0588, "step": 177025 }, { "epoch": 2.6074726439964064, "grad_norm": 1.7671455144882202, "learning_rate": 1.0638185067123978e-05, "loss": 0.0627, "step": 177050 }, { "epoch": 2.6078408270864935, "grad_norm": 1.6097408533096313, "learning_rate": 1.0636548695156994e-05, "loss": 0.0637, "step": 177075 }, { "epoch": 2.6082090101765805, "grad_norm": 1.0626698732376099, "learning_rate": 1.063491232319001e-05, "loss": 0.0553, "step": 177100 }, { "epoch": 2.6085771932666675, "grad_norm": 1.4588572978973389, "learning_rate": 1.0633275951223025e-05, "loss": 0.065, "step": 177125 }, { "epoch": 2.608945376356755, "grad_norm": 0.9147449731826782, "learning_rate": 1.063163957925604e-05, "loss": 0.056, "step": 177150 }, { "epoch": 2.609313559446842, "grad_norm": 1.1432408094406128, "learning_rate": 1.0630003207289057e-05, "loss": 0.0652, "step": 177175 }, { "epoch": 2.609681742536929, "grad_norm": 1.362931489944458, "learning_rate": 1.0628366835322073e-05, "loss": 0.0697, "step": 177200 }, { "epoch": 2.610049925627016, "grad_norm": 0.7644778490066528, "learning_rate": 1.0626730463355086e-05, "loss": 0.055, "step": 177225 }, { "epoch": 2.610418108717103, "grad_norm": 1.5069628953933716, "learning_rate": 1.0625094091388102e-05, "loss": 0.0588, "step": 177250 }, { "epoch": 2.61078629180719, "grad_norm": 1.3445383310317993, "learning_rate": 1.0623523174299798e-05, "loss": 0.0705, "step": 177275 }, { "epoch": 2.611154474897277, "grad_norm": 1.5314680337905884, "learning_rate": 1.0621886802332812e-05, "loss": 0.0605, "step": 177300 }, { "epoch": 2.611522657987364, "grad_norm": 1.4227596521377563, "learning_rate": 1.0620250430365828e-05, "loss": 0.0615, "step": 177325 }, { "epoch": 2.611890841077451, "grad_norm": 1.6466411352157593, "learning_rate": 1.0618614058398844e-05, "loss": 0.0557, "step": 177350 }, { "epoch": 2.612259024167538, "grad_norm": 1.2078698873519897, "learning_rate": 1.061697768643186e-05, "loss": 0.0673, "step": 177375 }, { "epoch": 2.612627207257625, "grad_norm": 1.6566219329833984, "learning_rate": 1.0615341314464875e-05, "loss": 0.0542, "step": 177400 }, { "epoch": 2.612995390347712, "grad_norm": 1.2760077714920044, "learning_rate": 1.061370494249789e-05, "loss": 0.0585, "step": 177425 }, { "epoch": 2.613363573437799, "grad_norm": 1.257825493812561, "learning_rate": 1.0612068570530904e-05, "loss": 0.061, "step": 177450 }, { "epoch": 2.613731756527886, "grad_norm": 0.9882104396820068, "learning_rate": 1.061043219856392e-05, "loss": 0.0563, "step": 177475 }, { "epoch": 2.614099939617973, "grad_norm": 1.5817950963974, "learning_rate": 1.0608795826596936e-05, "loss": 0.0602, "step": 177500 }, { "epoch": 2.61446812270806, "grad_norm": 1.0192517042160034, "learning_rate": 1.0607159454629952e-05, "loss": 0.0618, "step": 177525 }, { "epoch": 2.614836305798147, "grad_norm": 1.6604156494140625, "learning_rate": 1.0605523082662967e-05, "loss": 0.069, "step": 177550 }, { "epoch": 2.615204488888234, "grad_norm": 1.4611319303512573, "learning_rate": 1.0603886710695983e-05, "loss": 0.0656, "step": 177575 }, { "epoch": 2.615572671978321, "grad_norm": 1.1192989349365234, "learning_rate": 1.0602250338728999e-05, "loss": 0.0592, "step": 177600 }, { "epoch": 2.6159408550684082, "grad_norm": 1.1363897323608398, "learning_rate": 1.0600613966762015e-05, "loss": 0.0633, "step": 177625 }, { "epoch": 2.6163090381584952, "grad_norm": 1.5756871700286865, "learning_rate": 1.0598977594795028e-05, "loss": 0.0561, "step": 177650 }, { "epoch": 2.6166772212485823, "grad_norm": 1.8618441820144653, "learning_rate": 1.0597341222828044e-05, "loss": 0.0657, "step": 177675 }, { "epoch": 2.6170454043386693, "grad_norm": 0.9189126491546631, "learning_rate": 1.0595704850861059e-05, "loss": 0.0553, "step": 177700 }, { "epoch": 2.6174135874287567, "grad_norm": 1.7426941394805908, "learning_rate": 1.0594068478894075e-05, "loss": 0.063, "step": 177725 }, { "epoch": 2.6177817705188438, "grad_norm": 1.4612746238708496, "learning_rate": 1.0592432106927091e-05, "loss": 0.0561, "step": 177750 }, { "epoch": 2.6181499536089308, "grad_norm": 1.1249966621398926, "learning_rate": 1.0590795734960107e-05, "loss": 0.0691, "step": 177775 }, { "epoch": 2.618518136699018, "grad_norm": 1.174182653427124, "learning_rate": 1.0589159362993122e-05, "loss": 0.0618, "step": 177800 }, { "epoch": 2.618886319789105, "grad_norm": 0.9717144966125488, "learning_rate": 1.0587522991026138e-05, "loss": 0.0547, "step": 177825 }, { "epoch": 2.619254502879192, "grad_norm": 1.2711896896362305, "learning_rate": 1.0585886619059152e-05, "loss": 0.0626, "step": 177850 }, { "epoch": 2.619622685969279, "grad_norm": 1.4679142236709595, "learning_rate": 1.0584250247092167e-05, "loss": 0.0562, "step": 177875 }, { "epoch": 2.619990869059366, "grad_norm": 1.3813141584396362, "learning_rate": 1.0582613875125183e-05, "loss": 0.0577, "step": 177900 }, { "epoch": 2.620359052149453, "grad_norm": 1.1439592838287354, "learning_rate": 1.0580977503158199e-05, "loss": 0.0544, "step": 177925 }, { "epoch": 2.62072723523954, "grad_norm": 1.5633246898651123, "learning_rate": 1.0579341131191215e-05, "loss": 0.0523, "step": 177950 }, { "epoch": 2.621095418329627, "grad_norm": 1.2544136047363281, "learning_rate": 1.057770475922423e-05, "loss": 0.0643, "step": 177975 }, { "epoch": 2.621463601419714, "grad_norm": 1.5179665088653564, "learning_rate": 1.0576068387257246e-05, "loss": 0.0619, "step": 178000 }, { "epoch": 2.621831784509801, "grad_norm": 1.1079788208007812, "learning_rate": 1.0574432015290262e-05, "loss": 0.06, "step": 178025 }, { "epoch": 2.622199967599888, "grad_norm": 1.4913833141326904, "learning_rate": 1.0572795643323276e-05, "loss": 0.0645, "step": 178050 }, { "epoch": 2.622568150689975, "grad_norm": 1.2368390560150146, "learning_rate": 1.057115927135629e-05, "loss": 0.0581, "step": 178075 }, { "epoch": 2.6229363337800624, "grad_norm": 1.7132432460784912, "learning_rate": 1.0569522899389307e-05, "loss": 0.0593, "step": 178100 }, { "epoch": 2.6233045168701494, "grad_norm": 1.515574336051941, "learning_rate": 1.0567886527422321e-05, "loss": 0.0579, "step": 178125 }, { "epoch": 2.6236726999602364, "grad_norm": 1.7901968955993652, "learning_rate": 1.0566250155455337e-05, "loss": 0.0569, "step": 178150 }, { "epoch": 2.6240408830503235, "grad_norm": 1.4704879522323608, "learning_rate": 1.0564613783488354e-05, "loss": 0.05, "step": 178175 }, { "epoch": 2.6244090661404105, "grad_norm": 0.9033634066581726, "learning_rate": 1.056297741152137e-05, "loss": 0.0617, "step": 178200 }, { "epoch": 2.6247772492304975, "grad_norm": 1.3718925714492798, "learning_rate": 1.0561341039554384e-05, "loss": 0.0597, "step": 178225 }, { "epoch": 2.6251454323205845, "grad_norm": 1.5202617645263672, "learning_rate": 1.05597046675874e-05, "loss": 0.0643, "step": 178250 }, { "epoch": 2.6255136154106715, "grad_norm": 1.2252256870269775, "learning_rate": 1.0558068295620413e-05, "loss": 0.0608, "step": 178275 }, { "epoch": 2.6258817985007585, "grad_norm": 1.1312342882156372, "learning_rate": 1.055643192365343e-05, "loss": 0.0628, "step": 178300 }, { "epoch": 2.6262499815908456, "grad_norm": 1.2931057214736938, "learning_rate": 1.0554795551686445e-05, "loss": 0.0628, "step": 178325 }, { "epoch": 2.6266181646809326, "grad_norm": 1.1888556480407715, "learning_rate": 1.0553159179719462e-05, "loss": 0.0635, "step": 178350 }, { "epoch": 2.6269863477710196, "grad_norm": 1.7914178371429443, "learning_rate": 1.0551522807752476e-05, "loss": 0.0554, "step": 178375 }, { "epoch": 2.6273545308611066, "grad_norm": 1.2818809747695923, "learning_rate": 1.0549886435785492e-05, "loss": 0.062, "step": 178400 }, { "epoch": 2.6277227139511936, "grad_norm": 1.2769396305084229, "learning_rate": 1.0548250063818508e-05, "loss": 0.0591, "step": 178425 }, { "epoch": 2.6280908970412806, "grad_norm": 1.2644082307815552, "learning_rate": 1.0546613691851525e-05, "loss": 0.0544, "step": 178450 }, { "epoch": 2.6284590801313676, "grad_norm": 1.585900068283081, "learning_rate": 1.0544977319884539e-05, "loss": 0.0576, "step": 178475 }, { "epoch": 2.6288272632214547, "grad_norm": 1.2027477025985718, "learning_rate": 1.0543340947917553e-05, "loss": 0.0575, "step": 178500 }, { "epoch": 2.6291954463115417, "grad_norm": 1.3439468145370483, "learning_rate": 1.0541704575950568e-05, "loss": 0.0623, "step": 178525 }, { "epoch": 2.6295636294016287, "grad_norm": 1.4071685075759888, "learning_rate": 1.0540068203983584e-05, "loss": 0.0554, "step": 178550 }, { "epoch": 2.6299318124917157, "grad_norm": 1.3623868227005005, "learning_rate": 1.05384318320166e-05, "loss": 0.0625, "step": 178575 }, { "epoch": 2.6302999955818027, "grad_norm": 1.6273113489151, "learning_rate": 1.0536795460049616e-05, "loss": 0.0586, "step": 178600 }, { "epoch": 2.6306681786718897, "grad_norm": 1.7515143156051636, "learning_rate": 1.053515908808263e-05, "loss": 0.0587, "step": 178625 }, { "epoch": 2.6310363617619767, "grad_norm": 1.0457935333251953, "learning_rate": 1.0533522716115647e-05, "loss": 0.0591, "step": 178650 }, { "epoch": 2.6314045448520638, "grad_norm": 0.7781147360801697, "learning_rate": 1.0531886344148663e-05, "loss": 0.056, "step": 178675 }, { "epoch": 2.6317727279421512, "grad_norm": 1.553816318511963, "learning_rate": 1.0530249972181676e-05, "loss": 0.0545, "step": 178700 }, { "epoch": 2.6321409110322382, "grad_norm": 1.633156657218933, "learning_rate": 1.0528613600214692e-05, "loss": 0.0559, "step": 178725 }, { "epoch": 2.6325090941223253, "grad_norm": 1.4244391918182373, "learning_rate": 1.0526977228247708e-05, "loss": 0.0552, "step": 178750 }, { "epoch": 2.6328772772124123, "grad_norm": 0.9048569202423096, "learning_rate": 1.0525340856280724e-05, "loss": 0.0548, "step": 178775 }, { "epoch": 2.6332454603024993, "grad_norm": 1.279308795928955, "learning_rate": 1.0523704484313739e-05, "loss": 0.0544, "step": 178800 }, { "epoch": 2.6336136433925863, "grad_norm": 1.0054556131362915, "learning_rate": 1.0522068112346755e-05, "loss": 0.0565, "step": 178825 }, { "epoch": 2.6339818264826733, "grad_norm": 1.6993483304977417, "learning_rate": 1.0520431740379771e-05, "loss": 0.0599, "step": 178850 }, { "epoch": 2.6343500095727603, "grad_norm": 1.3486400842666626, "learning_rate": 1.0518795368412786e-05, "loss": 0.0606, "step": 178875 }, { "epoch": 2.6347181926628473, "grad_norm": 1.2938765287399292, "learning_rate": 1.05171589964458e-05, "loss": 0.0579, "step": 178900 }, { "epoch": 2.6350863757529344, "grad_norm": 1.1513739824295044, "learning_rate": 1.0515522624478816e-05, "loss": 0.0558, "step": 178925 }, { "epoch": 2.6354545588430214, "grad_norm": 1.2724298238754272, "learning_rate": 1.051388625251183e-05, "loss": 0.0531, "step": 178950 }, { "epoch": 2.6358227419331084, "grad_norm": 1.5667345523834229, "learning_rate": 1.0512249880544847e-05, "loss": 0.0578, "step": 178975 }, { "epoch": 2.6361909250231954, "grad_norm": 1.5931990146636963, "learning_rate": 1.0510613508577863e-05, "loss": 0.0597, "step": 179000 }, { "epoch": 2.6365591081132824, "grad_norm": 1.4145405292510986, "learning_rate": 1.0508977136610879e-05, "loss": 0.0558, "step": 179025 }, { "epoch": 2.63692729120337, "grad_norm": 1.402555227279663, "learning_rate": 1.0507340764643894e-05, "loss": 0.0583, "step": 179050 }, { "epoch": 2.637295474293457, "grad_norm": 1.4087753295898438, "learning_rate": 1.050570439267691e-05, "loss": 0.0596, "step": 179075 }, { "epoch": 2.637663657383544, "grad_norm": 1.6323907375335693, "learning_rate": 1.0504068020709926e-05, "loss": 0.0617, "step": 179100 }, { "epoch": 2.638031840473631, "grad_norm": 1.080856442451477, "learning_rate": 1.0502431648742939e-05, "loss": 0.0622, "step": 179125 }, { "epoch": 2.638400023563718, "grad_norm": 1.7374264001846313, "learning_rate": 1.0500795276775955e-05, "loss": 0.0564, "step": 179150 }, { "epoch": 2.638768206653805, "grad_norm": 1.0557068586349487, "learning_rate": 1.0499158904808971e-05, "loss": 0.0551, "step": 179175 }, { "epoch": 2.639136389743892, "grad_norm": 0.7824804782867432, "learning_rate": 1.0497522532841985e-05, "loss": 0.05, "step": 179200 }, { "epoch": 2.639504572833979, "grad_norm": 1.5242996215820312, "learning_rate": 1.0495886160875001e-05, "loss": 0.0595, "step": 179225 }, { "epoch": 2.639872755924066, "grad_norm": 1.93537437915802, "learning_rate": 1.0494249788908018e-05, "loss": 0.0596, "step": 179250 }, { "epoch": 2.640240939014153, "grad_norm": 0.8486847281455994, "learning_rate": 1.0492613416941034e-05, "loss": 0.0664, "step": 179275 }, { "epoch": 2.64060912210424, "grad_norm": 1.0348448753356934, "learning_rate": 1.0490977044974048e-05, "loss": 0.0648, "step": 179300 }, { "epoch": 2.640977305194327, "grad_norm": 1.3246134519577026, "learning_rate": 1.0489340673007063e-05, "loss": 0.0591, "step": 179325 }, { "epoch": 2.641345488284414, "grad_norm": 1.2293293476104736, "learning_rate": 1.0487704301040079e-05, "loss": 0.0618, "step": 179350 }, { "epoch": 2.641713671374501, "grad_norm": 1.6534684896469116, "learning_rate": 1.0486067929073093e-05, "loss": 0.0616, "step": 179375 }, { "epoch": 2.642081854464588, "grad_norm": 1.2842737436294556, "learning_rate": 1.048443155710611e-05, "loss": 0.0509, "step": 179400 }, { "epoch": 2.642450037554675, "grad_norm": 1.3370314836502075, "learning_rate": 1.0482795185139126e-05, "loss": 0.0632, "step": 179425 }, { "epoch": 2.642818220644762, "grad_norm": 1.1738921403884888, "learning_rate": 1.048115881317214e-05, "loss": 0.0631, "step": 179450 }, { "epoch": 2.643186403734849, "grad_norm": 1.4158613681793213, "learning_rate": 1.0479522441205156e-05, "loss": 0.0536, "step": 179475 }, { "epoch": 2.643554586824936, "grad_norm": 1.729436993598938, "learning_rate": 1.0477886069238172e-05, "loss": 0.06, "step": 179500 }, { "epoch": 2.643922769915023, "grad_norm": 1.5001217126846313, "learning_rate": 1.0476249697271189e-05, "loss": 0.0559, "step": 179525 }, { "epoch": 2.64429095300511, "grad_norm": 1.3951103687286377, "learning_rate": 1.0474613325304201e-05, "loss": 0.0672, "step": 179550 }, { "epoch": 2.644659136095197, "grad_norm": 1.2371768951416016, "learning_rate": 1.0472976953337217e-05, "loss": 0.0588, "step": 179575 }, { "epoch": 2.645027319185284, "grad_norm": 1.9210978746414185, "learning_rate": 1.0471340581370234e-05, "loss": 0.0629, "step": 179600 }, { "epoch": 2.6453955022753712, "grad_norm": 1.996050238609314, "learning_rate": 1.0469704209403248e-05, "loss": 0.0604, "step": 179625 }, { "epoch": 2.6457636853654587, "grad_norm": 1.2609702348709106, "learning_rate": 1.0468067837436264e-05, "loss": 0.0575, "step": 179650 }, { "epoch": 2.6461318684555457, "grad_norm": 1.7031487226486206, "learning_rate": 1.046643146546928e-05, "loss": 0.0625, "step": 179675 }, { "epoch": 2.6465000515456327, "grad_norm": 1.2939832210540771, "learning_rate": 1.0464795093502297e-05, "loss": 0.058, "step": 179700 }, { "epoch": 2.6468682346357197, "grad_norm": 1.6751508712768555, "learning_rate": 1.0463158721535311e-05, "loss": 0.0618, "step": 179725 }, { "epoch": 2.6472364177258068, "grad_norm": 1.25525963306427, "learning_rate": 1.0461522349568325e-05, "loss": 0.0661, "step": 179750 }, { "epoch": 2.6476046008158938, "grad_norm": 0.7105354070663452, "learning_rate": 1.045988597760134e-05, "loss": 0.0616, "step": 179775 }, { "epoch": 2.647972783905981, "grad_norm": 1.251990556716919, "learning_rate": 1.0458249605634356e-05, "loss": 0.0592, "step": 179800 }, { "epoch": 2.648340966996068, "grad_norm": 0.9045290946960449, "learning_rate": 1.0456613233667372e-05, "loss": 0.0612, "step": 179825 }, { "epoch": 2.648709150086155, "grad_norm": 1.2470132112503052, "learning_rate": 1.0454976861700388e-05, "loss": 0.0642, "step": 179850 }, { "epoch": 2.649077333176242, "grad_norm": 1.1372416019439697, "learning_rate": 1.0453340489733403e-05, "loss": 0.0578, "step": 179875 }, { "epoch": 2.649445516266329, "grad_norm": 1.4001555442810059, "learning_rate": 1.0451704117766419e-05, "loss": 0.0601, "step": 179900 }, { "epoch": 2.649813699356416, "grad_norm": 1.0540432929992676, "learning_rate": 1.0450067745799435e-05, "loss": 0.0537, "step": 179925 }, { "epoch": 2.650181882446503, "grad_norm": 0.8042251467704773, "learning_rate": 1.0448431373832451e-05, "loss": 0.0598, "step": 179950 }, { "epoch": 2.65055006553659, "grad_norm": 1.184349775314331, "learning_rate": 1.0446795001865464e-05, "loss": 0.0679, "step": 179975 }, { "epoch": 2.650918248626677, "grad_norm": 1.085429072380066, "learning_rate": 1.044515862989848e-05, "loss": 0.0613, "step": 180000 }, { "epoch": 2.6512864317167644, "grad_norm": 1.4456406831741333, "learning_rate": 1.0443522257931495e-05, "loss": 0.0649, "step": 180025 }, { "epoch": 2.6516546148068514, "grad_norm": 1.3142204284667969, "learning_rate": 1.044188588596451e-05, "loss": 0.0624, "step": 180050 }, { "epoch": 2.6520227978969384, "grad_norm": 1.2352559566497803, "learning_rate": 1.0440249513997527e-05, "loss": 0.0614, "step": 180075 }, { "epoch": 2.6523909809870254, "grad_norm": 1.8774738311767578, "learning_rate": 1.0438613142030543e-05, "loss": 0.0564, "step": 180100 }, { "epoch": 2.6527591640771124, "grad_norm": 1.3439046144485474, "learning_rate": 1.0436976770063558e-05, "loss": 0.0552, "step": 180125 }, { "epoch": 2.6531273471671994, "grad_norm": 1.3229684829711914, "learning_rate": 1.0435340398096574e-05, "loss": 0.0612, "step": 180150 }, { "epoch": 2.6534955302572865, "grad_norm": 1.5237793922424316, "learning_rate": 1.0433704026129588e-05, "loss": 0.0587, "step": 180175 }, { "epoch": 2.6538637133473735, "grad_norm": 1.4181277751922607, "learning_rate": 1.0432067654162603e-05, "loss": 0.0579, "step": 180200 }, { "epoch": 2.6542318964374605, "grad_norm": 0.8959699869155884, "learning_rate": 1.0430431282195619e-05, "loss": 0.0641, "step": 180225 }, { "epoch": 2.6546000795275475, "grad_norm": 1.299402117729187, "learning_rate": 1.0428794910228635e-05, "loss": 0.0669, "step": 180250 }, { "epoch": 2.6549682626176345, "grad_norm": 1.4478684663772583, "learning_rate": 1.0427158538261651e-05, "loss": 0.0588, "step": 180275 }, { "epoch": 2.6553364457077215, "grad_norm": 1.7965478897094727, "learning_rate": 1.0425522166294666e-05, "loss": 0.0587, "step": 180300 }, { "epoch": 2.6557046287978086, "grad_norm": 1.7480050325393677, "learning_rate": 1.0423885794327682e-05, "loss": 0.06, "step": 180325 }, { "epoch": 2.6560728118878956, "grad_norm": 1.9673129320144653, "learning_rate": 1.0422249422360698e-05, "loss": 0.0626, "step": 180350 }, { "epoch": 2.6564409949779826, "grad_norm": 1.3611135482788086, "learning_rate": 1.042061305039371e-05, "loss": 0.059, "step": 180375 }, { "epoch": 2.6568091780680696, "grad_norm": 1.392541766166687, "learning_rate": 1.0418976678426727e-05, "loss": 0.0619, "step": 180400 }, { "epoch": 2.6571773611581566, "grad_norm": 1.6021356582641602, "learning_rate": 1.0417340306459743e-05, "loss": 0.0605, "step": 180425 }, { "epoch": 2.6575455442482436, "grad_norm": 1.6446949243545532, "learning_rate": 1.0415703934492757e-05, "loss": 0.0616, "step": 180450 }, { "epoch": 2.6579137273383306, "grad_norm": 1.6089822053909302, "learning_rate": 1.0414067562525773e-05, "loss": 0.0617, "step": 180475 }, { "epoch": 2.6582819104284177, "grad_norm": 1.604913592338562, "learning_rate": 1.041243119055879e-05, "loss": 0.0607, "step": 180500 }, { "epoch": 2.6586500935185047, "grad_norm": 1.532718539237976, "learning_rate": 1.0410794818591806e-05, "loss": 0.0671, "step": 180525 }, { "epoch": 2.6590182766085917, "grad_norm": 1.3487820625305176, "learning_rate": 1.040915844662482e-05, "loss": 0.0561, "step": 180550 }, { "epoch": 2.6593864596986787, "grad_norm": 1.6589494943618774, "learning_rate": 1.0407522074657836e-05, "loss": 0.0679, "step": 180575 }, { "epoch": 2.6597546427887657, "grad_norm": 1.7940930128097534, "learning_rate": 1.040588570269085e-05, "loss": 0.0604, "step": 180600 }, { "epoch": 2.660122825878853, "grad_norm": 1.5940213203430176, "learning_rate": 1.0404249330723865e-05, "loss": 0.0629, "step": 180625 }, { "epoch": 2.66049100896894, "grad_norm": 1.478488564491272, "learning_rate": 1.0402612958756881e-05, "loss": 0.0647, "step": 180650 }, { "epoch": 2.660859192059027, "grad_norm": 1.3178554773330688, "learning_rate": 1.0400976586789898e-05, "loss": 0.058, "step": 180675 }, { "epoch": 2.6612273751491142, "grad_norm": 1.3025761842727661, "learning_rate": 1.0399340214822912e-05, "loss": 0.0574, "step": 180700 }, { "epoch": 2.6615955582392012, "grad_norm": 1.010762095451355, "learning_rate": 1.0397703842855928e-05, "loss": 0.0566, "step": 180725 }, { "epoch": 2.6619637413292883, "grad_norm": 1.4706687927246094, "learning_rate": 1.0396067470888944e-05, "loss": 0.0647, "step": 180750 }, { "epoch": 2.6623319244193753, "grad_norm": 1.3642903566360474, "learning_rate": 1.039443109892196e-05, "loss": 0.0571, "step": 180775 }, { "epoch": 2.6627001075094623, "grad_norm": 1.0870933532714844, "learning_rate": 1.0392794726954973e-05, "loss": 0.0578, "step": 180800 }, { "epoch": 2.6630682905995493, "grad_norm": 0.9486274123191833, "learning_rate": 1.039115835498799e-05, "loss": 0.0534, "step": 180825 }, { "epoch": 2.6634364736896363, "grad_norm": 1.5033544301986694, "learning_rate": 1.0389521983021006e-05, "loss": 0.0597, "step": 180850 }, { "epoch": 2.6638046567797233, "grad_norm": 1.4026621580123901, "learning_rate": 1.038788561105402e-05, "loss": 0.0551, "step": 180875 }, { "epoch": 2.6641728398698103, "grad_norm": 1.653326392173767, "learning_rate": 1.0386249239087036e-05, "loss": 0.0607, "step": 180900 }, { "epoch": 2.6645410229598974, "grad_norm": 1.20106840133667, "learning_rate": 1.0384612867120052e-05, "loss": 0.0609, "step": 180925 }, { "epoch": 2.6649092060499844, "grad_norm": 1.3902958631515503, "learning_rate": 1.0382976495153067e-05, "loss": 0.0563, "step": 180950 }, { "epoch": 2.665277389140072, "grad_norm": 1.240664005279541, "learning_rate": 1.0381340123186083e-05, "loss": 0.0548, "step": 180975 }, { "epoch": 2.665645572230159, "grad_norm": 1.4866517782211304, "learning_rate": 1.0379703751219099e-05, "loss": 0.0525, "step": 181000 }, { "epoch": 2.666013755320246, "grad_norm": 1.3387285470962524, "learning_rate": 1.0378132834130791e-05, "loss": 0.0574, "step": 181025 }, { "epoch": 2.666381938410333, "grad_norm": 1.2695602178573608, "learning_rate": 1.0376496462163807e-05, "loss": 0.0538, "step": 181050 }, { "epoch": 2.66675012150042, "grad_norm": 0.8715903162956238, "learning_rate": 1.0374860090196824e-05, "loss": 0.0577, "step": 181075 }, { "epoch": 2.667118304590507, "grad_norm": 1.5755671262741089, "learning_rate": 1.037322371822984e-05, "loss": 0.0568, "step": 181100 }, { "epoch": 2.667486487680594, "grad_norm": 1.0684443712234497, "learning_rate": 1.0371587346262854e-05, "loss": 0.0573, "step": 181125 }, { "epoch": 2.667854670770681, "grad_norm": 1.036328911781311, "learning_rate": 1.036995097429587e-05, "loss": 0.0599, "step": 181150 }, { "epoch": 2.668222853860768, "grad_norm": 1.6287823915481567, "learning_rate": 1.0368314602328887e-05, "loss": 0.0638, "step": 181175 }, { "epoch": 2.668591036950855, "grad_norm": 1.5044418573379517, "learning_rate": 1.0366678230361901e-05, "loss": 0.0543, "step": 181200 }, { "epoch": 2.668959220040942, "grad_norm": 1.2453765869140625, "learning_rate": 1.0365041858394915e-05, "loss": 0.0606, "step": 181225 }, { "epoch": 2.669327403131029, "grad_norm": 1.1694879531860352, "learning_rate": 1.0363405486427932e-05, "loss": 0.0556, "step": 181250 }, { "epoch": 2.669695586221116, "grad_norm": 1.2286295890808105, "learning_rate": 1.0361769114460946e-05, "loss": 0.0574, "step": 181275 }, { "epoch": 2.670063769311203, "grad_norm": 1.068738579750061, "learning_rate": 1.0360132742493962e-05, "loss": 0.0645, "step": 181300 }, { "epoch": 2.67043195240129, "grad_norm": 1.381239891052246, "learning_rate": 1.0358496370526978e-05, "loss": 0.0567, "step": 181325 }, { "epoch": 2.670800135491377, "grad_norm": 1.7819762229919434, "learning_rate": 1.0356859998559994e-05, "loss": 0.0624, "step": 181350 }, { "epoch": 2.671168318581464, "grad_norm": 1.2299953699111938, "learning_rate": 1.0355223626593009e-05, "loss": 0.0551, "step": 181375 }, { "epoch": 2.671536501671551, "grad_norm": 1.6271673440933228, "learning_rate": 1.0353587254626025e-05, "loss": 0.0583, "step": 181400 }, { "epoch": 2.671904684761638, "grad_norm": 1.5756698846817017, "learning_rate": 1.0351950882659041e-05, "loss": 0.0567, "step": 181425 }, { "epoch": 2.672272867851725, "grad_norm": 1.4155449867248535, "learning_rate": 1.0350314510692054e-05, "loss": 0.0588, "step": 181450 }, { "epoch": 2.672641050941812, "grad_norm": 1.3141456842422485, "learning_rate": 1.034867813872507e-05, "loss": 0.065, "step": 181475 }, { "epoch": 2.673009234031899, "grad_norm": 1.2345174551010132, "learning_rate": 1.0347041766758086e-05, "loss": 0.0513, "step": 181500 }, { "epoch": 2.673377417121986, "grad_norm": 1.3157939910888672, "learning_rate": 1.03454053947911e-05, "loss": 0.0706, "step": 181525 }, { "epoch": 2.673745600212073, "grad_norm": 1.5634199380874634, "learning_rate": 1.0343769022824117e-05, "loss": 0.0549, "step": 181550 }, { "epoch": 2.6741137833021607, "grad_norm": 1.149841070175171, "learning_rate": 1.0342132650857133e-05, "loss": 0.0657, "step": 181575 }, { "epoch": 2.6744819663922477, "grad_norm": 1.2981230020523071, "learning_rate": 1.034049627889015e-05, "loss": 0.0591, "step": 181600 }, { "epoch": 2.6748501494823347, "grad_norm": 0.9457674026489258, "learning_rate": 1.0338859906923164e-05, "loss": 0.0559, "step": 181625 }, { "epoch": 2.6752183325724217, "grad_norm": 1.519731044769287, "learning_rate": 1.0337223534956178e-05, "loss": 0.0629, "step": 181650 }, { "epoch": 2.6755865156625087, "grad_norm": 0.9381436109542847, "learning_rate": 1.0335587162989194e-05, "loss": 0.0591, "step": 181675 }, { "epoch": 2.6759546987525957, "grad_norm": 1.9244869947433472, "learning_rate": 1.0333950791022209e-05, "loss": 0.0602, "step": 181700 }, { "epoch": 2.6763228818426827, "grad_norm": 1.1269614696502686, "learning_rate": 1.0332314419055225e-05, "loss": 0.0569, "step": 181725 }, { "epoch": 2.6766910649327698, "grad_norm": 1.834871530532837, "learning_rate": 1.0330678047088241e-05, "loss": 0.0593, "step": 181750 }, { "epoch": 2.6770592480228568, "grad_norm": 2.0192348957061768, "learning_rate": 1.0329041675121256e-05, "loss": 0.065, "step": 181775 }, { "epoch": 2.677427431112944, "grad_norm": 1.3955726623535156, "learning_rate": 1.0327405303154272e-05, "loss": 0.0611, "step": 181800 }, { "epoch": 2.677795614203031, "grad_norm": 0.8697972297668457, "learning_rate": 1.0325768931187288e-05, "loss": 0.0708, "step": 181825 }, { "epoch": 2.678163797293118, "grad_norm": 1.6199911832809448, "learning_rate": 1.03241325592203e-05, "loss": 0.0612, "step": 181850 }, { "epoch": 2.678531980383205, "grad_norm": 1.4861690998077393, "learning_rate": 1.0322496187253317e-05, "loss": 0.0614, "step": 181875 }, { "epoch": 2.678900163473292, "grad_norm": 1.5810816287994385, "learning_rate": 1.0320859815286333e-05, "loss": 0.061, "step": 181900 }, { "epoch": 2.6792683465633793, "grad_norm": 0.9216818809509277, "learning_rate": 1.0319223443319349e-05, "loss": 0.0607, "step": 181925 }, { "epoch": 2.6796365296534663, "grad_norm": 1.6507874727249146, "learning_rate": 1.0317587071352363e-05, "loss": 0.0598, "step": 181950 }, { "epoch": 2.6800047127435533, "grad_norm": 0.9667916297912598, "learning_rate": 1.031595069938538e-05, "loss": 0.0605, "step": 181975 }, { "epoch": 2.6803728958336404, "grad_norm": 1.180269718170166, "learning_rate": 1.0314314327418396e-05, "loss": 0.0568, "step": 182000 }, { "epoch": 2.6807410789237274, "grad_norm": 1.100769281387329, "learning_rate": 1.0312677955451412e-05, "loss": 0.0626, "step": 182025 }, { "epoch": 2.6811092620138144, "grad_norm": 1.3334780931472778, "learning_rate": 1.0311041583484426e-05, "loss": 0.0689, "step": 182050 }, { "epoch": 2.6814774451039014, "grad_norm": 1.484243631362915, "learning_rate": 1.0309405211517441e-05, "loss": 0.0606, "step": 182075 }, { "epoch": 2.6818456281939884, "grad_norm": 1.574515700340271, "learning_rate": 1.0307768839550455e-05, "loss": 0.0495, "step": 182100 }, { "epoch": 2.6822138112840754, "grad_norm": 1.226787805557251, "learning_rate": 1.0306132467583471e-05, "loss": 0.0526, "step": 182125 }, { "epoch": 2.6825819943741624, "grad_norm": 1.495360255241394, "learning_rate": 1.0304496095616488e-05, "loss": 0.0601, "step": 182150 }, { "epoch": 2.6829501774642495, "grad_norm": 1.9710766077041626, "learning_rate": 1.0302859723649504e-05, "loss": 0.062, "step": 182175 }, { "epoch": 2.6833183605543365, "grad_norm": 1.479754090309143, "learning_rate": 1.0301223351682518e-05, "loss": 0.0616, "step": 182200 }, { "epoch": 2.6836865436444235, "grad_norm": 1.2248257398605347, "learning_rate": 1.0299586979715534e-05, "loss": 0.0529, "step": 182225 }, { "epoch": 2.6840547267345105, "grad_norm": 1.264366626739502, "learning_rate": 1.029795060774855e-05, "loss": 0.0598, "step": 182250 }, { "epoch": 2.6844229098245975, "grad_norm": 1.2543889284133911, "learning_rate": 1.0296314235781563e-05, "loss": 0.0615, "step": 182275 }, { "epoch": 2.6847910929146845, "grad_norm": 1.1692509651184082, "learning_rate": 1.029467786381458e-05, "loss": 0.0611, "step": 182300 }, { "epoch": 2.6851592760047716, "grad_norm": 2.0396153926849365, "learning_rate": 1.0293041491847596e-05, "loss": 0.0589, "step": 182325 }, { "epoch": 2.6855274590948586, "grad_norm": 1.1993850469589233, "learning_rate": 1.029140511988061e-05, "loss": 0.0634, "step": 182350 }, { "epoch": 2.6858956421849456, "grad_norm": 1.1579844951629639, "learning_rate": 1.0289768747913626e-05, "loss": 0.0566, "step": 182375 }, { "epoch": 2.6862638252750326, "grad_norm": 0.9792632460594177, "learning_rate": 1.0288132375946642e-05, "loss": 0.0551, "step": 182400 }, { "epoch": 2.6866320083651196, "grad_norm": 1.3327845335006714, "learning_rate": 1.0286496003979659e-05, "loss": 0.0551, "step": 182425 }, { "epoch": 2.6870001914552066, "grad_norm": 1.2504613399505615, "learning_rate": 1.0284859632012673e-05, "loss": 0.0575, "step": 182450 }, { "epoch": 2.6873683745452936, "grad_norm": 1.36167311668396, "learning_rate": 1.0283223260045689e-05, "loss": 0.0582, "step": 182475 }, { "epoch": 2.6877365576353807, "grad_norm": 1.4674620628356934, "learning_rate": 1.0281586888078704e-05, "loss": 0.0553, "step": 182500 }, { "epoch": 2.688104740725468, "grad_norm": 1.6915929317474365, "learning_rate": 1.0279950516111718e-05, "loss": 0.0558, "step": 182525 }, { "epoch": 2.688472923815555, "grad_norm": 1.7129229307174683, "learning_rate": 1.0278314144144734e-05, "loss": 0.058, "step": 182550 }, { "epoch": 2.688841106905642, "grad_norm": 1.9740345478057861, "learning_rate": 1.027667777217775e-05, "loss": 0.0554, "step": 182575 }, { "epoch": 2.689209289995729, "grad_norm": 1.830592393875122, "learning_rate": 1.0275041400210766e-05, "loss": 0.0591, "step": 182600 }, { "epoch": 2.689577473085816, "grad_norm": 1.1833752393722534, "learning_rate": 1.0273405028243781e-05, "loss": 0.0594, "step": 182625 }, { "epoch": 2.689945656175903, "grad_norm": 1.2470561265945435, "learning_rate": 1.0271768656276797e-05, "loss": 0.0619, "step": 182650 }, { "epoch": 2.69031383926599, "grad_norm": 1.4791810512542725, "learning_rate": 1.0270132284309813e-05, "loss": 0.062, "step": 182675 }, { "epoch": 2.6906820223560772, "grad_norm": 1.0999714136123657, "learning_rate": 1.0268495912342826e-05, "loss": 0.0613, "step": 182700 }, { "epoch": 2.6910502054461642, "grad_norm": 1.560540795326233, "learning_rate": 1.0266859540375842e-05, "loss": 0.0626, "step": 182725 }, { "epoch": 2.6914183885362513, "grad_norm": 1.7665736675262451, "learning_rate": 1.0265223168408858e-05, "loss": 0.0655, "step": 182750 }, { "epoch": 2.6917865716263383, "grad_norm": 1.2932590246200562, "learning_rate": 1.0263586796441873e-05, "loss": 0.0583, "step": 182775 }, { "epoch": 2.6921547547164253, "grad_norm": 0.8844801187515259, "learning_rate": 1.0261950424474889e-05, "loss": 0.0517, "step": 182800 }, { "epoch": 2.6925229378065123, "grad_norm": 1.4140905141830444, "learning_rate": 1.0260314052507905e-05, "loss": 0.0613, "step": 182825 }, { "epoch": 2.6928911208965993, "grad_norm": 1.438362717628479, "learning_rate": 1.0258677680540921e-05, "loss": 0.0495, "step": 182850 }, { "epoch": 2.6932593039866863, "grad_norm": 1.283368706703186, "learning_rate": 1.0257041308573936e-05, "loss": 0.0623, "step": 182875 }, { "epoch": 2.693627487076774, "grad_norm": 1.3483699560165405, "learning_rate": 1.0255404936606952e-05, "loss": 0.0664, "step": 182900 }, { "epoch": 2.693995670166861, "grad_norm": 1.1190013885498047, "learning_rate": 1.0253768564639965e-05, "loss": 0.0613, "step": 182925 }, { "epoch": 2.694363853256948, "grad_norm": 1.2656058073043823, "learning_rate": 1.025213219267298e-05, "loss": 0.0557, "step": 182950 }, { "epoch": 2.694732036347035, "grad_norm": 1.3406060934066772, "learning_rate": 1.0250495820705997e-05, "loss": 0.0638, "step": 182975 }, { "epoch": 2.695100219437122, "grad_norm": 1.6723088026046753, "learning_rate": 1.0248859448739013e-05, "loss": 0.0623, "step": 183000 }, { "epoch": 2.695468402527209, "grad_norm": 1.4660754203796387, "learning_rate": 1.0247223076772028e-05, "loss": 0.0602, "step": 183025 }, { "epoch": 2.695836585617296, "grad_norm": 1.482211947441101, "learning_rate": 1.0245586704805044e-05, "loss": 0.0583, "step": 183050 }, { "epoch": 2.696204768707383, "grad_norm": 1.2991197109222412, "learning_rate": 1.024395033283806e-05, "loss": 0.0672, "step": 183075 }, { "epoch": 2.69657295179747, "grad_norm": 1.8289599418640137, "learning_rate": 1.0242313960871076e-05, "loss": 0.0647, "step": 183100 }, { "epoch": 2.696941134887557, "grad_norm": 1.2484925985336304, "learning_rate": 1.0240743043782768e-05, "loss": 0.0571, "step": 183125 }, { "epoch": 2.697309317977644, "grad_norm": 1.3957743644714355, "learning_rate": 1.0239106671815784e-05, "loss": 0.0583, "step": 183150 }, { "epoch": 2.697677501067731, "grad_norm": 0.8832376599311829, "learning_rate": 1.0237470299848799e-05, "loss": 0.0627, "step": 183175 }, { "epoch": 2.698045684157818, "grad_norm": 1.1454840898513794, "learning_rate": 1.0235833927881815e-05, "loss": 0.0547, "step": 183200 }, { "epoch": 2.698413867247905, "grad_norm": 1.428775668144226, "learning_rate": 1.0234197555914831e-05, "loss": 0.0609, "step": 183225 }, { "epoch": 2.698782050337992, "grad_norm": 1.228224754333496, "learning_rate": 1.0232561183947847e-05, "loss": 0.0587, "step": 183250 }, { "epoch": 2.699150233428079, "grad_norm": 1.730570673942566, "learning_rate": 1.0230924811980862e-05, "loss": 0.0708, "step": 183275 }, { "epoch": 2.699518416518166, "grad_norm": 1.202194094657898, "learning_rate": 1.0229288440013878e-05, "loss": 0.0561, "step": 183300 }, { "epoch": 2.699886599608253, "grad_norm": 1.2887914180755615, "learning_rate": 1.0227652068046894e-05, "loss": 0.0612, "step": 183325 }, { "epoch": 2.70025478269834, "grad_norm": 1.7454698085784912, "learning_rate": 1.0226015696079907e-05, "loss": 0.0716, "step": 183350 }, { "epoch": 2.700622965788427, "grad_norm": 1.3642715215682983, "learning_rate": 1.0224379324112923e-05, "loss": 0.0597, "step": 183375 }, { "epoch": 2.700991148878514, "grad_norm": 1.5880589485168457, "learning_rate": 1.0222742952145939e-05, "loss": 0.0587, "step": 183400 }, { "epoch": 2.701359331968601, "grad_norm": 1.3546175956726074, "learning_rate": 1.0221106580178955e-05, "loss": 0.0556, "step": 183425 }, { "epoch": 2.701727515058688, "grad_norm": 1.6443510055541992, "learning_rate": 1.021947020821197e-05, "loss": 0.0564, "step": 183450 }, { "epoch": 2.702095698148775, "grad_norm": 1.2800062894821167, "learning_rate": 1.0217833836244986e-05, "loss": 0.05, "step": 183475 }, { "epoch": 2.7024638812388626, "grad_norm": 0.8485159873962402, "learning_rate": 1.0216197464278002e-05, "loss": 0.0629, "step": 183500 }, { "epoch": 2.7028320643289496, "grad_norm": 1.1817010641098022, "learning_rate": 1.0214561092311016e-05, "loss": 0.057, "step": 183525 }, { "epoch": 2.7032002474190366, "grad_norm": 0.9812033772468567, "learning_rate": 1.0212924720344031e-05, "loss": 0.0588, "step": 183550 }, { "epoch": 2.7035684305091237, "grad_norm": 1.533164381980896, "learning_rate": 1.0211288348377047e-05, "loss": 0.0617, "step": 183575 }, { "epoch": 2.7039366135992107, "grad_norm": 1.741667628288269, "learning_rate": 1.0209651976410061e-05, "loss": 0.0591, "step": 183600 }, { "epoch": 2.7043047966892977, "grad_norm": 1.0580602884292603, "learning_rate": 1.0208015604443078e-05, "loss": 0.0608, "step": 183625 }, { "epoch": 2.7046729797793847, "grad_norm": 1.3728079795837402, "learning_rate": 1.0206379232476094e-05, "loss": 0.0539, "step": 183650 }, { "epoch": 2.7050411628694717, "grad_norm": 1.7485744953155518, "learning_rate": 1.020474286050911e-05, "loss": 0.058, "step": 183675 }, { "epoch": 2.7054093459595587, "grad_norm": 1.710504412651062, "learning_rate": 1.0203106488542124e-05, "loss": 0.0619, "step": 183700 }, { "epoch": 2.7057775290496457, "grad_norm": 1.5417767763137817, "learning_rate": 1.020147011657514e-05, "loss": 0.0622, "step": 183725 }, { "epoch": 2.7061457121397328, "grad_norm": 1.6605935096740723, "learning_rate": 1.0199833744608153e-05, "loss": 0.0631, "step": 183750 }, { "epoch": 2.7065138952298198, "grad_norm": 1.4925915002822876, "learning_rate": 1.019819737264117e-05, "loss": 0.0607, "step": 183775 }, { "epoch": 2.706882078319907, "grad_norm": 0.6819592118263245, "learning_rate": 1.0196561000674186e-05, "loss": 0.0568, "step": 183800 }, { "epoch": 2.707250261409994, "grad_norm": 1.4788029193878174, "learning_rate": 1.0194924628707202e-05, "loss": 0.064, "step": 183825 }, { "epoch": 2.7076184445000813, "grad_norm": 1.4906667470932007, "learning_rate": 1.0193288256740216e-05, "loss": 0.0507, "step": 183850 }, { "epoch": 2.7079866275901683, "grad_norm": 1.268228530883789, "learning_rate": 1.0191651884773232e-05, "loss": 0.0611, "step": 183875 }, { "epoch": 2.7083548106802553, "grad_norm": 1.1930062770843506, "learning_rate": 1.0190015512806249e-05, "loss": 0.0586, "step": 183900 }, { "epoch": 2.7087229937703423, "grad_norm": 1.4462981224060059, "learning_rate": 1.0188379140839265e-05, "loss": 0.0587, "step": 183925 }, { "epoch": 2.7090911768604293, "grad_norm": 1.7341102361679077, "learning_rate": 1.0186742768872279e-05, "loss": 0.0656, "step": 183950 }, { "epoch": 2.7094593599505163, "grad_norm": 1.2396494150161743, "learning_rate": 1.0185106396905294e-05, "loss": 0.0585, "step": 183975 }, { "epoch": 2.7098275430406034, "grad_norm": 1.207575798034668, "learning_rate": 1.018347002493831e-05, "loss": 0.0662, "step": 184000 }, { "epoch": 2.7101957261306904, "grad_norm": 1.2569626569747925, "learning_rate": 1.0181833652971324e-05, "loss": 0.0669, "step": 184025 }, { "epoch": 2.7105639092207774, "grad_norm": 1.3071489334106445, "learning_rate": 1.018019728100434e-05, "loss": 0.0563, "step": 184050 }, { "epoch": 2.7109320923108644, "grad_norm": 1.3904650211334229, "learning_rate": 1.0178560909037357e-05, "loss": 0.0715, "step": 184075 }, { "epoch": 2.7113002754009514, "grad_norm": 1.1116451025009155, "learning_rate": 1.0176924537070371e-05, "loss": 0.0616, "step": 184100 }, { "epoch": 2.7116684584910384, "grad_norm": 1.0635896921157837, "learning_rate": 1.0175288165103387e-05, "loss": 0.0577, "step": 184125 }, { "epoch": 2.7120366415811255, "grad_norm": 1.2569341659545898, "learning_rate": 1.0173651793136403e-05, "loss": 0.053, "step": 184150 }, { "epoch": 2.7124048246712125, "grad_norm": 1.6762291193008423, "learning_rate": 1.0172015421169416e-05, "loss": 0.0583, "step": 184175 }, { "epoch": 2.7127730077612995, "grad_norm": 1.5419416427612305, "learning_rate": 1.0170379049202432e-05, "loss": 0.0592, "step": 184200 }, { "epoch": 2.7131411908513865, "grad_norm": 1.745168924331665, "learning_rate": 1.0168742677235448e-05, "loss": 0.0552, "step": 184225 }, { "epoch": 2.7135093739414735, "grad_norm": 0.9518269300460815, "learning_rate": 1.0167106305268464e-05, "loss": 0.0631, "step": 184250 }, { "epoch": 2.7138775570315605, "grad_norm": 1.358918309211731, "learning_rate": 1.0165469933301479e-05, "loss": 0.0698, "step": 184275 }, { "epoch": 2.7142457401216475, "grad_norm": 1.5168896913528442, "learning_rate": 1.0163833561334495e-05, "loss": 0.0604, "step": 184300 }, { "epoch": 2.7146139232117346, "grad_norm": 1.1325515508651733, "learning_rate": 1.0162197189367511e-05, "loss": 0.0613, "step": 184325 }, { "epoch": 2.7149821063018216, "grad_norm": 1.3978608846664429, "learning_rate": 1.0160560817400527e-05, "loss": 0.0605, "step": 184350 }, { "epoch": 2.7153502893919086, "grad_norm": 2.056851863861084, "learning_rate": 1.0158924445433542e-05, "loss": 0.0594, "step": 184375 }, { "epoch": 2.7157184724819956, "grad_norm": 0.8224292993545532, "learning_rate": 1.0157288073466556e-05, "loss": 0.0596, "step": 184400 }, { "epoch": 2.7160866555720826, "grad_norm": 1.7935566902160645, "learning_rate": 1.015565170149957e-05, "loss": 0.0646, "step": 184425 }, { "epoch": 2.71645483866217, "grad_norm": 1.458459734916687, "learning_rate": 1.0154015329532587e-05, "loss": 0.0604, "step": 184450 }, { "epoch": 2.716823021752257, "grad_norm": 1.405131220817566, "learning_rate": 1.0152378957565603e-05, "loss": 0.0624, "step": 184475 }, { "epoch": 2.717191204842344, "grad_norm": 1.4670943021774292, "learning_rate": 1.015074258559862e-05, "loss": 0.062, "step": 184500 }, { "epoch": 2.717559387932431, "grad_norm": 1.455169439315796, "learning_rate": 1.0149106213631634e-05, "loss": 0.0629, "step": 184525 }, { "epoch": 2.717927571022518, "grad_norm": 1.6445952653884888, "learning_rate": 1.014746984166465e-05, "loss": 0.0568, "step": 184550 }, { "epoch": 2.718295754112605, "grad_norm": 1.038856863975525, "learning_rate": 1.0145833469697666e-05, "loss": 0.0519, "step": 184575 }, { "epoch": 2.718663937202692, "grad_norm": 1.7846399545669556, "learning_rate": 1.0144197097730679e-05, "loss": 0.0638, "step": 184600 }, { "epoch": 2.719032120292779, "grad_norm": 1.5178264379501343, "learning_rate": 1.0142560725763695e-05, "loss": 0.0596, "step": 184625 }, { "epoch": 2.719400303382866, "grad_norm": 1.2321747541427612, "learning_rate": 1.0140924353796711e-05, "loss": 0.0601, "step": 184650 }, { "epoch": 2.719768486472953, "grad_norm": 1.2565826177597046, "learning_rate": 1.0139287981829726e-05, "loss": 0.056, "step": 184675 }, { "epoch": 2.7201366695630402, "grad_norm": 1.3941514492034912, "learning_rate": 1.0137651609862742e-05, "loss": 0.0562, "step": 184700 }, { "epoch": 2.7205048526531272, "grad_norm": 1.3635790348052979, "learning_rate": 1.0136015237895758e-05, "loss": 0.062, "step": 184725 }, { "epoch": 2.7208730357432143, "grad_norm": 1.7387267351150513, "learning_rate": 1.0134378865928774e-05, "loss": 0.0541, "step": 184750 }, { "epoch": 2.7212412188333013, "grad_norm": 1.214390516281128, "learning_rate": 1.0132742493961788e-05, "loss": 0.0603, "step": 184775 }, { "epoch": 2.7216094019233887, "grad_norm": 1.81710946559906, "learning_rate": 1.0131106121994805e-05, "loss": 0.0648, "step": 184800 }, { "epoch": 2.7219775850134758, "grad_norm": 1.5733720064163208, "learning_rate": 1.0129469750027819e-05, "loss": 0.0607, "step": 184825 }, { "epoch": 2.7223457681035628, "grad_norm": 1.319262146949768, "learning_rate": 1.0127833378060833e-05, "loss": 0.062, "step": 184850 }, { "epoch": 2.72271395119365, "grad_norm": 1.2286608219146729, "learning_rate": 1.012619700609385e-05, "loss": 0.0655, "step": 184875 }, { "epoch": 2.723082134283737, "grad_norm": 1.5763959884643555, "learning_rate": 1.0124560634126866e-05, "loss": 0.0624, "step": 184900 }, { "epoch": 2.723450317373824, "grad_norm": 0.7446858882904053, "learning_rate": 1.0122924262159882e-05, "loss": 0.057, "step": 184925 }, { "epoch": 2.723818500463911, "grad_norm": 1.4298956394195557, "learning_rate": 1.0121287890192896e-05, "loss": 0.0563, "step": 184950 }, { "epoch": 2.724186683553998, "grad_norm": 1.0701384544372559, "learning_rate": 1.0119651518225913e-05, "loss": 0.0582, "step": 184975 }, { "epoch": 2.724554866644085, "grad_norm": 1.5140502452850342, "learning_rate": 1.0118015146258929e-05, "loss": 0.056, "step": 185000 }, { "epoch": 2.724923049734172, "grad_norm": 1.266882300376892, "learning_rate": 1.0116378774291941e-05, "loss": 0.0578, "step": 185025 }, { "epoch": 2.725291232824259, "grad_norm": 1.3724349737167358, "learning_rate": 1.0114742402324958e-05, "loss": 0.0575, "step": 185050 }, { "epoch": 2.725659415914346, "grad_norm": 1.0935099124908447, "learning_rate": 1.0113106030357974e-05, "loss": 0.0615, "step": 185075 }, { "epoch": 2.726027599004433, "grad_norm": 1.2079156637191772, "learning_rate": 1.0111469658390988e-05, "loss": 0.0582, "step": 185100 }, { "epoch": 2.72639578209452, "grad_norm": 1.390535831451416, "learning_rate": 1.0109833286424004e-05, "loss": 0.0651, "step": 185125 }, { "epoch": 2.726763965184607, "grad_norm": 1.5886473655700684, "learning_rate": 1.010819691445702e-05, "loss": 0.0611, "step": 185150 }, { "epoch": 2.727132148274694, "grad_norm": 1.565333366394043, "learning_rate": 1.0106560542490037e-05, "loss": 0.055, "step": 185175 }, { "epoch": 2.727500331364781, "grad_norm": 0.9570712447166443, "learning_rate": 1.0104924170523051e-05, "loss": 0.0584, "step": 185200 }, { "epoch": 2.727868514454868, "grad_norm": 1.2584855556488037, "learning_rate": 1.0103287798556066e-05, "loss": 0.0547, "step": 185225 }, { "epoch": 2.728236697544955, "grad_norm": 2.0586068630218506, "learning_rate": 1.010165142658908e-05, "loss": 0.0679, "step": 185250 }, { "epoch": 2.728604880635042, "grad_norm": 1.3739484548568726, "learning_rate": 1.0100015054622096e-05, "loss": 0.0623, "step": 185275 }, { "epoch": 2.728973063725129, "grad_norm": 1.7004926204681396, "learning_rate": 1.0098378682655112e-05, "loss": 0.0607, "step": 185300 }, { "epoch": 2.729341246815216, "grad_norm": 1.7843974828720093, "learning_rate": 1.0096742310688128e-05, "loss": 0.0642, "step": 185325 }, { "epoch": 2.729709429905303, "grad_norm": 1.2455390691757202, "learning_rate": 1.0095105938721143e-05, "loss": 0.0569, "step": 185350 }, { "epoch": 2.73007761299539, "grad_norm": 1.5210189819335938, "learning_rate": 1.0093469566754159e-05, "loss": 0.0582, "step": 185375 }, { "epoch": 2.7304457960854776, "grad_norm": 1.5243985652923584, "learning_rate": 1.0091833194787175e-05, "loss": 0.058, "step": 185400 }, { "epoch": 2.7308139791755646, "grad_norm": 1.044806718826294, "learning_rate": 1.0090196822820191e-05, "loss": 0.0558, "step": 185425 }, { "epoch": 2.7311821622656516, "grad_norm": 1.198866367340088, "learning_rate": 1.0088560450853204e-05, "loss": 0.0577, "step": 185450 }, { "epoch": 2.7315503453557386, "grad_norm": 1.286551833152771, "learning_rate": 1.008692407888622e-05, "loss": 0.0575, "step": 185475 }, { "epoch": 2.7319185284458256, "grad_norm": 1.1551345586776733, "learning_rate": 1.0085287706919236e-05, "loss": 0.0534, "step": 185500 }, { "epoch": 2.7322867115359126, "grad_norm": 1.4485493898391724, "learning_rate": 1.0083651334952251e-05, "loss": 0.0637, "step": 185525 }, { "epoch": 2.7326548946259996, "grad_norm": 1.0969319343566895, "learning_rate": 1.0082014962985267e-05, "loss": 0.0602, "step": 185550 }, { "epoch": 2.7330230777160867, "grad_norm": 0.8755432963371277, "learning_rate": 1.0080378591018283e-05, "loss": 0.0587, "step": 185575 }, { "epoch": 2.7333912608061737, "grad_norm": 0.9794855713844299, "learning_rate": 1.0078742219051298e-05, "loss": 0.0532, "step": 185600 }, { "epoch": 2.7337594438962607, "grad_norm": 1.1577626466751099, "learning_rate": 1.0077105847084314e-05, "loss": 0.0601, "step": 185625 }, { "epoch": 2.7341276269863477, "grad_norm": 0.9449526071548462, "learning_rate": 1.0075469475117328e-05, "loss": 0.0572, "step": 185650 }, { "epoch": 2.7344958100764347, "grad_norm": 1.190841794013977, "learning_rate": 1.0073833103150343e-05, "loss": 0.059, "step": 185675 }, { "epoch": 2.7348639931665217, "grad_norm": 1.2565597295761108, "learning_rate": 1.0072196731183359e-05, "loss": 0.0605, "step": 185700 }, { "epoch": 2.7352321762566087, "grad_norm": 1.2178332805633545, "learning_rate": 1.0070560359216375e-05, "loss": 0.0591, "step": 185725 }, { "epoch": 2.7356003593466958, "grad_norm": 0.8831363916397095, "learning_rate": 1.0068923987249391e-05, "loss": 0.0542, "step": 185750 }, { "epoch": 2.7359685424367832, "grad_norm": 2.107327938079834, "learning_rate": 1.0067287615282406e-05, "loss": 0.0552, "step": 185775 }, { "epoch": 2.7363367255268702, "grad_norm": 1.45307195186615, "learning_rate": 1.0065651243315422e-05, "loss": 0.062, "step": 185800 }, { "epoch": 2.7367049086169573, "grad_norm": 1.8094033002853394, "learning_rate": 1.0064014871348438e-05, "loss": 0.0566, "step": 185825 }, { "epoch": 2.7370730917070443, "grad_norm": 1.331187129020691, "learning_rate": 1.0062378499381454e-05, "loss": 0.064, "step": 185850 }, { "epoch": 2.7374412747971313, "grad_norm": 1.1713544130325317, "learning_rate": 1.0060742127414467e-05, "loss": 0.0624, "step": 185875 }, { "epoch": 2.7378094578872183, "grad_norm": 1.4612226486206055, "learning_rate": 1.0059105755447483e-05, "loss": 0.0611, "step": 185900 }, { "epoch": 2.7381776409773053, "grad_norm": 1.2365119457244873, "learning_rate": 1.0057469383480497e-05, "loss": 0.0594, "step": 185925 }, { "epoch": 2.7385458240673923, "grad_norm": 1.6546189785003662, "learning_rate": 1.0055833011513514e-05, "loss": 0.0648, "step": 185950 }, { "epoch": 2.7389140071574793, "grad_norm": 1.514780879020691, "learning_rate": 1.005419663954653e-05, "loss": 0.0567, "step": 185975 }, { "epoch": 2.7392821902475664, "grad_norm": 0.8987876772880554, "learning_rate": 1.0052560267579546e-05, "loss": 0.063, "step": 186000 }, { "epoch": 2.7396503733376534, "grad_norm": 1.7933945655822754, "learning_rate": 1.005092389561256e-05, "loss": 0.0563, "step": 186025 }, { "epoch": 2.7400185564277404, "grad_norm": 1.4574695825576782, "learning_rate": 1.0049287523645577e-05, "loss": 0.0541, "step": 186050 }, { "epoch": 2.7403867395178274, "grad_norm": 1.4761006832122803, "learning_rate": 1.0047651151678591e-05, "loss": 0.0591, "step": 186075 }, { "epoch": 2.7407549226079144, "grad_norm": 1.0592039823532104, "learning_rate": 1.0046014779711605e-05, "loss": 0.0572, "step": 186100 }, { "epoch": 2.7411231056980014, "grad_norm": 1.2496352195739746, "learning_rate": 1.0044378407744622e-05, "loss": 0.0622, "step": 186125 }, { "epoch": 2.7414912887880885, "grad_norm": 1.2604317665100098, "learning_rate": 1.0042742035777638e-05, "loss": 0.059, "step": 186150 }, { "epoch": 2.7418594718781755, "grad_norm": 1.5644952058792114, "learning_rate": 1.0041105663810652e-05, "loss": 0.057, "step": 186175 }, { "epoch": 2.7422276549682625, "grad_norm": 1.5964096784591675, "learning_rate": 1.0039469291843668e-05, "loss": 0.0601, "step": 186200 }, { "epoch": 2.7425958380583495, "grad_norm": 1.1815171241760254, "learning_rate": 1.0037832919876685e-05, "loss": 0.057, "step": 186225 }, { "epoch": 2.7429640211484365, "grad_norm": 1.429641842842102, "learning_rate": 1.00361965479097e-05, "loss": 0.0606, "step": 186250 }, { "epoch": 2.7433322042385235, "grad_norm": 1.0084843635559082, "learning_rate": 1.0034560175942715e-05, "loss": 0.0547, "step": 186275 }, { "epoch": 2.7437003873286105, "grad_norm": 1.1361294984817505, "learning_rate": 1.003292380397573e-05, "loss": 0.0585, "step": 186300 }, { "epoch": 2.7440685704186976, "grad_norm": 1.2982580661773682, "learning_rate": 1.0031287432008746e-05, "loss": 0.0613, "step": 186325 }, { "epoch": 2.7444367535087846, "grad_norm": 1.3378002643585205, "learning_rate": 1.002965106004176e-05, "loss": 0.0548, "step": 186350 }, { "epoch": 2.744804936598872, "grad_norm": 1.4267215728759766, "learning_rate": 1.0028014688074776e-05, "loss": 0.0565, "step": 186375 }, { "epoch": 2.745173119688959, "grad_norm": 1.2975184917449951, "learning_rate": 1.0026378316107793e-05, "loss": 0.0607, "step": 186400 }, { "epoch": 2.745541302779046, "grad_norm": 1.2684420347213745, "learning_rate": 1.0024741944140807e-05, "loss": 0.0543, "step": 186425 }, { "epoch": 2.745909485869133, "grad_norm": 1.9877132177352905, "learning_rate": 1.0023105572173823e-05, "loss": 0.0593, "step": 186450 }, { "epoch": 2.74627766895922, "grad_norm": 1.5988712310791016, "learning_rate": 1.002146920020684e-05, "loss": 0.0593, "step": 186475 }, { "epoch": 2.746645852049307, "grad_norm": 1.7441291809082031, "learning_rate": 1.0019832828239852e-05, "loss": 0.0604, "step": 186500 }, { "epoch": 2.747014035139394, "grad_norm": 1.3226075172424316, "learning_rate": 1.0018196456272868e-05, "loss": 0.0599, "step": 186525 }, { "epoch": 2.747382218229481, "grad_norm": 1.805211067199707, "learning_rate": 1.0016560084305884e-05, "loss": 0.0656, "step": 186550 }, { "epoch": 2.747750401319568, "grad_norm": 1.4552751779556274, "learning_rate": 1.00149237123389e-05, "loss": 0.0653, "step": 186575 }, { "epoch": 2.748118584409655, "grad_norm": 1.0965235233306885, "learning_rate": 1.0013352795250594e-05, "loss": 0.0565, "step": 186600 }, { "epoch": 2.748486767499742, "grad_norm": 1.0309356451034546, "learning_rate": 1.001171642328361e-05, "loss": 0.0553, "step": 186625 }, { "epoch": 2.748854950589829, "grad_norm": 1.480932593345642, "learning_rate": 1.0010080051316627e-05, "loss": 0.0655, "step": 186650 }, { "epoch": 2.749223133679916, "grad_norm": 2.2531635761260986, "learning_rate": 1.0008443679349643e-05, "loss": 0.0674, "step": 186675 }, { "epoch": 2.7495913167700032, "grad_norm": 0.9868398308753967, "learning_rate": 1.0006807307382656e-05, "loss": 0.0566, "step": 186700 }, { "epoch": 2.7499594998600907, "grad_norm": 1.5144098997116089, "learning_rate": 1.0005170935415672e-05, "loss": 0.0554, "step": 186725 }, { "epoch": 2.7503276829501777, "grad_norm": 0.9801357388496399, "learning_rate": 1.0003534563448686e-05, "loss": 0.061, "step": 186750 }, { "epoch": 2.7506958660402647, "grad_norm": 1.4268430471420288, "learning_rate": 1.0001898191481702e-05, "loss": 0.0629, "step": 186775 }, { "epoch": 2.7510640491303517, "grad_norm": 1.3382500410079956, "learning_rate": 1.0000261819514719e-05, "loss": 0.0605, "step": 186800 }, { "epoch": 2.7514322322204388, "grad_norm": 1.1464941501617432, "learning_rate": 9.998625447547735e-06, "loss": 0.0562, "step": 186825 }, { "epoch": 2.7518004153105258, "grad_norm": 1.2671126127243042, "learning_rate": 9.996989075580749e-06, "loss": 0.0576, "step": 186850 }, { "epoch": 2.752168598400613, "grad_norm": 1.2742894887924194, "learning_rate": 9.995352703613764e-06, "loss": 0.0568, "step": 186875 }, { "epoch": 2.7525367814907, "grad_norm": 0.9946377873420715, "learning_rate": 9.99371633164678e-06, "loss": 0.0608, "step": 186900 }, { "epoch": 2.752904964580787, "grad_norm": 1.217646837234497, "learning_rate": 9.992079959679796e-06, "loss": 0.0678, "step": 186925 }, { "epoch": 2.753273147670874, "grad_norm": 1.749219536781311, "learning_rate": 9.990443587712812e-06, "loss": 0.064, "step": 186950 }, { "epoch": 2.753641330760961, "grad_norm": 1.1158767938613892, "learning_rate": 9.988807215745826e-06, "loss": 0.0521, "step": 186975 }, { "epoch": 2.754009513851048, "grad_norm": 1.01271390914917, "learning_rate": 9.987170843778841e-06, "loss": 0.0547, "step": 187000 }, { "epoch": 2.754377696941135, "grad_norm": 1.6766948699951172, "learning_rate": 9.985534471811857e-06, "loss": 0.0592, "step": 187025 }, { "epoch": 2.754745880031222, "grad_norm": 1.7003371715545654, "learning_rate": 9.983898099844873e-06, "loss": 0.0603, "step": 187050 }, { "epoch": 2.755114063121309, "grad_norm": 1.6806586980819702, "learning_rate": 9.982261727877888e-06, "loss": 0.056, "step": 187075 }, { "epoch": 2.755482246211396, "grad_norm": 1.0180401802062988, "learning_rate": 9.980625355910904e-06, "loss": 0.0636, "step": 187100 }, { "epoch": 2.755850429301483, "grad_norm": 1.5644352436065674, "learning_rate": 9.978988983943918e-06, "loss": 0.0576, "step": 187125 }, { "epoch": 2.75621861239157, "grad_norm": 1.1711918115615845, "learning_rate": 9.977352611976934e-06, "loss": 0.0517, "step": 187150 }, { "epoch": 2.756586795481657, "grad_norm": 1.4307109117507935, "learning_rate": 9.975716240009949e-06, "loss": 0.0627, "step": 187175 }, { "epoch": 2.756954978571744, "grad_norm": 1.8698443174362183, "learning_rate": 9.974079868042965e-06, "loss": 0.0569, "step": 187200 }, { "epoch": 2.757323161661831, "grad_norm": 1.1340711116790771, "learning_rate": 9.972443496075981e-06, "loss": 0.0543, "step": 187225 }, { "epoch": 2.757691344751918, "grad_norm": 1.3797444105148315, "learning_rate": 9.970807124108997e-06, "loss": 0.0633, "step": 187250 }, { "epoch": 2.758059527842005, "grad_norm": 1.2021266222000122, "learning_rate": 9.969170752142012e-06, "loss": 0.0642, "step": 187275 }, { "epoch": 2.758427710932092, "grad_norm": 1.1204200983047485, "learning_rate": 9.967534380175026e-06, "loss": 0.0626, "step": 187300 }, { "epoch": 2.7587958940221795, "grad_norm": 1.6233242750167847, "learning_rate": 9.965898008208042e-06, "loss": 0.0634, "step": 187325 }, { "epoch": 2.7591640771122665, "grad_norm": 1.3069599866867065, "learning_rate": 9.964261636241059e-06, "loss": 0.0581, "step": 187350 }, { "epoch": 2.7595322602023535, "grad_norm": 1.0715394020080566, "learning_rate": 9.962625264274075e-06, "loss": 0.0556, "step": 187375 }, { "epoch": 2.7599004432924406, "grad_norm": 1.428015112876892, "learning_rate": 9.96098889230709e-06, "loss": 0.0596, "step": 187400 }, { "epoch": 2.7602686263825276, "grad_norm": 1.0343254804611206, "learning_rate": 9.959352520340104e-06, "loss": 0.0498, "step": 187425 }, { "epoch": 2.7606368094726146, "grad_norm": 1.025355339050293, "learning_rate": 9.95771614837312e-06, "loss": 0.0572, "step": 187450 }, { "epoch": 2.7610049925627016, "grad_norm": 1.2642844915390015, "learning_rate": 9.956079776406136e-06, "loss": 0.0647, "step": 187475 }, { "epoch": 2.7613731756527886, "grad_norm": 1.3289872407913208, "learning_rate": 9.95444340443915e-06, "loss": 0.0674, "step": 187500 }, { "epoch": 2.7617413587428756, "grad_norm": 1.2489908933639526, "learning_rate": 9.952807032472167e-06, "loss": 0.0621, "step": 187525 }, { "epoch": 2.7621095418329626, "grad_norm": 1.3466240167617798, "learning_rate": 9.951170660505181e-06, "loss": 0.0587, "step": 187550 }, { "epoch": 2.7624777249230497, "grad_norm": 1.1342620849609375, "learning_rate": 9.949534288538197e-06, "loss": 0.0596, "step": 187575 }, { "epoch": 2.7628459080131367, "grad_norm": 1.3637194633483887, "learning_rate": 9.947897916571212e-06, "loss": 0.0626, "step": 187600 }, { "epoch": 2.7632140911032237, "grad_norm": 1.1403675079345703, "learning_rate": 9.946261544604228e-06, "loss": 0.0602, "step": 187625 }, { "epoch": 2.7635822741933107, "grad_norm": 2.7129833698272705, "learning_rate": 9.944625172637244e-06, "loss": 0.0615, "step": 187650 }, { "epoch": 2.763950457283398, "grad_norm": 1.598002314567566, "learning_rate": 9.942988800670258e-06, "loss": 0.0648, "step": 187675 }, { "epoch": 2.764318640373485, "grad_norm": 1.5435110330581665, "learning_rate": 9.941352428703273e-06, "loss": 0.0631, "step": 187700 }, { "epoch": 2.764686823463572, "grad_norm": 1.4801486730575562, "learning_rate": 9.939716056736289e-06, "loss": 0.0577, "step": 187725 }, { "epoch": 2.765055006553659, "grad_norm": 1.0425965785980225, "learning_rate": 9.938079684769305e-06, "loss": 0.063, "step": 187750 }, { "epoch": 2.7654231896437462, "grad_norm": 0.7088472843170166, "learning_rate": 9.936443312802321e-06, "loss": 0.0616, "step": 187775 }, { "epoch": 2.7657913727338332, "grad_norm": 1.5590195655822754, "learning_rate": 9.934806940835336e-06, "loss": 0.0561, "step": 187800 }, { "epoch": 2.7661595558239203, "grad_norm": 1.3413126468658447, "learning_rate": 9.933170568868352e-06, "loss": 0.0524, "step": 187825 }, { "epoch": 2.7665277389140073, "grad_norm": 1.69755220413208, "learning_rate": 9.931534196901366e-06, "loss": 0.0671, "step": 187850 }, { "epoch": 2.7668959220040943, "grad_norm": 1.50053870677948, "learning_rate": 9.929897824934383e-06, "loss": 0.0546, "step": 187875 }, { "epoch": 2.7672641050941813, "grad_norm": 1.535345435142517, "learning_rate": 9.928261452967399e-06, "loss": 0.0575, "step": 187900 }, { "epoch": 2.7676322881842683, "grad_norm": 1.266869068145752, "learning_rate": 9.926625081000413e-06, "loss": 0.059, "step": 187925 }, { "epoch": 2.7680004712743553, "grad_norm": 1.2413359880447388, "learning_rate": 9.92498870903343e-06, "loss": 0.0599, "step": 187950 }, { "epoch": 2.7683686543644423, "grad_norm": 0.9187526702880859, "learning_rate": 9.923352337066444e-06, "loss": 0.0549, "step": 187975 }, { "epoch": 2.7687368374545294, "grad_norm": 1.7610441446304321, "learning_rate": 9.92171596509946e-06, "loss": 0.0609, "step": 188000 }, { "epoch": 2.7691050205446164, "grad_norm": 1.9958604574203491, "learning_rate": 9.920079593132474e-06, "loss": 0.0594, "step": 188025 }, { "epoch": 2.7694732036347034, "grad_norm": 1.1109797954559326, "learning_rate": 9.91844322116549e-06, "loss": 0.0619, "step": 188050 }, { "epoch": 2.7698413867247904, "grad_norm": 1.0043673515319824, "learning_rate": 9.916806849198507e-06, "loss": 0.0604, "step": 188075 }, { "epoch": 2.7702095698148774, "grad_norm": 1.6157029867172241, "learning_rate": 9.915170477231521e-06, "loss": 0.0533, "step": 188100 }, { "epoch": 2.7705777529049644, "grad_norm": 1.8857016563415527, "learning_rate": 9.913534105264536e-06, "loss": 0.0631, "step": 188125 }, { "epoch": 2.7709459359950515, "grad_norm": 1.3368146419525146, "learning_rate": 9.911897733297552e-06, "loss": 0.0583, "step": 188150 }, { "epoch": 2.7713141190851385, "grad_norm": 1.1812440156936646, "learning_rate": 9.910261361330568e-06, "loss": 0.0519, "step": 188175 }, { "epoch": 2.7716823021752255, "grad_norm": 1.7864316701889038, "learning_rate": 9.908624989363584e-06, "loss": 0.0555, "step": 188200 }, { "epoch": 2.7720504852653125, "grad_norm": 1.478463888168335, "learning_rate": 9.906988617396598e-06, "loss": 0.0604, "step": 188225 }, { "epoch": 2.7724186683553995, "grad_norm": 1.3287866115570068, "learning_rate": 9.905352245429613e-06, "loss": 0.0594, "step": 188250 }, { "epoch": 2.772786851445487, "grad_norm": 1.3550938367843628, "learning_rate": 9.903715873462629e-06, "loss": 0.0692, "step": 188275 }, { "epoch": 2.773155034535574, "grad_norm": 1.0496293306350708, "learning_rate": 9.902079501495645e-06, "loss": 0.0567, "step": 188300 }, { "epoch": 2.773523217625661, "grad_norm": 1.3580968379974365, "learning_rate": 9.900443129528661e-06, "loss": 0.0606, "step": 188325 }, { "epoch": 2.773891400715748, "grad_norm": 1.5281096696853638, "learning_rate": 9.898806757561676e-06, "loss": 0.0646, "step": 188350 }, { "epoch": 2.774259583805835, "grad_norm": 1.658506155014038, "learning_rate": 9.89717038559469e-06, "loss": 0.0572, "step": 188375 }, { "epoch": 2.774627766895922, "grad_norm": 1.673966884613037, "learning_rate": 9.895534013627706e-06, "loss": 0.0615, "step": 188400 }, { "epoch": 2.774995949986009, "grad_norm": 1.0819004774093628, "learning_rate": 9.893897641660723e-06, "loss": 0.0494, "step": 188425 }, { "epoch": 2.775364133076096, "grad_norm": 1.4841569662094116, "learning_rate": 9.892261269693737e-06, "loss": 0.0585, "step": 188450 }, { "epoch": 2.775732316166183, "grad_norm": 1.232907772064209, "learning_rate": 9.890624897726753e-06, "loss": 0.0551, "step": 188475 }, { "epoch": 2.77610049925627, "grad_norm": 1.1234492063522339, "learning_rate": 9.888988525759768e-06, "loss": 0.053, "step": 188500 }, { "epoch": 2.776468682346357, "grad_norm": 1.416610836982727, "learning_rate": 9.887352153792784e-06, "loss": 0.0585, "step": 188525 }, { "epoch": 2.776836865436444, "grad_norm": 1.4943337440490723, "learning_rate": 9.885715781825798e-06, "loss": 0.0635, "step": 188550 }, { "epoch": 2.777205048526531, "grad_norm": 1.4836252927780151, "learning_rate": 9.884079409858814e-06, "loss": 0.0642, "step": 188575 }, { "epoch": 2.777573231616618, "grad_norm": 2.0967702865600586, "learning_rate": 9.88244303789183e-06, "loss": 0.0603, "step": 188600 }, { "epoch": 2.777941414706705, "grad_norm": 0.9067227244377136, "learning_rate": 9.880806665924845e-06, "loss": 0.0653, "step": 188625 }, { "epoch": 2.7783095977967927, "grad_norm": 1.601743459701538, "learning_rate": 9.879170293957861e-06, "loss": 0.0596, "step": 188650 }, { "epoch": 2.7786777808868797, "grad_norm": 1.953127145767212, "learning_rate": 9.877533921990876e-06, "loss": 0.067, "step": 188675 }, { "epoch": 2.7790459639769667, "grad_norm": 0.987812340259552, "learning_rate": 9.875897550023892e-06, "loss": 0.0592, "step": 188700 }, { "epoch": 2.7794141470670537, "grad_norm": 1.4707144498825073, "learning_rate": 9.874261178056908e-06, "loss": 0.0603, "step": 188725 }, { "epoch": 2.7797823301571407, "grad_norm": 1.6171125173568726, "learning_rate": 9.872624806089922e-06, "loss": 0.0611, "step": 188750 }, { "epoch": 2.7801505132472277, "grad_norm": 1.6431132555007935, "learning_rate": 9.870988434122939e-06, "loss": 0.0632, "step": 188775 }, { "epoch": 2.7805186963373147, "grad_norm": 1.7907843589782715, "learning_rate": 9.869352062155953e-06, "loss": 0.0607, "step": 188800 }, { "epoch": 2.7808868794274018, "grad_norm": 1.4770323038101196, "learning_rate": 9.86771569018897e-06, "loss": 0.0611, "step": 188825 }, { "epoch": 2.7812550625174888, "grad_norm": 1.2739169597625732, "learning_rate": 9.866079318221985e-06, "loss": 0.0581, "step": 188850 }, { "epoch": 2.781623245607576, "grad_norm": 1.1502888202667236, "learning_rate": 9.864442946255e-06, "loss": 0.0657, "step": 188875 }, { "epoch": 2.781991428697663, "grad_norm": 1.880157709121704, "learning_rate": 9.862806574288016e-06, "loss": 0.0618, "step": 188900 }, { "epoch": 2.78235961178775, "grad_norm": 1.2659341096878052, "learning_rate": 9.86117020232103e-06, "loss": 0.0601, "step": 188925 }, { "epoch": 2.782727794877837, "grad_norm": 0.9823083281517029, "learning_rate": 9.859533830354047e-06, "loss": 0.0592, "step": 188950 }, { "epoch": 2.783095977967924, "grad_norm": 1.0225110054016113, "learning_rate": 9.857897458387061e-06, "loss": 0.0605, "step": 188975 }, { "epoch": 2.783464161058011, "grad_norm": 1.818889856338501, "learning_rate": 9.856261086420077e-06, "loss": 0.0557, "step": 189000 }, { "epoch": 2.783832344148098, "grad_norm": 1.8652188777923584, "learning_rate": 9.854624714453093e-06, "loss": 0.0629, "step": 189025 }, { "epoch": 2.784200527238185, "grad_norm": 1.6811672449111938, "learning_rate": 9.852988342486108e-06, "loss": 0.0602, "step": 189050 }, { "epoch": 2.784568710328272, "grad_norm": 1.3678489923477173, "learning_rate": 9.851351970519122e-06, "loss": 0.0664, "step": 189075 }, { "epoch": 2.784936893418359, "grad_norm": 1.0080233812332153, "learning_rate": 9.849715598552138e-06, "loss": 0.0593, "step": 189100 }, { "epoch": 2.785305076508446, "grad_norm": 1.2449897527694702, "learning_rate": 9.848144681463834e-06, "loss": 0.054, "step": 189125 }, { "epoch": 2.785673259598533, "grad_norm": 0.9559805393218994, "learning_rate": 9.84650830949685e-06, "loss": 0.0592, "step": 189150 }, { "epoch": 2.78604144268862, "grad_norm": 1.4270541667938232, "learning_rate": 9.844871937529865e-06, "loss": 0.0537, "step": 189175 }, { "epoch": 2.786409625778707, "grad_norm": 1.5277680158615112, "learning_rate": 9.843235565562879e-06, "loss": 0.0583, "step": 189200 }, { "epoch": 2.786777808868794, "grad_norm": 1.2945669889450073, "learning_rate": 9.841599193595895e-06, "loss": 0.0563, "step": 189225 }, { "epoch": 2.7871459919588815, "grad_norm": 0.9818471670150757, "learning_rate": 9.839962821628911e-06, "loss": 0.0534, "step": 189250 }, { "epoch": 2.7875141750489685, "grad_norm": 2.0116989612579346, "learning_rate": 9.838326449661927e-06, "loss": 0.0615, "step": 189275 }, { "epoch": 2.7878823581390555, "grad_norm": 1.2373155355453491, "learning_rate": 9.836690077694942e-06, "loss": 0.0562, "step": 189300 }, { "epoch": 2.7882505412291425, "grad_norm": 1.25845468044281, "learning_rate": 9.835053705727956e-06, "loss": 0.057, "step": 189325 }, { "epoch": 2.7886187243192295, "grad_norm": 1.2892045974731445, "learning_rate": 9.833417333760973e-06, "loss": 0.0591, "step": 189350 }, { "epoch": 2.7889869074093165, "grad_norm": 1.18462073802948, "learning_rate": 9.831780961793989e-06, "loss": 0.062, "step": 189375 }, { "epoch": 2.7893550904994036, "grad_norm": 1.44797682762146, "learning_rate": 9.830144589827003e-06, "loss": 0.0601, "step": 189400 }, { "epoch": 2.7897232735894906, "grad_norm": 0.7727309465408325, "learning_rate": 9.82850821786002e-06, "loss": 0.0568, "step": 189425 }, { "epoch": 2.7900914566795776, "grad_norm": 1.4913910627365112, "learning_rate": 9.826871845893034e-06, "loss": 0.0618, "step": 189450 }, { "epoch": 2.7904596397696646, "grad_norm": 1.3875844478607178, "learning_rate": 9.82523547392605e-06, "loss": 0.0559, "step": 189475 }, { "epoch": 2.7908278228597516, "grad_norm": 1.2802366018295288, "learning_rate": 9.823599101959064e-06, "loss": 0.0603, "step": 189500 }, { "epoch": 2.7911960059498386, "grad_norm": 1.418169617652893, "learning_rate": 9.82196272999208e-06, "loss": 0.0662, "step": 189525 }, { "epoch": 2.7915641890399256, "grad_norm": 1.1409392356872559, "learning_rate": 9.820326358025097e-06, "loss": 0.0594, "step": 189550 }, { "epoch": 2.7919323721300127, "grad_norm": 1.0275858640670776, "learning_rate": 9.818689986058113e-06, "loss": 0.0587, "step": 189575 }, { "epoch": 2.7923005552201, "grad_norm": 1.3041986227035522, "learning_rate": 9.817053614091127e-06, "loss": 0.0517, "step": 189600 }, { "epoch": 2.792668738310187, "grad_norm": 1.662649154663086, "learning_rate": 9.815417242124142e-06, "loss": 0.0593, "step": 189625 }, { "epoch": 2.793036921400274, "grad_norm": 1.0946348905563354, "learning_rate": 9.813780870157158e-06, "loss": 0.0533, "step": 189650 }, { "epoch": 2.793405104490361, "grad_norm": 1.1640589237213135, "learning_rate": 9.812144498190174e-06, "loss": 0.0634, "step": 189675 }, { "epoch": 2.793773287580448, "grad_norm": 1.4606651067733765, "learning_rate": 9.81050812622319e-06, "loss": 0.0626, "step": 189700 }, { "epoch": 2.794141470670535, "grad_norm": 1.110967755317688, "learning_rate": 9.808871754256205e-06, "loss": 0.0572, "step": 189725 }, { "epoch": 2.794509653760622, "grad_norm": 0.9042086601257324, "learning_rate": 9.807235382289219e-06, "loss": 0.0502, "step": 189750 }, { "epoch": 2.7948778368507092, "grad_norm": 1.7488985061645508, "learning_rate": 9.805599010322235e-06, "loss": 0.0617, "step": 189775 }, { "epoch": 2.7952460199407962, "grad_norm": 1.443360447883606, "learning_rate": 9.803962638355251e-06, "loss": 0.0574, "step": 189800 }, { "epoch": 2.7956142030308833, "grad_norm": 1.2800196409225464, "learning_rate": 9.802326266388266e-06, "loss": 0.0626, "step": 189825 }, { "epoch": 2.7959823861209703, "grad_norm": 1.0426499843597412, "learning_rate": 9.800689894421282e-06, "loss": 0.059, "step": 189850 }, { "epoch": 2.7963505692110573, "grad_norm": 0.7933362126350403, "learning_rate": 9.799053522454296e-06, "loss": 0.0581, "step": 189875 }, { "epoch": 2.7967187523011443, "grad_norm": 1.4540982246398926, "learning_rate": 9.797417150487313e-06, "loss": 0.0576, "step": 189900 }, { "epoch": 2.7970869353912313, "grad_norm": 1.2096306085586548, "learning_rate": 9.795780778520327e-06, "loss": 0.059, "step": 189925 }, { "epoch": 2.7974551184813183, "grad_norm": 1.3250669240951538, "learning_rate": 9.794144406553343e-06, "loss": 0.0562, "step": 189950 }, { "epoch": 2.7978233015714054, "grad_norm": 2.0905303955078125, "learning_rate": 9.79250803458636e-06, "loss": 0.063, "step": 189975 }, { "epoch": 2.7981914846614924, "grad_norm": 1.696402668952942, "learning_rate": 9.790871662619374e-06, "loss": 0.061, "step": 190000 }, { "epoch": 2.7985596677515794, "grad_norm": 1.4953043460845947, "learning_rate": 9.789235290652388e-06, "loss": 0.0581, "step": 190025 }, { "epoch": 2.7989278508416664, "grad_norm": 1.6427489519119263, "learning_rate": 9.787598918685404e-06, "loss": 0.0556, "step": 190050 }, { "epoch": 2.7992960339317534, "grad_norm": 1.1342544555664062, "learning_rate": 9.78596254671842e-06, "loss": 0.0566, "step": 190075 }, { "epoch": 2.7996642170218404, "grad_norm": 1.2183283567428589, "learning_rate": 9.784326174751437e-06, "loss": 0.0587, "step": 190100 }, { "epoch": 2.8000324001119274, "grad_norm": 1.477626085281372, "learning_rate": 9.782689802784451e-06, "loss": 0.0547, "step": 190125 }, { "epoch": 2.8004005832020145, "grad_norm": 1.677478551864624, "learning_rate": 9.781053430817466e-06, "loss": 0.0615, "step": 190150 }, { "epoch": 2.8007687662921015, "grad_norm": 1.1885257959365845, "learning_rate": 9.779417058850482e-06, "loss": 0.0563, "step": 190175 }, { "epoch": 2.801136949382189, "grad_norm": 1.6312246322631836, "learning_rate": 9.777780686883498e-06, "loss": 0.0563, "step": 190200 }, { "epoch": 2.801505132472276, "grad_norm": 1.4285744428634644, "learning_rate": 9.776144314916514e-06, "loss": 0.0634, "step": 190225 }, { "epoch": 2.801873315562363, "grad_norm": 1.203006625175476, "learning_rate": 9.774507942949529e-06, "loss": 0.0681, "step": 190250 }, { "epoch": 2.80224149865245, "grad_norm": 1.2275274991989136, "learning_rate": 9.772871570982545e-06, "loss": 0.067, "step": 190275 }, { "epoch": 2.802609681742537, "grad_norm": 0.7865505814552307, "learning_rate": 9.77123519901556e-06, "loss": 0.0616, "step": 190300 }, { "epoch": 2.802977864832624, "grad_norm": 1.6754999160766602, "learning_rate": 9.769598827048575e-06, "loss": 0.0605, "step": 190325 }, { "epoch": 2.803346047922711, "grad_norm": 1.0922694206237793, "learning_rate": 9.76796245508159e-06, "loss": 0.0603, "step": 190350 }, { "epoch": 2.803714231012798, "grad_norm": 1.393845558166504, "learning_rate": 9.766326083114606e-06, "loss": 0.058, "step": 190375 }, { "epoch": 2.804082414102885, "grad_norm": 1.4487552642822266, "learning_rate": 9.764689711147622e-06, "loss": 0.0651, "step": 190400 }, { "epoch": 2.804450597192972, "grad_norm": 1.6551408767700195, "learning_rate": 9.763053339180637e-06, "loss": 0.0579, "step": 190425 }, { "epoch": 2.804818780283059, "grad_norm": 1.6418046951293945, "learning_rate": 9.761416967213651e-06, "loss": 0.0545, "step": 190450 }, { "epoch": 2.805186963373146, "grad_norm": 1.3582541942596436, "learning_rate": 9.759780595246667e-06, "loss": 0.0539, "step": 190475 }, { "epoch": 2.805555146463233, "grad_norm": 1.7742160558700562, "learning_rate": 9.758144223279683e-06, "loss": 0.0687, "step": 190500 }, { "epoch": 2.80592332955332, "grad_norm": 1.258548378944397, "learning_rate": 9.7565078513127e-06, "loss": 0.0592, "step": 190525 }, { "epoch": 2.806291512643407, "grad_norm": 1.7473278045654297, "learning_rate": 9.754871479345714e-06, "loss": 0.0584, "step": 190550 }, { "epoch": 2.8066596957334946, "grad_norm": 1.3154525756835938, "learning_rate": 9.753235107378728e-06, "loss": 0.0556, "step": 190575 }, { "epoch": 2.8070278788235816, "grad_norm": 1.2229403257369995, "learning_rate": 9.751598735411745e-06, "loss": 0.0636, "step": 190600 }, { "epoch": 2.8073960619136686, "grad_norm": 1.0417224168777466, "learning_rate": 9.74996236344476e-06, "loss": 0.0575, "step": 190625 }, { "epoch": 2.8077642450037557, "grad_norm": 1.4170892238616943, "learning_rate": 9.748325991477777e-06, "loss": 0.062, "step": 190650 }, { "epoch": 2.8081324280938427, "grad_norm": 1.2770042419433594, "learning_rate": 9.746689619510791e-06, "loss": 0.0586, "step": 190675 }, { "epoch": 2.8085006111839297, "grad_norm": 1.2343826293945312, "learning_rate": 9.745053247543806e-06, "loss": 0.0518, "step": 190700 }, { "epoch": 2.8088687942740167, "grad_norm": 1.0419585704803467, "learning_rate": 9.743416875576822e-06, "loss": 0.0569, "step": 190725 }, { "epoch": 2.8092369773641037, "grad_norm": 1.4943057298660278, "learning_rate": 9.741780503609838e-06, "loss": 0.0646, "step": 190750 }, { "epoch": 2.8096051604541907, "grad_norm": 1.158626675605774, "learning_rate": 9.740144131642853e-06, "loss": 0.0563, "step": 190775 }, { "epoch": 2.8099733435442777, "grad_norm": 1.5623724460601807, "learning_rate": 9.738507759675869e-06, "loss": 0.0594, "step": 190800 }, { "epoch": 2.8103415266343648, "grad_norm": 1.8177307844161987, "learning_rate": 9.736871387708883e-06, "loss": 0.0676, "step": 190825 }, { "epoch": 2.8107097097244518, "grad_norm": 1.16116201877594, "learning_rate": 9.7352350157419e-06, "loss": 0.0592, "step": 190850 }, { "epoch": 2.811077892814539, "grad_norm": 1.3950963020324707, "learning_rate": 9.733598643774914e-06, "loss": 0.0591, "step": 190875 }, { "epoch": 2.811446075904626, "grad_norm": 1.4604241847991943, "learning_rate": 9.73196227180793e-06, "loss": 0.0623, "step": 190900 }, { "epoch": 2.811814258994713, "grad_norm": 1.6212618350982666, "learning_rate": 9.730325899840946e-06, "loss": 0.065, "step": 190925 }, { "epoch": 2.8121824420848, "grad_norm": 1.211201548576355, "learning_rate": 9.72868952787396e-06, "loss": 0.0585, "step": 190950 }, { "epoch": 2.812550625174887, "grad_norm": 1.1423178911209106, "learning_rate": 9.727053155906977e-06, "loss": 0.0554, "step": 190975 }, { "epoch": 2.812918808264974, "grad_norm": 0.9317471385002136, "learning_rate": 9.725416783939991e-06, "loss": 0.0626, "step": 191000 }, { "epoch": 2.813286991355061, "grad_norm": 1.9102895259857178, "learning_rate": 9.723780411973007e-06, "loss": 0.0574, "step": 191025 }, { "epoch": 2.813655174445148, "grad_norm": 1.346814751625061, "learning_rate": 9.722144040006023e-06, "loss": 0.0589, "step": 191050 }, { "epoch": 2.814023357535235, "grad_norm": 1.6709024906158447, "learning_rate": 9.720507668039038e-06, "loss": 0.0641, "step": 191075 }, { "epoch": 2.814391540625322, "grad_norm": 1.28024423122406, "learning_rate": 9.718871296072054e-06, "loss": 0.0578, "step": 191100 }, { "epoch": 2.814759723715409, "grad_norm": 1.3274121284484863, "learning_rate": 9.717234924105068e-06, "loss": 0.0573, "step": 191125 }, { "epoch": 2.815127906805496, "grad_norm": 1.5620265007019043, "learning_rate": 9.715598552138085e-06, "loss": 0.0617, "step": 191150 }, { "epoch": 2.8154960898955834, "grad_norm": 0.5231131911277771, "learning_rate": 9.7139621801711e-06, "loss": 0.051, "step": 191175 }, { "epoch": 2.8158642729856704, "grad_norm": 1.5454624891281128, "learning_rate": 9.712325808204115e-06, "loss": 0.0581, "step": 191200 }, { "epoch": 2.8162324560757575, "grad_norm": 0.9377264380455017, "learning_rate": 9.710689436237131e-06, "loss": 0.0558, "step": 191225 }, { "epoch": 2.8166006391658445, "grad_norm": 1.467632532119751, "learning_rate": 9.709053064270146e-06, "loss": 0.0579, "step": 191250 }, { "epoch": 2.8169688222559315, "grad_norm": 1.655173659324646, "learning_rate": 9.707416692303162e-06, "loss": 0.0616, "step": 191275 }, { "epoch": 2.8173370053460185, "grad_norm": 1.3735733032226562, "learning_rate": 9.705780320336176e-06, "loss": 0.0591, "step": 191300 }, { "epoch": 2.8177051884361055, "grad_norm": 1.1340259313583374, "learning_rate": 9.704143948369193e-06, "loss": 0.0587, "step": 191325 }, { "epoch": 2.8180733715261925, "grad_norm": 1.3795756101608276, "learning_rate": 9.702507576402209e-06, "loss": 0.0538, "step": 191350 }, { "epoch": 2.8184415546162795, "grad_norm": 0.8095846772193909, "learning_rate": 9.700871204435223e-06, "loss": 0.0554, "step": 191375 }, { "epoch": 2.8188097377063666, "grad_norm": 1.9934107065200806, "learning_rate": 9.699234832468238e-06, "loss": 0.0655, "step": 191400 }, { "epoch": 2.8191779207964536, "grad_norm": 1.474034309387207, "learning_rate": 9.697598460501254e-06, "loss": 0.0621, "step": 191425 }, { "epoch": 2.8195461038865406, "grad_norm": 0.9095655679702759, "learning_rate": 9.69596208853427e-06, "loss": 0.053, "step": 191450 }, { "epoch": 2.8199142869766276, "grad_norm": 0.9106353521347046, "learning_rate": 9.694325716567286e-06, "loss": 0.0551, "step": 191475 }, { "epoch": 2.8202824700667146, "grad_norm": 1.4420366287231445, "learning_rate": 9.6926893446003e-06, "loss": 0.0645, "step": 191500 }, { "epoch": 2.820650653156802, "grad_norm": 1.4797770977020264, "learning_rate": 9.691052972633315e-06, "loss": 0.0609, "step": 191525 }, { "epoch": 2.821018836246889, "grad_norm": 1.2653862237930298, "learning_rate": 9.689416600666331e-06, "loss": 0.0594, "step": 191550 }, { "epoch": 2.821387019336976, "grad_norm": 1.404115915298462, "learning_rate": 9.687780228699347e-06, "loss": 0.062, "step": 191575 }, { "epoch": 2.821755202427063, "grad_norm": 1.2101653814315796, "learning_rate": 9.686143856732362e-06, "loss": 0.0537, "step": 191600 }, { "epoch": 2.82212338551715, "grad_norm": 1.290461540222168, "learning_rate": 9.684507484765378e-06, "loss": 0.0535, "step": 191625 }, { "epoch": 2.822491568607237, "grad_norm": 1.5852519273757935, "learning_rate": 9.682936567677072e-06, "loss": 0.0689, "step": 191650 }, { "epoch": 2.822859751697324, "grad_norm": 1.3701965808868408, "learning_rate": 9.681300195710088e-06, "loss": 0.0636, "step": 191675 }, { "epoch": 2.823227934787411, "grad_norm": 1.1893665790557861, "learning_rate": 9.679663823743104e-06, "loss": 0.0517, "step": 191700 }, { "epoch": 2.823596117877498, "grad_norm": 1.3377516269683838, "learning_rate": 9.678027451776119e-06, "loss": 0.0552, "step": 191725 }, { "epoch": 2.823964300967585, "grad_norm": 1.5781387090682983, "learning_rate": 9.676391079809135e-06, "loss": 0.0565, "step": 191750 }, { "epoch": 2.8243324840576722, "grad_norm": 1.5812073945999146, "learning_rate": 9.67475470784215e-06, "loss": 0.0554, "step": 191775 }, { "epoch": 2.8247006671477592, "grad_norm": 1.8488749265670776, "learning_rate": 9.673118335875165e-06, "loss": 0.0596, "step": 191800 }, { "epoch": 2.8250688502378463, "grad_norm": 1.237788200378418, "learning_rate": 9.67148196390818e-06, "loss": 0.0626, "step": 191825 }, { "epoch": 2.8254370333279333, "grad_norm": 1.0844690799713135, "learning_rate": 9.669845591941196e-06, "loss": 0.0583, "step": 191850 }, { "epoch": 2.8258052164180203, "grad_norm": 1.0248757600784302, "learning_rate": 9.668209219974212e-06, "loss": 0.0657, "step": 191875 }, { "epoch": 2.8261733995081073, "grad_norm": 1.0006017684936523, "learning_rate": 9.666572848007227e-06, "loss": 0.0588, "step": 191900 }, { "epoch": 2.8265415825981943, "grad_norm": 1.8060781955718994, "learning_rate": 9.664936476040243e-06, "loss": 0.0618, "step": 191925 }, { "epoch": 2.8269097656882813, "grad_norm": 0.8489823341369629, "learning_rate": 9.663300104073257e-06, "loss": 0.0511, "step": 191950 }, { "epoch": 2.8272779487783684, "grad_norm": 1.4254478216171265, "learning_rate": 9.661663732106273e-06, "loss": 0.0586, "step": 191975 }, { "epoch": 2.8276461318684554, "grad_norm": 1.8723089694976807, "learning_rate": 9.66002736013929e-06, "loss": 0.0651, "step": 192000 }, { "epoch": 2.8280143149585424, "grad_norm": 1.5793136358261108, "learning_rate": 9.658390988172304e-06, "loss": 0.0566, "step": 192025 }, { "epoch": 2.8283824980486294, "grad_norm": 0.5374088883399963, "learning_rate": 9.65675461620532e-06, "loss": 0.0612, "step": 192050 }, { "epoch": 2.8287506811387164, "grad_norm": 0.8083250522613525, "learning_rate": 9.655118244238335e-06, "loss": 0.056, "step": 192075 }, { "epoch": 2.8291188642288034, "grad_norm": 2.0485692024230957, "learning_rate": 9.65348187227135e-06, "loss": 0.0553, "step": 192100 }, { "epoch": 2.829487047318891, "grad_norm": 1.5060235261917114, "learning_rate": 9.651845500304367e-06, "loss": 0.0591, "step": 192125 }, { "epoch": 2.829855230408978, "grad_norm": 1.2350915670394897, "learning_rate": 9.650209128337381e-06, "loss": 0.0597, "step": 192150 }, { "epoch": 2.830223413499065, "grad_norm": 1.228594422340393, "learning_rate": 9.648572756370397e-06, "loss": 0.0626, "step": 192175 }, { "epoch": 2.830591596589152, "grad_norm": 1.490804672241211, "learning_rate": 9.646936384403412e-06, "loss": 0.065, "step": 192200 }, { "epoch": 2.830959779679239, "grad_norm": 1.1721911430358887, "learning_rate": 9.645300012436428e-06, "loss": 0.0636, "step": 192225 }, { "epoch": 2.831327962769326, "grad_norm": 1.5919784307479858, "learning_rate": 9.643663640469443e-06, "loss": 0.0646, "step": 192250 }, { "epoch": 2.831696145859413, "grad_norm": 1.3846238851547241, "learning_rate": 9.642027268502459e-06, "loss": 0.0604, "step": 192275 }, { "epoch": 2.8320643289495, "grad_norm": 1.2938507795333862, "learning_rate": 9.640390896535475e-06, "loss": 0.0586, "step": 192300 }, { "epoch": 2.832432512039587, "grad_norm": 1.9460163116455078, "learning_rate": 9.63875452456849e-06, "loss": 0.0597, "step": 192325 }, { "epoch": 2.832800695129674, "grad_norm": 1.302515983581543, "learning_rate": 9.637118152601504e-06, "loss": 0.0601, "step": 192350 }, { "epoch": 2.833168878219761, "grad_norm": 1.4734759330749512, "learning_rate": 9.63548178063452e-06, "loss": 0.0626, "step": 192375 }, { "epoch": 2.833537061309848, "grad_norm": 1.3492851257324219, "learning_rate": 9.633845408667536e-06, "loss": 0.0581, "step": 192400 }, { "epoch": 2.833905244399935, "grad_norm": 1.4389796257019043, "learning_rate": 9.632209036700552e-06, "loss": 0.0604, "step": 192425 }, { "epoch": 2.834273427490022, "grad_norm": 1.309415578842163, "learning_rate": 9.630572664733567e-06, "loss": 0.0644, "step": 192450 }, { "epoch": 2.8346416105801095, "grad_norm": 1.527631402015686, "learning_rate": 9.628936292766581e-06, "loss": 0.0616, "step": 192475 }, { "epoch": 2.8350097936701966, "grad_norm": 1.1195049285888672, "learning_rate": 9.627299920799597e-06, "loss": 0.0597, "step": 192500 }, { "epoch": 2.8353779767602836, "grad_norm": 1.2835516929626465, "learning_rate": 9.625663548832613e-06, "loss": 0.0567, "step": 192525 }, { "epoch": 2.8357461598503706, "grad_norm": 1.5211371183395386, "learning_rate": 9.624027176865628e-06, "loss": 0.0539, "step": 192550 }, { "epoch": 2.8361143429404576, "grad_norm": 1.4089497327804565, "learning_rate": 9.622390804898644e-06, "loss": 0.06, "step": 192575 }, { "epoch": 2.8364825260305446, "grad_norm": 1.1427394151687622, "learning_rate": 9.62075443293166e-06, "loss": 0.0577, "step": 192600 }, { "epoch": 2.8368507091206316, "grad_norm": 1.4656684398651123, "learning_rate": 9.619118060964675e-06, "loss": 0.0575, "step": 192625 }, { "epoch": 2.8372188922107187, "grad_norm": 1.2018636465072632, "learning_rate": 9.61748168899769e-06, "loss": 0.0594, "step": 192650 }, { "epoch": 2.8375870753008057, "grad_norm": 1.888146162033081, "learning_rate": 9.615845317030705e-06, "loss": 0.0596, "step": 192675 }, { "epoch": 2.8379552583908927, "grad_norm": 1.2280315160751343, "learning_rate": 9.614208945063721e-06, "loss": 0.0596, "step": 192700 }, { "epoch": 2.8383234414809797, "grad_norm": 1.18323814868927, "learning_rate": 9.612572573096738e-06, "loss": 0.053, "step": 192725 }, { "epoch": 2.8386916245710667, "grad_norm": 1.4740184545516968, "learning_rate": 9.610936201129752e-06, "loss": 0.0569, "step": 192750 }, { "epoch": 2.8390598076611537, "grad_norm": 1.4008374214172363, "learning_rate": 9.609299829162766e-06, "loss": 0.0659, "step": 192775 }, { "epoch": 2.8394279907512407, "grad_norm": 1.2718358039855957, "learning_rate": 9.607663457195783e-06, "loss": 0.0608, "step": 192800 }, { "epoch": 2.8397961738413278, "grad_norm": 1.1847437620162964, "learning_rate": 9.606027085228799e-06, "loss": 0.0551, "step": 192825 }, { "epoch": 2.840164356931415, "grad_norm": 1.2949888706207275, "learning_rate": 9.604390713261815e-06, "loss": 0.0625, "step": 192850 }, { "epoch": 2.840532540021502, "grad_norm": 1.5082757472991943, "learning_rate": 9.60275434129483e-06, "loss": 0.0596, "step": 192875 }, { "epoch": 2.840900723111589, "grad_norm": 1.6205650568008423, "learning_rate": 9.601117969327844e-06, "loss": 0.0605, "step": 192900 }, { "epoch": 2.841268906201676, "grad_norm": 1.9189810752868652, "learning_rate": 9.59948159736086e-06, "loss": 0.0649, "step": 192925 }, { "epoch": 2.841637089291763, "grad_norm": 1.4988007545471191, "learning_rate": 9.597845225393876e-06, "loss": 0.0558, "step": 192950 }, { "epoch": 2.84200527238185, "grad_norm": 1.3511971235275269, "learning_rate": 9.59620885342689e-06, "loss": 0.0567, "step": 192975 }, { "epoch": 2.842373455471937, "grad_norm": 1.4849319458007812, "learning_rate": 9.594572481459907e-06, "loss": 0.0648, "step": 193000 }, { "epoch": 2.842741638562024, "grad_norm": 1.265085220336914, "learning_rate": 9.592936109492921e-06, "loss": 0.0598, "step": 193025 }, { "epoch": 2.843109821652111, "grad_norm": 1.3015947341918945, "learning_rate": 9.591299737525937e-06, "loss": 0.059, "step": 193050 }, { "epoch": 2.8434780047421984, "grad_norm": 1.3761097192764282, "learning_rate": 9.589663365558953e-06, "loss": 0.0505, "step": 193075 }, { "epoch": 2.8438461878322854, "grad_norm": 1.2499710321426392, "learning_rate": 9.588026993591968e-06, "loss": 0.0639, "step": 193100 }, { "epoch": 2.8442143709223724, "grad_norm": 0.9448404908180237, "learning_rate": 9.586390621624984e-06, "loss": 0.0584, "step": 193125 }, { "epoch": 2.8445825540124594, "grad_norm": 0.8087537884712219, "learning_rate": 9.584754249657999e-06, "loss": 0.0629, "step": 193150 }, { "epoch": 2.8449507371025464, "grad_norm": 1.224900245666504, "learning_rate": 9.583117877691015e-06, "loss": 0.0561, "step": 193175 }, { "epoch": 2.8453189201926334, "grad_norm": 1.2548829317092896, "learning_rate": 9.58148150572403e-06, "loss": 0.0615, "step": 193200 }, { "epoch": 2.8456871032827205, "grad_norm": 1.1911438703536987, "learning_rate": 9.579845133757045e-06, "loss": 0.0556, "step": 193225 }, { "epoch": 2.8460552863728075, "grad_norm": 1.4781550168991089, "learning_rate": 9.578208761790061e-06, "loss": 0.0599, "step": 193250 }, { "epoch": 2.8464234694628945, "grad_norm": 1.4878883361816406, "learning_rate": 9.576572389823076e-06, "loss": 0.0603, "step": 193275 }, { "epoch": 2.8467916525529815, "grad_norm": 1.0348962545394897, "learning_rate": 9.574936017856092e-06, "loss": 0.0594, "step": 193300 }, { "epoch": 2.8471598356430685, "grad_norm": 1.458094835281372, "learning_rate": 9.573299645889107e-06, "loss": 0.0619, "step": 193325 }, { "epoch": 2.8475280187331555, "grad_norm": 0.9041836857795715, "learning_rate": 9.571663273922123e-06, "loss": 0.0569, "step": 193350 }, { "epoch": 2.8478962018232425, "grad_norm": 1.3955724239349365, "learning_rate": 9.570026901955139e-06, "loss": 0.0633, "step": 193375 }, { "epoch": 2.8482643849133296, "grad_norm": 1.040228009223938, "learning_rate": 9.568390529988153e-06, "loss": 0.0605, "step": 193400 }, { "epoch": 2.8486325680034166, "grad_norm": 0.4016520082950592, "learning_rate": 9.56675415802117e-06, "loss": 0.061, "step": 193425 }, { "epoch": 2.849000751093504, "grad_norm": 1.5045928955078125, "learning_rate": 9.565117786054184e-06, "loss": 0.0589, "step": 193450 }, { "epoch": 2.849368934183591, "grad_norm": 1.043886661529541, "learning_rate": 9.5634814140872e-06, "loss": 0.0527, "step": 193475 }, { "epoch": 2.849737117273678, "grad_norm": 1.453575849533081, "learning_rate": 9.561845042120215e-06, "loss": 0.0613, "step": 193500 }, { "epoch": 2.850105300363765, "grad_norm": 1.272387146949768, "learning_rate": 9.56020867015323e-06, "loss": 0.0514, "step": 193525 }, { "epoch": 2.850473483453852, "grad_norm": 1.2844825983047485, "learning_rate": 9.558572298186247e-06, "loss": 0.0662, "step": 193550 }, { "epoch": 2.850841666543939, "grad_norm": 2.290719509124756, "learning_rate": 9.556935926219261e-06, "loss": 0.06, "step": 193575 }, { "epoch": 2.851209849634026, "grad_norm": 1.6012428998947144, "learning_rate": 9.555299554252277e-06, "loss": 0.0598, "step": 193600 }, { "epoch": 2.851578032724113, "grad_norm": 1.329693078994751, "learning_rate": 9.553663182285292e-06, "loss": 0.0666, "step": 193625 }, { "epoch": 2.8519462158142, "grad_norm": 1.206016182899475, "learning_rate": 9.552026810318308e-06, "loss": 0.0569, "step": 193650 }, { "epoch": 2.852314398904287, "grad_norm": 1.5623241662979126, "learning_rate": 9.550390438351324e-06, "loss": 0.0614, "step": 193675 }, { "epoch": 2.852682581994374, "grad_norm": 1.1449590921401978, "learning_rate": 9.548754066384339e-06, "loss": 0.0513, "step": 193700 }, { "epoch": 2.853050765084461, "grad_norm": 1.553481101989746, "learning_rate": 9.547117694417353e-06, "loss": 0.0553, "step": 193725 }, { "epoch": 2.853418948174548, "grad_norm": 1.6526141166687012, "learning_rate": 9.54548132245037e-06, "loss": 0.0574, "step": 193750 }, { "epoch": 2.8537871312646352, "grad_norm": 0.970827579498291, "learning_rate": 9.543844950483385e-06, "loss": 0.0504, "step": 193775 }, { "epoch": 2.8541553143547223, "grad_norm": 0.8908568620681763, "learning_rate": 9.542208578516402e-06, "loss": 0.0592, "step": 193800 }, { "epoch": 2.8545234974448093, "grad_norm": 1.5711696147918701, "learning_rate": 9.540572206549416e-06, "loss": 0.0597, "step": 193825 }, { "epoch": 2.8548916805348963, "grad_norm": 1.2680684328079224, "learning_rate": 9.53893583458243e-06, "loss": 0.0555, "step": 193850 }, { "epoch": 2.8552598636249833, "grad_norm": 1.4148808717727661, "learning_rate": 9.537299462615447e-06, "loss": 0.0617, "step": 193875 }, { "epoch": 2.8556280467150703, "grad_norm": 1.4733003377914429, "learning_rate": 9.535663090648463e-06, "loss": 0.0561, "step": 193900 }, { "epoch": 2.8559962298051573, "grad_norm": 1.8704921007156372, "learning_rate": 9.534026718681477e-06, "loss": 0.0557, "step": 193925 }, { "epoch": 2.8563644128952443, "grad_norm": 1.3796284198760986, "learning_rate": 9.532390346714493e-06, "loss": 0.0627, "step": 193950 }, { "epoch": 2.8567325959853314, "grad_norm": 1.5480225086212158, "learning_rate": 9.530753974747508e-06, "loss": 0.0536, "step": 193975 }, { "epoch": 2.8571007790754184, "grad_norm": 1.2552282810211182, "learning_rate": 9.529117602780524e-06, "loss": 0.0569, "step": 194000 }, { "epoch": 2.8574689621655054, "grad_norm": 1.0808730125427246, "learning_rate": 9.527481230813538e-06, "loss": 0.0578, "step": 194025 }, { "epoch": 2.857837145255593, "grad_norm": 1.4201253652572632, "learning_rate": 9.525844858846555e-06, "loss": 0.0571, "step": 194050 }, { "epoch": 2.85820532834568, "grad_norm": 1.670086145401001, "learning_rate": 9.52420848687957e-06, "loss": 0.0618, "step": 194075 }, { "epoch": 2.858573511435767, "grad_norm": 1.5384279489517212, "learning_rate": 9.522572114912585e-06, "loss": 0.0603, "step": 194100 }, { "epoch": 2.858941694525854, "grad_norm": 2.0621936321258545, "learning_rate": 9.520935742945601e-06, "loss": 0.0569, "step": 194125 }, { "epoch": 2.859309877615941, "grad_norm": 1.7698636054992676, "learning_rate": 9.519299370978616e-06, "loss": 0.0597, "step": 194150 }, { "epoch": 2.859678060706028, "grad_norm": 1.7682251930236816, "learning_rate": 9.517662999011632e-06, "loss": 0.0596, "step": 194175 }, { "epoch": 2.860046243796115, "grad_norm": 1.6107062101364136, "learning_rate": 9.516026627044648e-06, "loss": 0.0608, "step": 194200 }, { "epoch": 2.860414426886202, "grad_norm": 1.0938622951507568, "learning_rate": 9.514390255077664e-06, "loss": 0.0572, "step": 194225 }, { "epoch": 2.860782609976289, "grad_norm": 1.2379047870635986, "learning_rate": 9.512753883110679e-06, "loss": 0.0575, "step": 194250 }, { "epoch": 2.861150793066376, "grad_norm": 1.2361403703689575, "learning_rate": 9.511117511143693e-06, "loss": 0.0634, "step": 194275 }, { "epoch": 2.861518976156463, "grad_norm": 0.9031369686126709, "learning_rate": 9.50948113917671e-06, "loss": 0.0578, "step": 194300 }, { "epoch": 2.86188715924655, "grad_norm": 1.6158697605133057, "learning_rate": 9.507844767209725e-06, "loss": 0.0577, "step": 194325 }, { "epoch": 2.862255342336637, "grad_norm": 1.3848204612731934, "learning_rate": 9.50620839524274e-06, "loss": 0.0631, "step": 194350 }, { "epoch": 2.862623525426724, "grad_norm": 1.1341049671173096, "learning_rate": 9.504572023275756e-06, "loss": 0.0628, "step": 194375 }, { "epoch": 2.8629917085168115, "grad_norm": 1.663681983947754, "learning_rate": 9.50293565130877e-06, "loss": 0.0583, "step": 194400 }, { "epoch": 2.8633598916068985, "grad_norm": 1.6502104997634888, "learning_rate": 9.501299279341787e-06, "loss": 0.0617, "step": 194425 }, { "epoch": 2.8637280746969855, "grad_norm": 1.509641170501709, "learning_rate": 9.499662907374801e-06, "loss": 0.0594, "step": 194450 }, { "epoch": 2.8640962577870726, "grad_norm": 1.3160899877548218, "learning_rate": 9.498091990286497e-06, "loss": 0.0549, "step": 194475 }, { "epoch": 2.8644644408771596, "grad_norm": 1.005088448524475, "learning_rate": 9.496455618319513e-06, "loss": 0.0578, "step": 194500 }, { "epoch": 2.8648326239672466, "grad_norm": 1.698104977607727, "learning_rate": 9.494819246352527e-06, "loss": 0.0589, "step": 194525 }, { "epoch": 2.8652008070573336, "grad_norm": 0.8770014643669128, "learning_rate": 9.493182874385543e-06, "loss": 0.0618, "step": 194550 }, { "epoch": 2.8655689901474206, "grad_norm": 1.386398196220398, "learning_rate": 9.491546502418558e-06, "loss": 0.0615, "step": 194575 }, { "epoch": 2.8659371732375076, "grad_norm": 1.7904797792434692, "learning_rate": 9.489910130451574e-06, "loss": 0.0643, "step": 194600 }, { "epoch": 2.8663053563275946, "grad_norm": 1.0641735792160034, "learning_rate": 9.48827375848459e-06, "loss": 0.0583, "step": 194625 }, { "epoch": 2.8666735394176817, "grad_norm": 1.3294438123703003, "learning_rate": 9.486637386517605e-06, "loss": 0.0579, "step": 194650 }, { "epoch": 2.8670417225077687, "grad_norm": 1.261055827140808, "learning_rate": 9.48500101455062e-06, "loss": 0.0602, "step": 194675 }, { "epoch": 2.8674099055978557, "grad_norm": 1.0894662141799927, "learning_rate": 9.483364642583635e-06, "loss": 0.0589, "step": 194700 }, { "epoch": 2.8677780886879427, "grad_norm": 1.9460142850875854, "learning_rate": 9.481728270616651e-06, "loss": 0.0592, "step": 194725 }, { "epoch": 2.8681462717780297, "grad_norm": 1.5306462049484253, "learning_rate": 9.480091898649668e-06, "loss": 0.0643, "step": 194750 }, { "epoch": 2.8685144548681167, "grad_norm": 1.7471768856048584, "learning_rate": 9.478455526682682e-06, "loss": 0.0592, "step": 194775 }, { "epoch": 2.8688826379582038, "grad_norm": 1.1541129350662231, "learning_rate": 9.476819154715697e-06, "loss": 0.0547, "step": 194800 }, { "epoch": 2.8692508210482908, "grad_norm": 1.23127281665802, "learning_rate": 9.475182782748713e-06, "loss": 0.0625, "step": 194825 }, { "epoch": 2.869619004138378, "grad_norm": 1.8355498313903809, "learning_rate": 9.473546410781729e-06, "loss": 0.0646, "step": 194850 }, { "epoch": 2.869987187228465, "grad_norm": 1.1252093315124512, "learning_rate": 9.471910038814743e-06, "loss": 0.0601, "step": 194875 }, { "epoch": 2.870355370318552, "grad_norm": 1.7468353509902954, "learning_rate": 9.47027366684776e-06, "loss": 0.0624, "step": 194900 }, { "epoch": 2.870723553408639, "grad_norm": 1.70834219455719, "learning_rate": 9.468637294880776e-06, "loss": 0.0636, "step": 194925 }, { "epoch": 2.871091736498726, "grad_norm": 1.8979442119598389, "learning_rate": 9.46700092291379e-06, "loss": 0.0595, "step": 194950 }, { "epoch": 2.871459919588813, "grad_norm": 1.704552412033081, "learning_rate": 9.465364550946805e-06, "loss": 0.0559, "step": 194975 }, { "epoch": 2.8718281026789003, "grad_norm": 1.685447335243225, "learning_rate": 9.46372817897982e-06, "loss": 0.0633, "step": 195000 }, { "epoch": 2.8721962857689873, "grad_norm": 1.3126622438430786, "learning_rate": 9.462091807012837e-06, "loss": 0.0629, "step": 195025 }, { "epoch": 2.8725644688590743, "grad_norm": 1.6387419700622559, "learning_rate": 9.460455435045853e-06, "loss": 0.0617, "step": 195050 }, { "epoch": 2.8729326519491614, "grad_norm": 1.2038549184799194, "learning_rate": 9.458819063078867e-06, "loss": 0.0581, "step": 195075 }, { "epoch": 2.8733008350392484, "grad_norm": 1.4955627918243408, "learning_rate": 9.457182691111882e-06, "loss": 0.0631, "step": 195100 }, { "epoch": 2.8736690181293354, "grad_norm": 2.1704089641571045, "learning_rate": 9.455546319144898e-06, "loss": 0.0614, "step": 195125 }, { "epoch": 2.8740372012194224, "grad_norm": 0.9554424285888672, "learning_rate": 9.453909947177914e-06, "loss": 0.0629, "step": 195150 }, { "epoch": 2.8744053843095094, "grad_norm": 1.3388690948486328, "learning_rate": 9.45227357521093e-06, "loss": 0.059, "step": 195175 }, { "epoch": 2.8747735673995964, "grad_norm": 1.060782551765442, "learning_rate": 9.450637203243945e-06, "loss": 0.0617, "step": 195200 }, { "epoch": 2.8751417504896835, "grad_norm": 1.4692620038986206, "learning_rate": 9.44900083127696e-06, "loss": 0.0555, "step": 195225 }, { "epoch": 2.8755099335797705, "grad_norm": 1.1032123565673828, "learning_rate": 9.447364459309975e-06, "loss": 0.0612, "step": 195250 }, { "epoch": 2.8758781166698575, "grad_norm": 1.2734709978103638, "learning_rate": 9.445728087342992e-06, "loss": 0.0573, "step": 195275 }, { "epoch": 2.8762462997599445, "grad_norm": 1.4377365112304688, "learning_rate": 9.444091715376006e-06, "loss": 0.0535, "step": 195300 }, { "epoch": 2.8766144828500315, "grad_norm": 1.3288897275924683, "learning_rate": 9.442455343409022e-06, "loss": 0.0546, "step": 195325 }, { "epoch": 2.876982665940119, "grad_norm": 1.4181761741638184, "learning_rate": 9.440818971442037e-06, "loss": 0.0606, "step": 195350 }, { "epoch": 2.877350849030206, "grad_norm": 1.2221509218215942, "learning_rate": 9.439182599475053e-06, "loss": 0.0523, "step": 195375 }, { "epoch": 2.877719032120293, "grad_norm": 0.9942285418510437, "learning_rate": 9.437546227508067e-06, "loss": 0.0583, "step": 195400 }, { "epoch": 2.87808721521038, "grad_norm": 1.4243788719177246, "learning_rate": 9.435909855541083e-06, "loss": 0.0582, "step": 195425 }, { "epoch": 2.878455398300467, "grad_norm": 1.2714022397994995, "learning_rate": 9.4342734835741e-06, "loss": 0.0633, "step": 195450 }, { "epoch": 2.878823581390554, "grad_norm": 1.3662664890289307, "learning_rate": 9.432637111607114e-06, "loss": 0.0602, "step": 195475 }, { "epoch": 2.879191764480641, "grad_norm": 1.4568315744400024, "learning_rate": 9.431000739640128e-06, "loss": 0.0698, "step": 195500 }, { "epoch": 2.879559947570728, "grad_norm": 1.3788787126541138, "learning_rate": 9.429364367673145e-06, "loss": 0.0584, "step": 195525 }, { "epoch": 2.879928130660815, "grad_norm": 1.4465004205703735, "learning_rate": 9.42772799570616e-06, "loss": 0.0563, "step": 195550 }, { "epoch": 2.880296313750902, "grad_norm": 1.3699222803115845, "learning_rate": 9.426091623739177e-06, "loss": 0.0591, "step": 195575 }, { "epoch": 2.880664496840989, "grad_norm": 1.3015691041946411, "learning_rate": 9.424455251772191e-06, "loss": 0.06, "step": 195600 }, { "epoch": 2.881032679931076, "grad_norm": 1.9338303804397583, "learning_rate": 9.422818879805208e-06, "loss": 0.0577, "step": 195625 }, { "epoch": 2.881400863021163, "grad_norm": 1.7295316457748413, "learning_rate": 9.421182507838222e-06, "loss": 0.0693, "step": 195650 }, { "epoch": 2.88176904611125, "grad_norm": 1.2147678136825562, "learning_rate": 9.419546135871238e-06, "loss": 0.0608, "step": 195675 }, { "epoch": 2.882137229201337, "grad_norm": 0.9938950538635254, "learning_rate": 9.417909763904254e-06, "loss": 0.0562, "step": 195700 }, { "epoch": 2.882505412291424, "grad_norm": 1.5220997333526611, "learning_rate": 9.416273391937269e-06, "loss": 0.0556, "step": 195725 }, { "epoch": 2.8828735953815112, "grad_norm": 1.557158350944519, "learning_rate": 9.414637019970285e-06, "loss": 0.0599, "step": 195750 }, { "epoch": 2.8832417784715982, "grad_norm": 1.4891746044158936, "learning_rate": 9.4130006480033e-06, "loss": 0.0526, "step": 195775 }, { "epoch": 2.8836099615616853, "grad_norm": 1.334468126296997, "learning_rate": 9.411364276036315e-06, "loss": 0.0585, "step": 195800 }, { "epoch": 2.8839781446517723, "grad_norm": 0.9822672009468079, "learning_rate": 9.40972790406933e-06, "loss": 0.0563, "step": 195825 }, { "epoch": 2.8843463277418593, "grad_norm": 1.0776010751724243, "learning_rate": 9.408091532102346e-06, "loss": 0.053, "step": 195850 }, { "epoch": 2.8847145108319463, "grad_norm": 1.5215586423873901, "learning_rate": 9.406455160135362e-06, "loss": 0.059, "step": 195875 }, { "epoch": 2.8850826939220333, "grad_norm": 1.5357248783111572, "learning_rate": 9.404818788168377e-06, "loss": 0.0637, "step": 195900 }, { "epoch": 2.8854508770121203, "grad_norm": 1.2864866256713867, "learning_rate": 9.403182416201391e-06, "loss": 0.056, "step": 195925 }, { "epoch": 2.885819060102208, "grad_norm": 1.7680096626281738, "learning_rate": 9.401546044234407e-06, "loss": 0.0611, "step": 195950 }, { "epoch": 2.886187243192295, "grad_norm": 1.2400763034820557, "learning_rate": 9.399909672267423e-06, "loss": 0.0527, "step": 195975 }, { "epoch": 2.886555426282382, "grad_norm": 1.0655015707015991, "learning_rate": 9.39827330030044e-06, "loss": 0.0686, "step": 196000 }, { "epoch": 2.886923609372469, "grad_norm": 1.801745057106018, "learning_rate": 9.396636928333454e-06, "loss": 0.065, "step": 196025 }, { "epoch": 2.887291792462556, "grad_norm": 1.1378285884857178, "learning_rate": 9.395000556366469e-06, "loss": 0.0592, "step": 196050 }, { "epoch": 2.887659975552643, "grad_norm": 1.8716405630111694, "learning_rate": 9.393364184399485e-06, "loss": 0.0656, "step": 196075 }, { "epoch": 2.88802815864273, "grad_norm": 1.603352427482605, "learning_rate": 9.3917278124325e-06, "loss": 0.0562, "step": 196100 }, { "epoch": 2.888396341732817, "grad_norm": 1.1756269931793213, "learning_rate": 9.390091440465517e-06, "loss": 0.0566, "step": 196125 }, { "epoch": 2.888764524822904, "grad_norm": 1.252945899963379, "learning_rate": 9.388455068498531e-06, "loss": 0.0642, "step": 196150 }, { "epoch": 2.889132707912991, "grad_norm": 1.6491494178771973, "learning_rate": 9.386818696531546e-06, "loss": 0.061, "step": 196175 }, { "epoch": 2.889500891003078, "grad_norm": 1.4682763814926147, "learning_rate": 9.385182324564562e-06, "loss": 0.0612, "step": 196200 }, { "epoch": 2.889869074093165, "grad_norm": 1.8137708902359009, "learning_rate": 9.383545952597578e-06, "loss": 0.0563, "step": 196225 }, { "epoch": 2.890237257183252, "grad_norm": 1.2542399168014526, "learning_rate": 9.381909580630593e-06, "loss": 0.0631, "step": 196250 }, { "epoch": 2.890605440273339, "grad_norm": 1.7905925512313843, "learning_rate": 9.380273208663609e-06, "loss": 0.0564, "step": 196275 }, { "epoch": 2.890973623363426, "grad_norm": 1.4301177263259888, "learning_rate": 9.378636836696623e-06, "loss": 0.0633, "step": 196300 }, { "epoch": 2.8913418064535135, "grad_norm": 1.5495744943618774, "learning_rate": 9.37700046472964e-06, "loss": 0.0541, "step": 196325 }, { "epoch": 2.8917099895436005, "grad_norm": 1.355925440788269, "learning_rate": 9.375364092762654e-06, "loss": 0.0589, "step": 196350 }, { "epoch": 2.8920781726336875, "grad_norm": 1.479114055633545, "learning_rate": 9.37372772079567e-06, "loss": 0.0607, "step": 196375 }, { "epoch": 2.8924463557237745, "grad_norm": 1.381609320640564, "learning_rate": 9.372091348828686e-06, "loss": 0.0608, "step": 196400 }, { "epoch": 2.8928145388138615, "grad_norm": 0.8497568368911743, "learning_rate": 9.3704549768617e-06, "loss": 0.0556, "step": 196425 }, { "epoch": 2.8931827219039485, "grad_norm": 1.6425498723983765, "learning_rate": 9.368818604894717e-06, "loss": 0.0596, "step": 196450 }, { "epoch": 2.8935509049940356, "grad_norm": 1.2890487909317017, "learning_rate": 9.367182232927731e-06, "loss": 0.0525, "step": 196475 }, { "epoch": 2.8939190880841226, "grad_norm": 1.6077224016189575, "learning_rate": 9.365545860960747e-06, "loss": 0.0558, "step": 196500 }, { "epoch": 2.8942872711742096, "grad_norm": 1.4123090505599976, "learning_rate": 9.363909488993764e-06, "loss": 0.0582, "step": 196525 }, { "epoch": 2.8946554542642966, "grad_norm": 1.333767056465149, "learning_rate": 9.36227311702678e-06, "loss": 0.0609, "step": 196550 }, { "epoch": 2.8950236373543836, "grad_norm": 0.9862918257713318, "learning_rate": 9.360636745059794e-06, "loss": 0.0604, "step": 196575 }, { "epoch": 2.8953918204444706, "grad_norm": 1.1639968156814575, "learning_rate": 9.359000373092809e-06, "loss": 0.0567, "step": 196600 }, { "epoch": 2.8957600035345576, "grad_norm": 1.0329935550689697, "learning_rate": 9.357364001125825e-06, "loss": 0.0574, "step": 196625 }, { "epoch": 2.8961281866246447, "grad_norm": 1.156352162361145, "learning_rate": 9.355727629158841e-06, "loss": 0.0612, "step": 196650 }, { "epoch": 2.8964963697147317, "grad_norm": 1.3425453901290894, "learning_rate": 9.354091257191855e-06, "loss": 0.0593, "step": 196675 }, { "epoch": 2.8968645528048187, "grad_norm": 1.839534878730774, "learning_rate": 9.352454885224872e-06, "loss": 0.0631, "step": 196700 }, { "epoch": 2.8972327358949057, "grad_norm": 1.8900396823883057, "learning_rate": 9.350818513257886e-06, "loss": 0.056, "step": 196725 }, { "epoch": 2.8976009189849927, "grad_norm": 0.9381417036056519, "learning_rate": 9.349182141290902e-06, "loss": 0.052, "step": 196750 }, { "epoch": 2.8979691020750797, "grad_norm": 1.9540637731552124, "learning_rate": 9.347545769323917e-06, "loss": 0.0603, "step": 196775 }, { "epoch": 2.8983372851651668, "grad_norm": 1.345794677734375, "learning_rate": 9.345909397356933e-06, "loss": 0.056, "step": 196800 }, { "epoch": 2.8987054682552538, "grad_norm": 1.263490915298462, "learning_rate": 9.344273025389949e-06, "loss": 0.054, "step": 196825 }, { "epoch": 2.899073651345341, "grad_norm": 1.1459684371948242, "learning_rate": 9.342636653422963e-06, "loss": 0.0543, "step": 196850 }, { "epoch": 2.899441834435428, "grad_norm": 1.0339220762252808, "learning_rate": 9.341000281455978e-06, "loss": 0.0596, "step": 196875 }, { "epoch": 2.899810017525515, "grad_norm": 1.0365582704544067, "learning_rate": 9.339363909488994e-06, "loss": 0.0649, "step": 196900 }, { "epoch": 2.9001782006156023, "grad_norm": 1.3709920644760132, "learning_rate": 9.33772753752201e-06, "loss": 0.0697, "step": 196925 }, { "epoch": 2.9005463837056893, "grad_norm": 0.9024575352668762, "learning_rate": 9.336091165555026e-06, "loss": 0.0477, "step": 196950 }, { "epoch": 2.9009145667957763, "grad_norm": 1.5772453546524048, "learning_rate": 9.33445479358804e-06, "loss": 0.0617, "step": 196975 }, { "epoch": 2.9012827498858633, "grad_norm": 1.1659525632858276, "learning_rate": 9.332818421621055e-06, "loss": 0.056, "step": 197000 }, { "epoch": 2.9016509329759503, "grad_norm": 1.7544668912887573, "learning_rate": 9.331182049654071e-06, "loss": 0.0618, "step": 197025 }, { "epoch": 2.9020191160660374, "grad_norm": 1.6566730737686157, "learning_rate": 9.329545677687087e-06, "loss": 0.058, "step": 197050 }, { "epoch": 2.9023872991561244, "grad_norm": 1.152753472328186, "learning_rate": 9.327909305720104e-06, "loss": 0.063, "step": 197075 }, { "epoch": 2.9027554822462114, "grad_norm": 1.2455028295516968, "learning_rate": 9.326272933753118e-06, "loss": 0.0614, "step": 197100 }, { "epoch": 2.9031236653362984, "grad_norm": 1.3942725658416748, "learning_rate": 9.324636561786134e-06, "loss": 0.055, "step": 197125 }, { "epoch": 2.9034918484263854, "grad_norm": 1.179827094078064, "learning_rate": 9.323000189819149e-06, "loss": 0.0592, "step": 197150 }, { "epoch": 2.9038600315164724, "grad_norm": 1.185843825340271, "learning_rate": 9.321363817852165e-06, "loss": 0.0595, "step": 197175 }, { "epoch": 2.9042282146065594, "grad_norm": 1.4101228713989258, "learning_rate": 9.31972744588518e-06, "loss": 0.0604, "step": 197200 }, { "epoch": 2.9045963976966465, "grad_norm": 1.3172492980957031, "learning_rate": 9.318091073918195e-06, "loss": 0.0562, "step": 197225 }, { "epoch": 2.9049645807867335, "grad_norm": 1.3225313425064087, "learning_rate": 9.316454701951212e-06, "loss": 0.0604, "step": 197250 }, { "epoch": 2.905332763876821, "grad_norm": 1.6840494871139526, "learning_rate": 9.314818329984226e-06, "loss": 0.0543, "step": 197275 }, { "epoch": 2.905700946966908, "grad_norm": 1.3955174684524536, "learning_rate": 9.31318195801724e-06, "loss": 0.0597, "step": 197300 }, { "epoch": 2.906069130056995, "grad_norm": 1.1126434803009033, "learning_rate": 9.311545586050257e-06, "loss": 0.0571, "step": 197325 }, { "epoch": 2.906437313147082, "grad_norm": 1.7837954759597778, "learning_rate": 9.309909214083273e-06, "loss": 0.0552, "step": 197350 }, { "epoch": 2.906805496237169, "grad_norm": 1.4580146074295044, "learning_rate": 9.308272842116289e-06, "loss": 0.0573, "step": 197375 }, { "epoch": 2.907173679327256, "grad_norm": 1.662578821182251, "learning_rate": 9.306636470149303e-06, "loss": 0.0571, "step": 197400 }, { "epoch": 2.907541862417343, "grad_norm": 1.2120906114578247, "learning_rate": 9.305000098182318e-06, "loss": 0.0608, "step": 197425 }, { "epoch": 2.90791004550743, "grad_norm": 1.644189715385437, "learning_rate": 9.303363726215334e-06, "loss": 0.0601, "step": 197450 }, { "epoch": 2.908278228597517, "grad_norm": 1.4400080442428589, "learning_rate": 9.30172735424835e-06, "loss": 0.0661, "step": 197475 }, { "epoch": 2.908646411687604, "grad_norm": 1.6892335414886475, "learning_rate": 9.300090982281366e-06, "loss": 0.0634, "step": 197500 }, { "epoch": 2.909014594777691, "grad_norm": 1.0236717462539673, "learning_rate": 9.29845461031438e-06, "loss": 0.0546, "step": 197525 }, { "epoch": 2.909382777867778, "grad_norm": 1.3156410455703735, "learning_rate": 9.296818238347395e-06, "loss": 0.0505, "step": 197550 }, { "epoch": 2.909750960957865, "grad_norm": 0.8439908027648926, "learning_rate": 9.295181866380411e-06, "loss": 0.0625, "step": 197575 }, { "epoch": 2.910119144047952, "grad_norm": 1.2165777683258057, "learning_rate": 9.293545494413428e-06, "loss": 0.052, "step": 197600 }, { "epoch": 2.910487327138039, "grad_norm": 1.3894983530044556, "learning_rate": 9.291909122446442e-06, "loss": 0.0577, "step": 197625 }, { "epoch": 2.910855510228126, "grad_norm": 1.21957528591156, "learning_rate": 9.290338205358138e-06, "loss": 0.0574, "step": 197650 }, { "epoch": 2.911223693318213, "grad_norm": 1.7782771587371826, "learning_rate": 9.288701833391152e-06, "loss": 0.06, "step": 197675 }, { "epoch": 2.9115918764083, "grad_norm": 1.6327035427093506, "learning_rate": 9.287065461424168e-06, "loss": 0.0569, "step": 197700 }, { "epoch": 2.911960059498387, "grad_norm": 1.442806363105774, "learning_rate": 9.285429089457183e-06, "loss": 0.0612, "step": 197725 }, { "epoch": 2.9123282425884742, "grad_norm": 1.6058634519577026, "learning_rate": 9.283792717490199e-06, "loss": 0.0602, "step": 197750 }, { "epoch": 2.9126964256785612, "grad_norm": 1.7758101224899292, "learning_rate": 9.282156345523215e-06, "loss": 0.0549, "step": 197775 }, { "epoch": 2.9130646087686483, "grad_norm": 1.0256634950637817, "learning_rate": 9.28051997355623e-06, "loss": 0.054, "step": 197800 }, { "epoch": 2.9134327918587353, "grad_norm": 1.1333004236221313, "learning_rate": 9.278883601589244e-06, "loss": 0.0582, "step": 197825 }, { "epoch": 2.9138009749488223, "grad_norm": 1.649069905281067, "learning_rate": 9.27724722962226e-06, "loss": 0.0603, "step": 197850 }, { "epoch": 2.9141691580389097, "grad_norm": 1.7726221084594727, "learning_rate": 9.275610857655276e-06, "loss": 0.0603, "step": 197875 }, { "epoch": 2.9145373411289968, "grad_norm": 0.7078713178634644, "learning_rate": 9.273974485688292e-06, "loss": 0.058, "step": 197900 }, { "epoch": 2.9149055242190838, "grad_norm": 1.1463229656219482, "learning_rate": 9.272338113721307e-06, "loss": 0.0632, "step": 197925 }, { "epoch": 2.915273707309171, "grad_norm": 1.6430611610412598, "learning_rate": 9.270701741754323e-06, "loss": 0.0608, "step": 197950 }, { "epoch": 2.915641890399258, "grad_norm": 1.5461478233337402, "learning_rate": 9.269065369787337e-06, "loss": 0.0612, "step": 197975 }, { "epoch": 2.916010073489345, "grad_norm": 1.0525239706039429, "learning_rate": 9.267428997820354e-06, "loss": 0.055, "step": 198000 }, { "epoch": 2.916378256579432, "grad_norm": 1.4291107654571533, "learning_rate": 9.26579262585337e-06, "loss": 0.054, "step": 198025 }, { "epoch": 2.916746439669519, "grad_norm": 0.988843560218811, "learning_rate": 9.264156253886384e-06, "loss": 0.0608, "step": 198050 }, { "epoch": 2.917114622759606, "grad_norm": 1.3206547498703003, "learning_rate": 9.2625198819194e-06, "loss": 0.0537, "step": 198075 }, { "epoch": 2.917482805849693, "grad_norm": 1.5068535804748535, "learning_rate": 9.260883509952415e-06, "loss": 0.0538, "step": 198100 }, { "epoch": 2.91785098893978, "grad_norm": 1.1686986684799194, "learning_rate": 9.259247137985431e-06, "loss": 0.063, "step": 198125 }, { "epoch": 2.918219172029867, "grad_norm": 1.5856209993362427, "learning_rate": 9.257610766018445e-06, "loss": 0.0515, "step": 198150 }, { "epoch": 2.918587355119954, "grad_norm": 1.4969663619995117, "learning_rate": 9.255974394051462e-06, "loss": 0.0587, "step": 198175 }, { "epoch": 2.918955538210041, "grad_norm": 1.4149047136306763, "learning_rate": 9.254338022084478e-06, "loss": 0.0609, "step": 198200 }, { "epoch": 2.9193237213001284, "grad_norm": 1.1060090065002441, "learning_rate": 9.252701650117492e-06, "loss": 0.0565, "step": 198225 }, { "epoch": 2.9196919043902154, "grad_norm": 1.0932490825653076, "learning_rate": 9.251065278150507e-06, "loss": 0.0558, "step": 198250 }, { "epoch": 2.9200600874803024, "grad_norm": 1.1115100383758545, "learning_rate": 9.249428906183523e-06, "loss": 0.0587, "step": 198275 }, { "epoch": 2.9204282705703895, "grad_norm": 1.6504942178726196, "learning_rate": 9.247792534216539e-06, "loss": 0.0586, "step": 198300 }, { "epoch": 2.9207964536604765, "grad_norm": 1.6405014991760254, "learning_rate": 9.246156162249555e-06, "loss": 0.0609, "step": 198325 }, { "epoch": 2.9211646367505635, "grad_norm": 1.4614890813827515, "learning_rate": 9.24451979028257e-06, "loss": 0.0601, "step": 198350 }, { "epoch": 2.9215328198406505, "grad_norm": 1.1469447612762451, "learning_rate": 9.242883418315584e-06, "loss": 0.0569, "step": 198375 }, { "epoch": 2.9219010029307375, "grad_norm": 1.199053406715393, "learning_rate": 9.2412470463486e-06, "loss": 0.057, "step": 198400 }, { "epoch": 2.9222691860208245, "grad_norm": 1.3672250509262085, "learning_rate": 9.239610674381616e-06, "loss": 0.0545, "step": 198425 }, { "epoch": 2.9226373691109115, "grad_norm": 1.0338282585144043, "learning_rate": 9.237974302414632e-06, "loss": 0.0598, "step": 198450 }, { "epoch": 2.9230055522009986, "grad_norm": 1.2343991994857788, "learning_rate": 9.236337930447647e-06, "loss": 0.058, "step": 198475 }, { "epoch": 2.9233737352910856, "grad_norm": 1.6773756742477417, "learning_rate": 9.234701558480661e-06, "loss": 0.0596, "step": 198500 }, { "epoch": 2.9237419183811726, "grad_norm": 1.4384047985076904, "learning_rate": 9.233065186513677e-06, "loss": 0.061, "step": 198525 }, { "epoch": 2.9241101014712596, "grad_norm": 1.8510615825653076, "learning_rate": 9.231428814546694e-06, "loss": 0.0583, "step": 198550 }, { "epoch": 2.9244782845613466, "grad_norm": 1.3557552099227905, "learning_rate": 9.229792442579708e-06, "loss": 0.0562, "step": 198575 }, { "epoch": 2.9248464676514336, "grad_norm": 1.180547833442688, "learning_rate": 9.228156070612724e-06, "loss": 0.0526, "step": 198600 }, { "epoch": 2.9252146507415207, "grad_norm": 1.7722705602645874, "learning_rate": 9.226519698645739e-06, "loss": 0.0636, "step": 198625 }, { "epoch": 2.9255828338316077, "grad_norm": 0.9078608751296997, "learning_rate": 9.224883326678755e-06, "loss": 0.0607, "step": 198650 }, { "epoch": 2.9259510169216947, "grad_norm": 1.3928896188735962, "learning_rate": 9.22324695471177e-06, "loss": 0.0588, "step": 198675 }, { "epoch": 2.9263192000117817, "grad_norm": 1.5424017906188965, "learning_rate": 9.221610582744785e-06, "loss": 0.0616, "step": 198700 }, { "epoch": 2.9266873831018687, "grad_norm": 1.3789658546447754, "learning_rate": 9.219974210777802e-06, "loss": 0.0584, "step": 198725 }, { "epoch": 2.9270555661919557, "grad_norm": 0.9868493676185608, "learning_rate": 9.218337838810816e-06, "loss": 0.0649, "step": 198750 }, { "epoch": 2.9274237492820427, "grad_norm": 0.9839689135551453, "learning_rate": 9.216701466843832e-06, "loss": 0.0616, "step": 198775 }, { "epoch": 2.9277919323721298, "grad_norm": 1.1599619388580322, "learning_rate": 9.215065094876847e-06, "loss": 0.0574, "step": 198800 }, { "epoch": 2.928160115462217, "grad_norm": 1.0458452701568604, "learning_rate": 9.213428722909863e-06, "loss": 0.0501, "step": 198825 }, { "epoch": 2.9285282985523042, "grad_norm": 1.0514401197433472, "learning_rate": 9.211792350942879e-06, "loss": 0.0658, "step": 198850 }, { "epoch": 2.9288964816423912, "grad_norm": 1.8742988109588623, "learning_rate": 9.210155978975893e-06, "loss": 0.0627, "step": 198875 }, { "epoch": 2.9292646647324783, "grad_norm": 1.4387950897216797, "learning_rate": 9.20851960700891e-06, "loss": 0.0605, "step": 198900 }, { "epoch": 2.9296328478225653, "grad_norm": 0.8504129648208618, "learning_rate": 9.206883235041924e-06, "loss": 0.0612, "step": 198925 }, { "epoch": 2.9300010309126523, "grad_norm": 0.9518362283706665, "learning_rate": 9.20524686307494e-06, "loss": 0.0659, "step": 198950 }, { "epoch": 2.9303692140027393, "grad_norm": 1.277635931968689, "learning_rate": 9.203610491107956e-06, "loss": 0.0616, "step": 198975 }, { "epoch": 2.9307373970928263, "grad_norm": 1.2637767791748047, "learning_rate": 9.20197411914097e-06, "loss": 0.0636, "step": 199000 }, { "epoch": 2.9311055801829133, "grad_norm": 1.6286725997924805, "learning_rate": 9.200337747173987e-06, "loss": 0.0535, "step": 199025 }, { "epoch": 2.9314737632730004, "grad_norm": 1.6246912479400635, "learning_rate": 9.198701375207001e-06, "loss": 0.0609, "step": 199050 }, { "epoch": 2.9318419463630874, "grad_norm": 1.269919514656067, "learning_rate": 9.197065003240018e-06, "loss": 0.0646, "step": 199075 }, { "epoch": 2.9322101294531744, "grad_norm": 1.518126368522644, "learning_rate": 9.195428631273032e-06, "loss": 0.0554, "step": 199100 }, { "epoch": 2.9325783125432614, "grad_norm": 1.5499554872512817, "learning_rate": 9.193792259306048e-06, "loss": 0.064, "step": 199125 }, { "epoch": 2.9329464956333484, "grad_norm": 1.7440624237060547, "learning_rate": 9.192155887339064e-06, "loss": 0.0578, "step": 199150 }, { "epoch": 2.9333146787234354, "grad_norm": 1.2438544034957886, "learning_rate": 9.190519515372079e-06, "loss": 0.0614, "step": 199175 }, { "epoch": 2.933682861813523, "grad_norm": 1.824089527130127, "learning_rate": 9.188883143405093e-06, "loss": 0.0548, "step": 199200 }, { "epoch": 2.93405104490361, "grad_norm": 1.448590636253357, "learning_rate": 9.18724677143811e-06, "loss": 0.0616, "step": 199225 }, { "epoch": 2.934419227993697, "grad_norm": 1.3385372161865234, "learning_rate": 9.185610399471126e-06, "loss": 0.0615, "step": 199250 }, { "epoch": 2.934787411083784, "grad_norm": 1.1906473636627197, "learning_rate": 9.183974027504142e-06, "loss": 0.0554, "step": 199275 }, { "epoch": 2.935155594173871, "grad_norm": 1.8147873878479004, "learning_rate": 9.182337655537156e-06, "loss": 0.0562, "step": 199300 }, { "epoch": 2.935523777263958, "grad_norm": 1.5189435482025146, "learning_rate": 9.18070128357017e-06, "loss": 0.0559, "step": 199325 }, { "epoch": 2.935891960354045, "grad_norm": 1.525354027748108, "learning_rate": 9.179064911603187e-06, "loss": 0.0564, "step": 199350 }, { "epoch": 2.936260143444132, "grad_norm": 1.9442694187164307, "learning_rate": 9.177428539636203e-06, "loss": 0.0565, "step": 199375 }, { "epoch": 2.936628326534219, "grad_norm": 1.1037707328796387, "learning_rate": 9.175792167669217e-06, "loss": 0.0606, "step": 199400 }, { "epoch": 2.936996509624306, "grad_norm": 1.0446170568466187, "learning_rate": 9.174155795702234e-06, "loss": 0.0557, "step": 199425 }, { "epoch": 2.937364692714393, "grad_norm": 1.5533865690231323, "learning_rate": 9.172519423735248e-06, "loss": 0.0559, "step": 199450 }, { "epoch": 2.93773287580448, "grad_norm": 1.2778167724609375, "learning_rate": 9.170883051768264e-06, "loss": 0.0583, "step": 199475 }, { "epoch": 2.938101058894567, "grad_norm": 1.3502825498580933, "learning_rate": 9.16924667980128e-06, "loss": 0.0588, "step": 199500 }, { "epoch": 2.938469241984654, "grad_norm": 1.5013251304626465, "learning_rate": 9.167610307834295e-06, "loss": 0.0586, "step": 199525 }, { "epoch": 2.938837425074741, "grad_norm": 1.59925377368927, "learning_rate": 9.165973935867311e-06, "loss": 0.0609, "step": 199550 }, { "epoch": 2.939205608164828, "grad_norm": 1.480793833732605, "learning_rate": 9.164337563900327e-06, "loss": 0.0593, "step": 199575 }, { "epoch": 2.939573791254915, "grad_norm": 1.4268132448196411, "learning_rate": 9.162701191933342e-06, "loss": 0.0571, "step": 199600 }, { "epoch": 2.939941974345002, "grad_norm": 1.4799308776855469, "learning_rate": 9.161064819966356e-06, "loss": 0.0535, "step": 199625 }, { "epoch": 2.940310157435089, "grad_norm": 1.0943411588668823, "learning_rate": 9.159428447999372e-06, "loss": 0.0608, "step": 199650 }, { "epoch": 2.940678340525176, "grad_norm": 1.4410858154296875, "learning_rate": 9.157792076032388e-06, "loss": 0.0656, "step": 199675 }, { "epoch": 2.941046523615263, "grad_norm": 1.3466460704803467, "learning_rate": 9.156155704065404e-06, "loss": 0.06, "step": 199700 }, { "epoch": 2.94141470670535, "grad_norm": 0.9451752305030823, "learning_rate": 9.154519332098419e-06, "loss": 0.0545, "step": 199725 }, { "epoch": 2.9417828897954372, "grad_norm": 0.7051096558570862, "learning_rate": 9.152882960131433e-06, "loss": 0.0615, "step": 199750 }, { "epoch": 2.9421510728855242, "grad_norm": 1.1519935131072998, "learning_rate": 9.15124658816445e-06, "loss": 0.0599, "step": 199775 }, { "epoch": 2.9425192559756117, "grad_norm": 1.8065868616104126, "learning_rate": 9.149610216197466e-06, "loss": 0.064, "step": 199800 }, { "epoch": 2.9428874390656987, "grad_norm": 1.9942238330841064, "learning_rate": 9.14797384423048e-06, "loss": 0.0599, "step": 199825 }, { "epoch": 2.9432556221557857, "grad_norm": 1.9204283952713013, "learning_rate": 9.146337472263496e-06, "loss": 0.0658, "step": 199850 }, { "epoch": 2.9436238052458727, "grad_norm": 1.3510510921478271, "learning_rate": 9.14470110029651e-06, "loss": 0.0569, "step": 199875 }, { "epoch": 2.9439919883359598, "grad_norm": 1.260505199432373, "learning_rate": 9.143064728329527e-06, "loss": 0.0515, "step": 199900 }, { "epoch": 2.944360171426047, "grad_norm": 1.3015497922897339, "learning_rate": 9.141428356362543e-06, "loss": 0.0483, "step": 199925 }, { "epoch": 2.944728354516134, "grad_norm": 1.0609599351882935, "learning_rate": 9.139791984395557e-06, "loss": 0.0621, "step": 199950 }, { "epoch": 2.945096537606221, "grad_norm": 1.3109073638916016, "learning_rate": 9.138155612428574e-06, "loss": 0.0612, "step": 199975 }, { "epoch": 2.945464720696308, "grad_norm": 1.788352608680725, "learning_rate": 9.136519240461588e-06, "loss": 0.0572, "step": 200000 }, { "epoch": 2.945832903786395, "grad_norm": 1.0755196809768677, "learning_rate": 9.134882868494604e-06, "loss": 0.0545, "step": 200025 }, { "epoch": 2.946201086876482, "grad_norm": 1.3027923107147217, "learning_rate": 9.133246496527619e-06, "loss": 0.0629, "step": 200050 }, { "epoch": 2.946569269966569, "grad_norm": 1.7882213592529297, "learning_rate": 9.131610124560635e-06, "loss": 0.0601, "step": 200075 }, { "epoch": 2.946937453056656, "grad_norm": 1.8756657838821411, "learning_rate": 9.129973752593651e-06, "loss": 0.0638, "step": 200100 }, { "epoch": 2.947305636146743, "grad_norm": 1.380398154258728, "learning_rate": 9.128337380626665e-06, "loss": 0.0557, "step": 200125 }, { "epoch": 2.9476738192368304, "grad_norm": 1.7494428157806396, "learning_rate": 9.126701008659682e-06, "loss": 0.0545, "step": 200150 }, { "epoch": 2.9480420023269174, "grad_norm": 1.886902928352356, "learning_rate": 9.125064636692696e-06, "loss": 0.0556, "step": 200175 }, { "epoch": 2.9484101854170044, "grad_norm": 0.9188262224197388, "learning_rate": 9.123428264725712e-06, "loss": 0.0586, "step": 200200 }, { "epoch": 2.9487783685070914, "grad_norm": 1.4986294507980347, "learning_rate": 9.121791892758728e-06, "loss": 0.0536, "step": 200225 }, { "epoch": 2.9491465515971784, "grad_norm": 1.2192198038101196, "learning_rate": 9.120155520791743e-06, "loss": 0.0552, "step": 200250 }, { "epoch": 2.9495147346872654, "grad_norm": 1.0087519884109497, "learning_rate": 9.118519148824759e-06, "loss": 0.05, "step": 200275 }, { "epoch": 2.9498829177773525, "grad_norm": 1.7876970767974854, "learning_rate": 9.116882776857773e-06, "loss": 0.0607, "step": 200300 }, { "epoch": 2.9502511008674395, "grad_norm": 0.9782829880714417, "learning_rate": 9.11524640489079e-06, "loss": 0.053, "step": 200325 }, { "epoch": 2.9506192839575265, "grad_norm": 1.240944743156433, "learning_rate": 9.113610032923804e-06, "loss": 0.0592, "step": 200350 }, { "epoch": 2.9509874670476135, "grad_norm": 1.4407756328582764, "learning_rate": 9.11197366095682e-06, "loss": 0.0614, "step": 200375 }, { "epoch": 2.9513556501377005, "grad_norm": 2.1824145317077637, "learning_rate": 9.110337288989836e-06, "loss": 0.0605, "step": 200400 }, { "epoch": 2.9517238332277875, "grad_norm": 1.4299499988555908, "learning_rate": 9.10870091702285e-06, "loss": 0.0575, "step": 200425 }, { "epoch": 2.9520920163178745, "grad_norm": 1.2828664779663086, "learning_rate": 9.107064545055867e-06, "loss": 0.0557, "step": 200450 }, { "epoch": 2.9524601994079616, "grad_norm": 1.1413097381591797, "learning_rate": 9.105428173088881e-06, "loss": 0.0514, "step": 200475 }, { "epoch": 2.9528283824980486, "grad_norm": 2.0417468547821045, "learning_rate": 9.103791801121898e-06, "loss": 0.0568, "step": 200500 }, { "epoch": 2.9531965655881356, "grad_norm": 1.0465189218521118, "learning_rate": 9.102155429154914e-06, "loss": 0.0556, "step": 200525 }, { "epoch": 2.9535647486782226, "grad_norm": 1.49721360206604, "learning_rate": 9.100519057187928e-06, "loss": 0.0662, "step": 200550 }, { "epoch": 2.9539329317683096, "grad_norm": 1.5440685749053955, "learning_rate": 9.098882685220943e-06, "loss": 0.0557, "step": 200575 }, { "epoch": 2.9543011148583966, "grad_norm": 1.5729954242706299, "learning_rate": 9.097246313253959e-06, "loss": 0.0699, "step": 200600 }, { "epoch": 2.9546692979484837, "grad_norm": 1.790576457977295, "learning_rate": 9.095609941286975e-06, "loss": 0.062, "step": 200625 }, { "epoch": 2.9550374810385707, "grad_norm": 0.7784260511398315, "learning_rate": 9.093973569319991e-06, "loss": 0.0564, "step": 200650 }, { "epoch": 2.9554056641286577, "grad_norm": 1.3388988971710205, "learning_rate": 9.092337197353006e-06, "loss": 0.0562, "step": 200675 }, { "epoch": 2.9557738472187447, "grad_norm": 1.1975830793380737, "learning_rate": 9.09070082538602e-06, "loss": 0.0563, "step": 200700 }, { "epoch": 2.9561420303088317, "grad_norm": 1.378483772277832, "learning_rate": 9.089064453419036e-06, "loss": 0.0588, "step": 200725 }, { "epoch": 2.956510213398919, "grad_norm": 1.3845373392105103, "learning_rate": 9.087428081452052e-06, "loss": 0.0582, "step": 200750 }, { "epoch": 2.956878396489006, "grad_norm": 1.903778314590454, "learning_rate": 9.085791709485067e-06, "loss": 0.0548, "step": 200775 }, { "epoch": 2.957246579579093, "grad_norm": 1.680147647857666, "learning_rate": 9.084155337518083e-06, "loss": 0.0606, "step": 200800 }, { "epoch": 2.95761476266918, "grad_norm": 1.376102328300476, "learning_rate": 9.082518965551097e-06, "loss": 0.0615, "step": 200825 }, { "epoch": 2.9579829457592672, "grad_norm": 1.7232086658477783, "learning_rate": 9.080882593584114e-06, "loss": 0.0573, "step": 200850 }, { "epoch": 2.9583511288493543, "grad_norm": 1.1853758096694946, "learning_rate": 9.079246221617128e-06, "loss": 0.0613, "step": 200875 }, { "epoch": 2.9587193119394413, "grad_norm": 1.0940239429473877, "learning_rate": 9.077609849650144e-06, "loss": 0.0551, "step": 200900 }, { "epoch": 2.9590874950295283, "grad_norm": 1.3854516744613647, "learning_rate": 9.07597347768316e-06, "loss": 0.0622, "step": 200925 }, { "epoch": 2.9594556781196153, "grad_norm": 0.9698584675788879, "learning_rate": 9.074337105716175e-06, "loss": 0.054, "step": 200950 }, { "epoch": 2.9598238612097023, "grad_norm": 1.4902364015579224, "learning_rate": 9.072700733749191e-06, "loss": 0.0583, "step": 200975 }, { "epoch": 2.9601920442997893, "grad_norm": 1.326014518737793, "learning_rate": 9.071064361782205e-06, "loss": 0.0629, "step": 201000 }, { "epoch": 2.9605602273898763, "grad_norm": 1.4434059858322144, "learning_rate": 9.069427989815221e-06, "loss": 0.057, "step": 201025 }, { "epoch": 2.9609284104799634, "grad_norm": 1.6042250394821167, "learning_rate": 9.067791617848238e-06, "loss": 0.0554, "step": 201050 }, { "epoch": 2.9612965935700504, "grad_norm": 1.7414075136184692, "learning_rate": 9.066155245881254e-06, "loss": 0.0574, "step": 201075 }, { "epoch": 2.9616647766601374, "grad_norm": 1.4316617250442505, "learning_rate": 9.064518873914268e-06, "loss": 0.0581, "step": 201100 }, { "epoch": 2.962032959750225, "grad_norm": 1.2498750686645508, "learning_rate": 9.062882501947283e-06, "loss": 0.0556, "step": 201125 }, { "epoch": 2.962401142840312, "grad_norm": 1.5134295225143433, "learning_rate": 9.061246129980299e-06, "loss": 0.0615, "step": 201150 }, { "epoch": 2.962769325930399, "grad_norm": 1.3428409099578857, "learning_rate": 9.059609758013315e-06, "loss": 0.0595, "step": 201175 }, { "epoch": 2.963137509020486, "grad_norm": 1.099374771118164, "learning_rate": 9.05797338604633e-06, "loss": 0.062, "step": 201200 }, { "epoch": 2.963505692110573, "grad_norm": 1.5205073356628418, "learning_rate": 9.056337014079346e-06, "loss": 0.0608, "step": 201225 }, { "epoch": 2.96387387520066, "grad_norm": 0.7118973731994629, "learning_rate": 9.05470064211236e-06, "loss": 0.0672, "step": 201250 }, { "epoch": 2.964242058290747, "grad_norm": 1.3384051322937012, "learning_rate": 9.053064270145376e-06, "loss": 0.0537, "step": 201275 }, { "epoch": 2.964610241380834, "grad_norm": 1.144583821296692, "learning_rate": 9.05142789817839e-06, "loss": 0.064, "step": 201300 }, { "epoch": 2.964978424470921, "grad_norm": 1.328416347503662, "learning_rate": 9.049791526211407e-06, "loss": 0.056, "step": 201325 }, { "epoch": 2.965346607561008, "grad_norm": 1.483860969543457, "learning_rate": 9.048155154244423e-06, "loss": 0.0551, "step": 201350 }, { "epoch": 2.965714790651095, "grad_norm": 1.4993629455566406, "learning_rate": 9.046518782277437e-06, "loss": 0.0649, "step": 201375 }, { "epoch": 2.966082973741182, "grad_norm": 1.240976095199585, "learning_rate": 9.044882410310454e-06, "loss": 0.0623, "step": 201400 }, { "epoch": 2.966451156831269, "grad_norm": 0.7770968079566956, "learning_rate": 9.043246038343468e-06, "loss": 0.0542, "step": 201425 }, { "epoch": 2.966819339921356, "grad_norm": 0.8711233735084534, "learning_rate": 9.041609666376484e-06, "loss": 0.0562, "step": 201450 }, { "epoch": 2.967187523011443, "grad_norm": 1.3974469900131226, "learning_rate": 9.0399732944095e-06, "loss": 0.0617, "step": 201475 }, { "epoch": 2.96755570610153, "grad_norm": 1.3322957754135132, "learning_rate": 9.038336922442515e-06, "loss": 0.0611, "step": 201500 }, { "epoch": 2.967923889191617, "grad_norm": 1.739691972732544, "learning_rate": 9.03670055047553e-06, "loss": 0.0596, "step": 201525 }, { "epoch": 2.968292072281704, "grad_norm": 1.4545892477035522, "learning_rate": 9.035129633387225e-06, "loss": 0.0627, "step": 201550 }, { "epoch": 2.968660255371791, "grad_norm": 1.595164179801941, "learning_rate": 9.033493261420241e-06, "loss": 0.0578, "step": 201575 }, { "epoch": 2.969028438461878, "grad_norm": 1.5550832748413086, "learning_rate": 9.031856889453257e-06, "loss": 0.0572, "step": 201600 }, { "epoch": 2.969396621551965, "grad_norm": 0.9695870876312256, "learning_rate": 9.030220517486272e-06, "loss": 0.0557, "step": 201625 }, { "epoch": 2.969764804642052, "grad_norm": 1.5909755229949951, "learning_rate": 9.028584145519286e-06, "loss": 0.0558, "step": 201650 }, { "epoch": 2.970132987732139, "grad_norm": 1.476220965385437, "learning_rate": 9.026947773552302e-06, "loss": 0.0612, "step": 201675 }, { "epoch": 2.970501170822226, "grad_norm": 1.614236831665039, "learning_rate": 9.025311401585318e-06, "loss": 0.0528, "step": 201700 }, { "epoch": 2.9708693539123137, "grad_norm": 1.523722529411316, "learning_rate": 9.023675029618333e-06, "loss": 0.054, "step": 201725 }, { "epoch": 2.9712375370024007, "grad_norm": 0.7770445942878723, "learning_rate": 9.022038657651349e-06, "loss": 0.0628, "step": 201750 }, { "epoch": 2.9716057200924877, "grad_norm": 1.3149023056030273, "learning_rate": 9.020402285684363e-06, "loss": 0.0623, "step": 201775 }, { "epoch": 2.9719739031825747, "grad_norm": 1.6708271503448486, "learning_rate": 9.01876591371738e-06, "loss": 0.0618, "step": 201800 }, { "epoch": 2.9723420862726617, "grad_norm": 1.0455543994903564, "learning_rate": 9.017129541750394e-06, "loss": 0.0539, "step": 201825 }, { "epoch": 2.9727102693627487, "grad_norm": 1.2296518087387085, "learning_rate": 9.01549316978341e-06, "loss": 0.0632, "step": 201850 }, { "epoch": 2.9730784524528358, "grad_norm": 1.0646535158157349, "learning_rate": 9.013856797816426e-06, "loss": 0.0502, "step": 201875 }, { "epoch": 2.9734466355429228, "grad_norm": 1.8293626308441162, "learning_rate": 9.012220425849443e-06, "loss": 0.0555, "step": 201900 }, { "epoch": 2.97381481863301, "grad_norm": 1.4057419300079346, "learning_rate": 9.010584053882457e-06, "loss": 0.0578, "step": 201925 }, { "epoch": 2.974183001723097, "grad_norm": 1.6182752847671509, "learning_rate": 9.008947681915471e-06, "loss": 0.0578, "step": 201950 }, { "epoch": 2.974551184813184, "grad_norm": 1.1909171342849731, "learning_rate": 9.007311309948488e-06, "loss": 0.0606, "step": 201975 }, { "epoch": 2.974919367903271, "grad_norm": 1.3816593885421753, "learning_rate": 9.005674937981504e-06, "loss": 0.0545, "step": 202000 }, { "epoch": 2.975287550993358, "grad_norm": 1.3536043167114258, "learning_rate": 9.00403856601452e-06, "loss": 0.0511, "step": 202025 }, { "epoch": 2.975655734083445, "grad_norm": 1.563293218612671, "learning_rate": 9.002402194047534e-06, "loss": 0.0567, "step": 202050 }, { "epoch": 2.9760239171735323, "grad_norm": 1.500271201133728, "learning_rate": 9.000765822080549e-06, "loss": 0.0553, "step": 202075 }, { "epoch": 2.9763921002636193, "grad_norm": 1.253664493560791, "learning_rate": 8.999129450113565e-06, "loss": 0.0568, "step": 202100 }, { "epoch": 2.9767602833537063, "grad_norm": 1.2963294982910156, "learning_rate": 8.997493078146581e-06, "loss": 0.0566, "step": 202125 }, { "epoch": 2.9771284664437934, "grad_norm": 0.7844330072402954, "learning_rate": 8.995856706179596e-06, "loss": 0.0574, "step": 202150 }, { "epoch": 2.9774966495338804, "grad_norm": 1.3363014459609985, "learning_rate": 8.994220334212612e-06, "loss": 0.0627, "step": 202175 }, { "epoch": 2.9778648326239674, "grad_norm": 0.9135760068893433, "learning_rate": 8.992583962245626e-06, "loss": 0.0576, "step": 202200 }, { "epoch": 2.9782330157140544, "grad_norm": 0.970763623714447, "learning_rate": 8.990947590278642e-06, "loss": 0.0569, "step": 202225 }, { "epoch": 2.9786011988041414, "grad_norm": 1.6973836421966553, "learning_rate": 8.989311218311657e-06, "loss": 0.0597, "step": 202250 }, { "epoch": 2.9789693818942284, "grad_norm": 1.1714738607406616, "learning_rate": 8.987674846344673e-06, "loss": 0.055, "step": 202275 }, { "epoch": 2.9793375649843155, "grad_norm": 0.9725155830383301, "learning_rate": 8.986038474377689e-06, "loss": 0.0542, "step": 202300 }, { "epoch": 2.9797057480744025, "grad_norm": 1.2944258451461792, "learning_rate": 8.984402102410704e-06, "loss": 0.0546, "step": 202325 }, { "epoch": 2.9800739311644895, "grad_norm": 1.6862541437149048, "learning_rate": 8.98276573044372e-06, "loss": 0.0538, "step": 202350 }, { "epoch": 2.9804421142545765, "grad_norm": 0.9478617906570435, "learning_rate": 8.981129358476734e-06, "loss": 0.0555, "step": 202375 }, { "epoch": 2.9808102973446635, "grad_norm": 1.4293984174728394, "learning_rate": 8.97949298650975e-06, "loss": 0.0619, "step": 202400 }, { "epoch": 2.9811784804347505, "grad_norm": 1.564691185951233, "learning_rate": 8.977856614542766e-06, "loss": 0.0564, "step": 202425 }, { "epoch": 2.9815466635248375, "grad_norm": 1.2600538730621338, "learning_rate": 8.976220242575781e-06, "loss": 0.0525, "step": 202450 }, { "epoch": 2.9819148466149246, "grad_norm": 0.8747884035110474, "learning_rate": 8.974583870608797e-06, "loss": 0.0524, "step": 202475 }, { "epoch": 2.9822830297050116, "grad_norm": 1.313508152961731, "learning_rate": 8.972947498641812e-06, "loss": 0.0638, "step": 202500 }, { "epoch": 2.9826512127950986, "grad_norm": 1.1806327104568481, "learning_rate": 8.971311126674828e-06, "loss": 0.0597, "step": 202525 }, { "epoch": 2.9830193958851856, "grad_norm": 1.6251575946807861, "learning_rate": 8.969674754707844e-06, "loss": 0.0625, "step": 202550 }, { "epoch": 2.9833875789752726, "grad_norm": 1.1241422891616821, "learning_rate": 8.968038382740858e-06, "loss": 0.0569, "step": 202575 }, { "epoch": 2.9837557620653596, "grad_norm": 1.5648202896118164, "learning_rate": 8.966402010773874e-06, "loss": 0.0571, "step": 202600 }, { "epoch": 2.9841239451554467, "grad_norm": 1.427259922027588, "learning_rate": 8.964765638806889e-06, "loss": 0.0574, "step": 202625 }, { "epoch": 2.9844921282455337, "grad_norm": 1.5021920204162598, "learning_rate": 8.963129266839905e-06, "loss": 0.0554, "step": 202650 }, { "epoch": 2.984860311335621, "grad_norm": 1.5881065130233765, "learning_rate": 8.96149289487292e-06, "loss": 0.0637, "step": 202675 }, { "epoch": 2.985228494425708, "grad_norm": 1.0749396085739136, "learning_rate": 8.959856522905936e-06, "loss": 0.0513, "step": 202700 }, { "epoch": 2.985596677515795, "grad_norm": 1.4305154085159302, "learning_rate": 8.958220150938952e-06, "loss": 0.061, "step": 202725 }, { "epoch": 2.985964860605882, "grad_norm": 1.6493737697601318, "learning_rate": 8.956583778971966e-06, "loss": 0.054, "step": 202750 }, { "epoch": 2.986333043695969, "grad_norm": 1.2181254625320435, "learning_rate": 8.95494740700498e-06, "loss": 0.0627, "step": 202775 }, { "epoch": 2.986701226786056, "grad_norm": 1.3569931983947754, "learning_rate": 8.953311035037997e-06, "loss": 0.0571, "step": 202800 }, { "epoch": 2.9870694098761432, "grad_norm": 1.7755765914916992, "learning_rate": 8.951674663071013e-06, "loss": 0.0591, "step": 202825 }, { "epoch": 2.9874375929662302, "grad_norm": 1.5601924657821655, "learning_rate": 8.950038291104029e-06, "loss": 0.0562, "step": 202850 }, { "epoch": 2.9878057760563173, "grad_norm": 1.135627269744873, "learning_rate": 8.948401919137044e-06, "loss": 0.068, "step": 202875 }, { "epoch": 2.9881739591464043, "grad_norm": 1.6177241802215576, "learning_rate": 8.946765547170058e-06, "loss": 0.0546, "step": 202900 }, { "epoch": 2.9885421422364913, "grad_norm": 1.8751848936080933, "learning_rate": 8.945129175203074e-06, "loss": 0.0577, "step": 202925 }, { "epoch": 2.9889103253265783, "grad_norm": 1.1617170572280884, "learning_rate": 8.94349280323609e-06, "loss": 0.0544, "step": 202950 }, { "epoch": 2.9892785084166653, "grad_norm": 1.0930885076522827, "learning_rate": 8.941856431269107e-06, "loss": 0.0616, "step": 202975 }, { "epoch": 2.9896466915067523, "grad_norm": 1.3557119369506836, "learning_rate": 8.940220059302121e-06, "loss": 0.0615, "step": 203000 }, { "epoch": 2.99001487459684, "grad_norm": 1.8400906324386597, "learning_rate": 8.938583687335135e-06, "loss": 0.0619, "step": 203025 }, { "epoch": 2.990383057686927, "grad_norm": 1.0624231100082397, "learning_rate": 8.936947315368152e-06, "loss": 0.0588, "step": 203050 }, { "epoch": 2.990751240777014, "grad_norm": 1.1801598072052002, "learning_rate": 8.935310943401168e-06, "loss": 0.0612, "step": 203075 }, { "epoch": 2.991119423867101, "grad_norm": 1.715049386024475, "learning_rate": 8.933674571434182e-06, "loss": 0.061, "step": 203100 }, { "epoch": 2.991487606957188, "grad_norm": 1.2938154935836792, "learning_rate": 8.932038199467198e-06, "loss": 0.051, "step": 203125 }, { "epoch": 2.991855790047275, "grad_norm": 1.0665907859802246, "learning_rate": 8.930401827500213e-06, "loss": 0.0647, "step": 203150 }, { "epoch": 2.992223973137362, "grad_norm": 1.6673771142959595, "learning_rate": 8.928765455533229e-06, "loss": 0.0506, "step": 203175 }, { "epoch": 2.992592156227449, "grad_norm": 1.6066478490829468, "learning_rate": 8.927129083566243e-06, "loss": 0.0584, "step": 203200 }, { "epoch": 2.992960339317536, "grad_norm": 1.2847256660461426, "learning_rate": 8.92549271159926e-06, "loss": 0.0567, "step": 203225 }, { "epoch": 2.993328522407623, "grad_norm": 1.3385009765625, "learning_rate": 8.923856339632276e-06, "loss": 0.0508, "step": 203250 }, { "epoch": 2.99369670549771, "grad_norm": 1.4735643863677979, "learning_rate": 8.92221996766529e-06, "loss": 0.0552, "step": 203275 }, { "epoch": 2.994064888587797, "grad_norm": 1.4854005575180054, "learning_rate": 8.920583595698306e-06, "loss": 0.062, "step": 203300 }, { "epoch": 2.994433071677884, "grad_norm": 2.120142698287964, "learning_rate": 8.91894722373132e-06, "loss": 0.0566, "step": 203325 }, { "epoch": 2.994801254767971, "grad_norm": 1.3660907745361328, "learning_rate": 8.917310851764337e-06, "loss": 0.0588, "step": 203350 }, { "epoch": 2.995169437858058, "grad_norm": 0.9961246848106384, "learning_rate": 8.915674479797353e-06, "loss": 0.0563, "step": 203375 }, { "epoch": 2.995537620948145, "grad_norm": 1.458559513092041, "learning_rate": 8.914038107830368e-06, "loss": 0.0606, "step": 203400 }, { "epoch": 2.995905804038232, "grad_norm": 1.472598671913147, "learning_rate": 8.912401735863384e-06, "loss": 0.063, "step": 203425 }, { "epoch": 2.996273987128319, "grad_norm": 1.3394176959991455, "learning_rate": 8.910765363896398e-06, "loss": 0.0547, "step": 203450 }, { "epoch": 2.996642170218406, "grad_norm": 1.1453043222427368, "learning_rate": 8.909128991929414e-06, "loss": 0.0596, "step": 203475 }, { "epoch": 2.997010353308493, "grad_norm": 1.615200400352478, "learning_rate": 8.90749261996243e-06, "loss": 0.0607, "step": 203500 }, { "epoch": 2.99737853639858, "grad_norm": 1.1500487327575684, "learning_rate": 8.905856247995445e-06, "loss": 0.0564, "step": 203525 }, { "epoch": 2.997746719488667, "grad_norm": 1.5393701791763306, "learning_rate": 8.904219876028461e-06, "loss": 0.0578, "step": 203550 }, { "epoch": 2.998114902578754, "grad_norm": 0.8329442143440247, "learning_rate": 8.902583504061476e-06, "loss": 0.063, "step": 203575 }, { "epoch": 2.998483085668841, "grad_norm": 1.5081946849822998, "learning_rate": 8.900947132094492e-06, "loss": 0.0597, "step": 203600 }, { "epoch": 2.9988512687589286, "grad_norm": 1.3962911367416382, "learning_rate": 8.899310760127506e-06, "loss": 0.0704, "step": 203625 }, { "epoch": 2.9992194518490156, "grad_norm": 1.4709864854812622, "learning_rate": 8.897674388160522e-06, "loss": 0.0514, "step": 203650 }, { "epoch": 2.9995876349391026, "grad_norm": 1.0525881052017212, "learning_rate": 8.896038016193538e-06, "loss": 0.0569, "step": 203675 }, { "epoch": 2.9999558180291896, "grad_norm": 1.207714557647705, "learning_rate": 8.894401644226553e-06, "loss": 0.0535, "step": 203700 }, { "epoch": 3.0, "eval_loss": 0.06028754264116287, "eval_runtime": 112.9082, "eval_samples_per_second": 3141.916, "eval_steps_per_second": 6.138, "step": 203703 }, { "epoch": 3.0003240011192767, "grad_norm": 1.2547531127929688, "learning_rate": 8.892765272259567e-06, "loss": 0.0586, "step": 203725 }, { "epoch": 3.0006921842093637, "grad_norm": 0.9733414053916931, "learning_rate": 8.891128900292583e-06, "loss": 0.0563, "step": 203750 }, { "epoch": 3.0010603672994507, "grad_norm": 1.3888561725616455, "learning_rate": 8.8894925283256e-06, "loss": 0.0544, "step": 203775 }, { "epoch": 3.0014285503895377, "grad_norm": 1.5861791372299194, "learning_rate": 8.887856156358616e-06, "loss": 0.0526, "step": 203800 }, { "epoch": 3.0017967334796247, "grad_norm": 1.382179856300354, "learning_rate": 8.88621978439163e-06, "loss": 0.059, "step": 203825 }, { "epoch": 3.0021649165697117, "grad_norm": 1.2970032691955566, "learning_rate": 8.884583412424645e-06, "loss": 0.0511, "step": 203850 }, { "epoch": 3.0025330996597988, "grad_norm": 1.096897840499878, "learning_rate": 8.882947040457661e-06, "loss": 0.0616, "step": 203875 }, { "epoch": 3.0029012827498858, "grad_norm": 1.4717472791671753, "learning_rate": 8.881310668490677e-06, "loss": 0.0544, "step": 203900 }, { "epoch": 3.003269465839973, "grad_norm": 1.314205527305603, "learning_rate": 8.879674296523693e-06, "loss": 0.0562, "step": 203925 }, { "epoch": 3.00363764893006, "grad_norm": 1.1430346965789795, "learning_rate": 8.878037924556708e-06, "loss": 0.0515, "step": 203950 }, { "epoch": 3.004005832020147, "grad_norm": 1.457344651222229, "learning_rate": 8.876401552589722e-06, "loss": 0.0498, "step": 203975 }, { "epoch": 3.004374015110234, "grad_norm": 1.669184684753418, "learning_rate": 8.874765180622738e-06, "loss": 0.0593, "step": 204000 }, { "epoch": 3.004742198200321, "grad_norm": 1.449445128440857, "learning_rate": 8.873128808655754e-06, "loss": 0.0548, "step": 204025 }, { "epoch": 3.0051103812904083, "grad_norm": 1.431996464729309, "learning_rate": 8.871492436688769e-06, "loss": 0.0537, "step": 204050 }, { "epoch": 3.0054785643804953, "grad_norm": 0.9228318333625793, "learning_rate": 8.869856064721785e-06, "loss": 0.0592, "step": 204075 }, { "epoch": 3.0058467474705823, "grad_norm": 1.6553899049758911, "learning_rate": 8.868219692754801e-06, "loss": 0.0557, "step": 204100 }, { "epoch": 3.0062149305606694, "grad_norm": 1.8629385232925415, "learning_rate": 8.866583320787816e-06, "loss": 0.0577, "step": 204125 }, { "epoch": 3.0065831136507564, "grad_norm": 1.2063720226287842, "learning_rate": 8.86494694882083e-06, "loss": 0.0594, "step": 204150 }, { "epoch": 3.0069512967408434, "grad_norm": 0.875553548336029, "learning_rate": 8.863310576853846e-06, "loss": 0.0564, "step": 204175 }, { "epoch": 3.0073194798309304, "grad_norm": 1.0950829982757568, "learning_rate": 8.861674204886862e-06, "loss": 0.0533, "step": 204200 }, { "epoch": 3.0076876629210174, "grad_norm": 1.1200106143951416, "learning_rate": 8.860037832919879e-06, "loss": 0.0531, "step": 204225 }, { "epoch": 3.0080558460111044, "grad_norm": 0.9940056204795837, "learning_rate": 8.858401460952893e-06, "loss": 0.0574, "step": 204250 }, { "epoch": 3.0084240291011914, "grad_norm": 1.985337257385254, "learning_rate": 8.856765088985907e-06, "loss": 0.0631, "step": 204275 }, { "epoch": 3.0087922121912785, "grad_norm": 1.3617161512374878, "learning_rate": 8.855128717018924e-06, "loss": 0.0542, "step": 204300 }, { "epoch": 3.0091603952813655, "grad_norm": 2.3319380283355713, "learning_rate": 8.85349234505194e-06, "loss": 0.0554, "step": 204325 }, { "epoch": 3.0095285783714525, "grad_norm": 1.4009677171707153, "learning_rate": 8.851855973084956e-06, "loss": 0.0568, "step": 204350 }, { "epoch": 3.0098967614615395, "grad_norm": 0.9551072716712952, "learning_rate": 8.85021960111797e-06, "loss": 0.0503, "step": 204375 }, { "epoch": 3.0102649445516265, "grad_norm": 1.7699204683303833, "learning_rate": 8.848583229150985e-06, "loss": 0.0598, "step": 204400 }, { "epoch": 3.0106331276417135, "grad_norm": 1.314049482345581, "learning_rate": 8.846946857184001e-06, "loss": 0.0571, "step": 204425 }, { "epoch": 3.0110013107318006, "grad_norm": 1.1528035402297974, "learning_rate": 8.845310485217017e-06, "loss": 0.0577, "step": 204450 }, { "epoch": 3.0113694938218876, "grad_norm": 1.3904610872268677, "learning_rate": 8.843674113250032e-06, "loss": 0.0514, "step": 204475 }, { "epoch": 3.0117376769119746, "grad_norm": 1.1227728128433228, "learning_rate": 8.842037741283048e-06, "loss": 0.0568, "step": 204500 }, { "epoch": 3.012105860002062, "grad_norm": 1.5878812074661255, "learning_rate": 8.840401369316062e-06, "loss": 0.0521, "step": 204525 }, { "epoch": 3.012474043092149, "grad_norm": 1.4791150093078613, "learning_rate": 8.838764997349078e-06, "loss": 0.0559, "step": 204550 }, { "epoch": 3.012842226182236, "grad_norm": 1.7063350677490234, "learning_rate": 8.837128625382093e-06, "loss": 0.0521, "step": 204575 }, { "epoch": 3.013210409272323, "grad_norm": 1.2506731748580933, "learning_rate": 8.835492253415109e-06, "loss": 0.0524, "step": 204600 }, { "epoch": 3.01357859236241, "grad_norm": 1.1786755323410034, "learning_rate": 8.833855881448125e-06, "loss": 0.054, "step": 204625 }, { "epoch": 3.013946775452497, "grad_norm": 1.3485809564590454, "learning_rate": 8.83221950948114e-06, "loss": 0.0499, "step": 204650 }, { "epoch": 3.014314958542584, "grad_norm": 1.3246203660964966, "learning_rate": 8.830583137514156e-06, "loss": 0.0598, "step": 204675 }, { "epoch": 3.014683141632671, "grad_norm": 0.915708065032959, "learning_rate": 8.82894676554717e-06, "loss": 0.0589, "step": 204700 }, { "epoch": 3.015051324722758, "grad_norm": 1.1194193363189697, "learning_rate": 8.827310393580186e-06, "loss": 0.0566, "step": 204725 }, { "epoch": 3.015419507812845, "grad_norm": 0.7385013699531555, "learning_rate": 8.825674021613202e-06, "loss": 0.0546, "step": 204750 }, { "epoch": 3.015787690902932, "grad_norm": 1.127184271812439, "learning_rate": 8.824037649646217e-06, "loss": 0.0537, "step": 204775 }, { "epoch": 3.016155873993019, "grad_norm": 1.0766645669937134, "learning_rate": 8.822401277679233e-06, "loss": 0.0615, "step": 204800 }, { "epoch": 3.0165240570831062, "grad_norm": 1.3340160846710205, "learning_rate": 8.820764905712248e-06, "loss": 0.056, "step": 204825 }, { "epoch": 3.0168922401731932, "grad_norm": 1.3009932041168213, "learning_rate": 8.819128533745264e-06, "loss": 0.0511, "step": 204850 }, { "epoch": 3.0172604232632803, "grad_norm": 1.339657187461853, "learning_rate": 8.81749216177828e-06, "loss": 0.0588, "step": 204875 }, { "epoch": 3.0176286063533673, "grad_norm": 1.4364664554595947, "learning_rate": 8.815855789811294e-06, "loss": 0.0541, "step": 204900 }, { "epoch": 3.0179967894434543, "grad_norm": 1.4257307052612305, "learning_rate": 8.81421941784431e-06, "loss": 0.0583, "step": 204925 }, { "epoch": 3.0183649725335413, "grad_norm": 1.0394444465637207, "learning_rate": 8.812583045877325e-06, "loss": 0.0503, "step": 204950 }, { "epoch": 3.0187331556236283, "grad_norm": 1.498558759689331, "learning_rate": 8.810946673910341e-06, "loss": 0.0567, "step": 204975 }, { "epoch": 3.0191013387137158, "grad_norm": 1.1389732360839844, "learning_rate": 8.809310301943355e-06, "loss": 0.0566, "step": 205000 }, { "epoch": 3.019469521803803, "grad_norm": 1.5723432302474976, "learning_rate": 8.807673929976372e-06, "loss": 0.0536, "step": 205025 }, { "epoch": 3.01983770489389, "grad_norm": 1.2834051847457886, "learning_rate": 8.806037558009388e-06, "loss": 0.0591, "step": 205050 }, { "epoch": 3.020205887983977, "grad_norm": 1.2777409553527832, "learning_rate": 8.804401186042402e-06, "loss": 0.0575, "step": 205075 }, { "epoch": 3.020574071074064, "grad_norm": 1.115095853805542, "learning_rate": 8.802764814075417e-06, "loss": 0.0553, "step": 205100 }, { "epoch": 3.020942254164151, "grad_norm": 1.5836453437805176, "learning_rate": 8.801128442108433e-06, "loss": 0.0568, "step": 205125 }, { "epoch": 3.021310437254238, "grad_norm": 1.372407078742981, "learning_rate": 8.799492070141449e-06, "loss": 0.0552, "step": 205150 }, { "epoch": 3.021678620344325, "grad_norm": 1.5378984212875366, "learning_rate": 8.797855698174465e-06, "loss": 0.0615, "step": 205175 }, { "epoch": 3.022046803434412, "grad_norm": 1.590171217918396, "learning_rate": 8.79621932620748e-06, "loss": 0.0553, "step": 205200 }, { "epoch": 3.022414986524499, "grad_norm": 1.4625080823898315, "learning_rate": 8.794582954240494e-06, "loss": 0.0505, "step": 205225 }, { "epoch": 3.022783169614586, "grad_norm": 1.2913883924484253, "learning_rate": 8.79294658227351e-06, "loss": 0.0584, "step": 205250 }, { "epoch": 3.023151352704673, "grad_norm": 1.7851440906524658, "learning_rate": 8.791310210306526e-06, "loss": 0.0608, "step": 205275 }, { "epoch": 3.02351953579476, "grad_norm": 1.3867429494857788, "learning_rate": 8.789673838339543e-06, "loss": 0.0579, "step": 205300 }, { "epoch": 3.023887718884847, "grad_norm": 1.428532600402832, "learning_rate": 8.788037466372557e-06, "loss": 0.0502, "step": 205325 }, { "epoch": 3.024255901974934, "grad_norm": 1.3660624027252197, "learning_rate": 8.786401094405571e-06, "loss": 0.0576, "step": 205350 }, { "epoch": 3.024624085065021, "grad_norm": 1.6445670127868652, "learning_rate": 8.784764722438588e-06, "loss": 0.0601, "step": 205375 }, { "epoch": 3.024992268155108, "grad_norm": 1.4324208498001099, "learning_rate": 8.783128350471604e-06, "loss": 0.0666, "step": 205400 }, { "epoch": 3.025360451245195, "grad_norm": 0.9703266620635986, "learning_rate": 8.781491978504618e-06, "loss": 0.0522, "step": 205425 }, { "epoch": 3.025728634335282, "grad_norm": 1.2462025880813599, "learning_rate": 8.779855606537634e-06, "loss": 0.0556, "step": 205450 }, { "epoch": 3.026096817425369, "grad_norm": 0.980064332485199, "learning_rate": 8.778219234570649e-06, "loss": 0.0499, "step": 205475 }, { "epoch": 3.0264650005154565, "grad_norm": 1.148585319519043, "learning_rate": 8.776582862603665e-06, "loss": 0.0566, "step": 205500 }, { "epoch": 3.0268331836055435, "grad_norm": 1.5142290592193604, "learning_rate": 8.77494649063668e-06, "loss": 0.0591, "step": 205525 }, { "epoch": 3.0272013666956306, "grad_norm": 1.345026969909668, "learning_rate": 8.773375573548375e-06, "loss": 0.0563, "step": 205550 }, { "epoch": 3.0275695497857176, "grad_norm": 1.6352711915969849, "learning_rate": 8.771739201581391e-06, "loss": 0.0598, "step": 205575 }, { "epoch": 3.0279377328758046, "grad_norm": 1.354448676109314, "learning_rate": 8.770102829614406e-06, "loss": 0.0531, "step": 205600 }, { "epoch": 3.0283059159658916, "grad_norm": 1.3734674453735352, "learning_rate": 8.768466457647422e-06, "loss": 0.0595, "step": 205625 }, { "epoch": 3.0286740990559786, "grad_norm": 1.3386586904525757, "learning_rate": 8.766830085680436e-06, "loss": 0.0573, "step": 205650 }, { "epoch": 3.0290422821460656, "grad_norm": 1.2852375507354736, "learning_rate": 8.765193713713452e-06, "loss": 0.0601, "step": 205675 }, { "epoch": 3.0294104652361526, "grad_norm": 1.6926796436309814, "learning_rate": 8.763557341746469e-06, "loss": 0.0571, "step": 205700 }, { "epoch": 3.0297786483262397, "grad_norm": 1.0144689083099365, "learning_rate": 8.761920969779483e-06, "loss": 0.0509, "step": 205725 }, { "epoch": 3.0301468314163267, "grad_norm": 1.588392734527588, "learning_rate": 8.760284597812499e-06, "loss": 0.0545, "step": 205750 }, { "epoch": 3.0305150145064137, "grad_norm": 1.201865792274475, "learning_rate": 8.758648225845514e-06, "loss": 0.0504, "step": 205775 }, { "epoch": 3.0308831975965007, "grad_norm": 1.4972093105316162, "learning_rate": 8.75701185387853e-06, "loss": 0.0568, "step": 205800 }, { "epoch": 3.0312513806865877, "grad_norm": 1.420656442642212, "learning_rate": 8.755375481911546e-06, "loss": 0.0516, "step": 205825 }, { "epoch": 3.0316195637766747, "grad_norm": 1.4574306011199951, "learning_rate": 8.75373910994456e-06, "loss": 0.0535, "step": 205850 }, { "epoch": 3.0319877468667618, "grad_norm": 1.0388892889022827, "learning_rate": 8.752102737977577e-06, "loss": 0.0557, "step": 205875 }, { "epoch": 3.0323559299568488, "grad_norm": 1.3123277425765991, "learning_rate": 8.750466366010591e-06, "loss": 0.0609, "step": 205900 }, { "epoch": 3.032724113046936, "grad_norm": 1.2200952768325806, "learning_rate": 8.748829994043607e-06, "loss": 0.0484, "step": 205925 }, { "epoch": 3.033092296137023, "grad_norm": 1.0080150365829468, "learning_rate": 8.747193622076622e-06, "loss": 0.0581, "step": 205950 }, { "epoch": 3.0334604792271103, "grad_norm": 1.3287502527236938, "learning_rate": 8.745557250109638e-06, "loss": 0.0568, "step": 205975 }, { "epoch": 3.0338286623171973, "grad_norm": 1.698480248451233, "learning_rate": 8.743920878142654e-06, "loss": 0.0586, "step": 206000 }, { "epoch": 3.0341968454072843, "grad_norm": 1.0519176721572876, "learning_rate": 8.742284506175668e-06, "loss": 0.0511, "step": 206025 }, { "epoch": 3.0345650284973713, "grad_norm": 1.6990594863891602, "learning_rate": 8.740648134208683e-06, "loss": 0.0595, "step": 206050 }, { "epoch": 3.0349332115874583, "grad_norm": 1.2284382581710815, "learning_rate": 8.739011762241699e-06, "loss": 0.0609, "step": 206075 }, { "epoch": 3.0353013946775453, "grad_norm": 1.9676870107650757, "learning_rate": 8.737375390274715e-06, "loss": 0.059, "step": 206100 }, { "epoch": 3.0356695777676324, "grad_norm": 1.1042401790618896, "learning_rate": 8.735739018307731e-06, "loss": 0.054, "step": 206125 }, { "epoch": 3.0360377608577194, "grad_norm": 1.369608759880066, "learning_rate": 8.734102646340746e-06, "loss": 0.0533, "step": 206150 }, { "epoch": 3.0364059439478064, "grad_norm": 1.3927401304244995, "learning_rate": 8.73246627437376e-06, "loss": 0.0541, "step": 206175 }, { "epoch": 3.0367741270378934, "grad_norm": 1.536010980606079, "learning_rate": 8.730829902406776e-06, "loss": 0.0533, "step": 206200 }, { "epoch": 3.0371423101279804, "grad_norm": 1.2623169422149658, "learning_rate": 8.729193530439792e-06, "loss": 0.0581, "step": 206225 }, { "epoch": 3.0375104932180674, "grad_norm": 1.2068862915039062, "learning_rate": 8.727557158472807e-06, "loss": 0.0572, "step": 206250 }, { "epoch": 3.0378786763081544, "grad_norm": 1.3528281450271606, "learning_rate": 8.725920786505823e-06, "loss": 0.0546, "step": 206275 }, { "epoch": 3.0382468593982415, "grad_norm": 1.5392383337020874, "learning_rate": 8.724284414538838e-06, "loss": 0.0603, "step": 206300 }, { "epoch": 3.0386150424883285, "grad_norm": 1.4895553588867188, "learning_rate": 8.722648042571854e-06, "loss": 0.0505, "step": 206325 }, { "epoch": 3.0389832255784155, "grad_norm": 1.4076825380325317, "learning_rate": 8.72101167060487e-06, "loss": 0.0524, "step": 206350 }, { "epoch": 3.0393514086685025, "grad_norm": 1.4680297374725342, "learning_rate": 8.719375298637884e-06, "loss": 0.0511, "step": 206375 }, { "epoch": 3.0397195917585895, "grad_norm": 1.3740334510803223, "learning_rate": 8.7177389266709e-06, "loss": 0.0602, "step": 206400 }, { "epoch": 3.0400877748486765, "grad_norm": 1.6759331226348877, "learning_rate": 8.716102554703917e-06, "loss": 0.0566, "step": 206425 }, { "epoch": 3.040455957938764, "grad_norm": 1.0811715126037598, "learning_rate": 8.714466182736931e-06, "loss": 0.0514, "step": 206450 }, { "epoch": 3.040824141028851, "grad_norm": 1.4177398681640625, "learning_rate": 8.712829810769946e-06, "loss": 0.0546, "step": 206475 }, { "epoch": 3.041192324118938, "grad_norm": 1.4018912315368652, "learning_rate": 8.711193438802962e-06, "loss": 0.0549, "step": 206500 }, { "epoch": 3.041560507209025, "grad_norm": 1.1432408094406128, "learning_rate": 8.709557066835978e-06, "loss": 0.0579, "step": 206525 }, { "epoch": 3.041928690299112, "grad_norm": 0.9459879398345947, "learning_rate": 8.707920694868994e-06, "loss": 0.0518, "step": 206550 }, { "epoch": 3.042296873389199, "grad_norm": 1.3017204999923706, "learning_rate": 8.706284322902008e-06, "loss": 0.0532, "step": 206575 }, { "epoch": 3.042665056479286, "grad_norm": 1.4303687810897827, "learning_rate": 8.704647950935023e-06, "loss": 0.0592, "step": 206600 }, { "epoch": 3.043033239569373, "grad_norm": 1.5987792015075684, "learning_rate": 8.703011578968039e-06, "loss": 0.0548, "step": 206625 }, { "epoch": 3.04340142265946, "grad_norm": 1.5667405128479004, "learning_rate": 8.701375207001055e-06, "loss": 0.0602, "step": 206650 }, { "epoch": 3.043769605749547, "grad_norm": 1.3785927295684814, "learning_rate": 8.69973883503407e-06, "loss": 0.0552, "step": 206675 }, { "epoch": 3.044137788839634, "grad_norm": 1.4220634698867798, "learning_rate": 8.698102463067086e-06, "loss": 0.0648, "step": 206700 }, { "epoch": 3.044505971929721, "grad_norm": 1.6044307947158813, "learning_rate": 8.6964660911001e-06, "loss": 0.0559, "step": 206725 }, { "epoch": 3.044874155019808, "grad_norm": 1.194157600402832, "learning_rate": 8.694829719133116e-06, "loss": 0.0547, "step": 206750 }, { "epoch": 3.045242338109895, "grad_norm": 0.9766545295715332, "learning_rate": 8.693258802044812e-06, "loss": 0.0572, "step": 206775 }, { "epoch": 3.045610521199982, "grad_norm": 1.2415086030960083, "learning_rate": 8.691622430077826e-06, "loss": 0.0521, "step": 206800 }, { "epoch": 3.0459787042900692, "grad_norm": 0.9433152675628662, "learning_rate": 8.689986058110843e-06, "loss": 0.0596, "step": 206825 }, { "epoch": 3.0463468873801562, "grad_norm": 1.1773619651794434, "learning_rate": 8.688349686143857e-06, "loss": 0.0606, "step": 206850 }, { "epoch": 3.0467150704702433, "grad_norm": 1.4022718667984009, "learning_rate": 8.686713314176873e-06, "loss": 0.0459, "step": 206875 }, { "epoch": 3.0470832535603303, "grad_norm": 1.40048348903656, "learning_rate": 8.685076942209888e-06, "loss": 0.0577, "step": 206900 }, { "epoch": 3.0474514366504177, "grad_norm": 1.2992240190505981, "learning_rate": 8.683440570242904e-06, "loss": 0.0548, "step": 206925 }, { "epoch": 3.0478196197405047, "grad_norm": 1.4067257642745972, "learning_rate": 8.68180419827592e-06, "loss": 0.0571, "step": 206950 }, { "epoch": 3.0481878028305918, "grad_norm": 1.6077123880386353, "learning_rate": 8.680167826308934e-06, "loss": 0.059, "step": 206975 }, { "epoch": 3.048555985920679, "grad_norm": 1.639776587486267, "learning_rate": 8.678531454341949e-06, "loss": 0.0514, "step": 207000 }, { "epoch": 3.048924169010766, "grad_norm": 1.1378449201583862, "learning_rate": 8.676895082374965e-06, "loss": 0.056, "step": 207025 }, { "epoch": 3.049292352100853, "grad_norm": 1.3766497373580933, "learning_rate": 8.675258710407981e-06, "loss": 0.0566, "step": 207050 }, { "epoch": 3.04966053519094, "grad_norm": 1.2629650831222534, "learning_rate": 8.673622338440997e-06, "loss": 0.0508, "step": 207075 }, { "epoch": 3.050028718281027, "grad_norm": 1.3769396543502808, "learning_rate": 8.671985966474012e-06, "loss": 0.0563, "step": 207100 }, { "epoch": 3.050396901371114, "grad_norm": 1.697031021118164, "learning_rate": 8.670349594507026e-06, "loss": 0.0537, "step": 207125 }, { "epoch": 3.050765084461201, "grad_norm": 1.6207547187805176, "learning_rate": 8.668713222540042e-06, "loss": 0.0566, "step": 207150 }, { "epoch": 3.051133267551288, "grad_norm": 1.6054742336273193, "learning_rate": 8.667076850573059e-06, "loss": 0.0588, "step": 207175 }, { "epoch": 3.051501450641375, "grad_norm": 1.6720421314239502, "learning_rate": 8.665440478606073e-06, "loss": 0.0484, "step": 207200 }, { "epoch": 3.051869633731462, "grad_norm": 1.7105470895767212, "learning_rate": 8.663804106639089e-06, "loss": 0.0646, "step": 207225 }, { "epoch": 3.052237816821549, "grad_norm": 1.3694008588790894, "learning_rate": 8.662167734672105e-06, "loss": 0.0544, "step": 207250 }, { "epoch": 3.052605999911636, "grad_norm": 1.3467543125152588, "learning_rate": 8.66053136270512e-06, "loss": 0.0512, "step": 207275 }, { "epoch": 3.052974183001723, "grad_norm": 1.4946752786636353, "learning_rate": 8.658894990738136e-06, "loss": 0.0589, "step": 207300 }, { "epoch": 3.05334236609181, "grad_norm": 1.2056653499603271, "learning_rate": 8.65725861877115e-06, "loss": 0.0494, "step": 207325 }, { "epoch": 3.053710549181897, "grad_norm": 1.4029792547225952, "learning_rate": 8.655622246804167e-06, "loss": 0.0587, "step": 207350 }, { "epoch": 3.054078732271984, "grad_norm": 1.0971531867980957, "learning_rate": 8.653985874837183e-06, "loss": 0.0527, "step": 207375 }, { "epoch": 3.0544469153620715, "grad_norm": 1.40229332447052, "learning_rate": 8.652349502870197e-06, "loss": 0.0592, "step": 207400 }, { "epoch": 3.0548150984521585, "grad_norm": 1.3901351690292358, "learning_rate": 8.650713130903212e-06, "loss": 0.055, "step": 207425 }, { "epoch": 3.0551832815422455, "grad_norm": 1.6501914262771606, "learning_rate": 8.649076758936228e-06, "loss": 0.0602, "step": 207450 }, { "epoch": 3.0555514646323325, "grad_norm": 1.0003635883331299, "learning_rate": 8.647440386969244e-06, "loss": 0.0536, "step": 207475 }, { "epoch": 3.0559196477224195, "grad_norm": 1.4143460988998413, "learning_rate": 8.64580401500226e-06, "loss": 0.0499, "step": 207500 }, { "epoch": 3.0562878308125065, "grad_norm": 1.3231967687606812, "learning_rate": 8.644167643035274e-06, "loss": 0.0593, "step": 207525 }, { "epoch": 3.0566560139025936, "grad_norm": 1.5237478017807007, "learning_rate": 8.642531271068289e-06, "loss": 0.0532, "step": 207550 }, { "epoch": 3.0570241969926806, "grad_norm": 0.9174051880836487, "learning_rate": 8.640894899101305e-06, "loss": 0.0554, "step": 207575 }, { "epoch": 3.0573923800827676, "grad_norm": 1.524187684059143, "learning_rate": 8.639258527134321e-06, "loss": 0.052, "step": 207600 }, { "epoch": 3.0577605631728546, "grad_norm": 0.962013304233551, "learning_rate": 8.637622155167336e-06, "loss": 0.0465, "step": 207625 }, { "epoch": 3.0581287462629416, "grad_norm": 1.402522325515747, "learning_rate": 8.635985783200352e-06, "loss": 0.0551, "step": 207650 }, { "epoch": 3.0584969293530286, "grad_norm": 1.3614004850387573, "learning_rate": 8.634349411233366e-06, "loss": 0.0565, "step": 207675 }, { "epoch": 3.0588651124431157, "grad_norm": 1.1091094017028809, "learning_rate": 8.632713039266382e-06, "loss": 0.0597, "step": 207700 }, { "epoch": 3.0592332955332027, "grad_norm": 0.9271122217178345, "learning_rate": 8.631076667299399e-06, "loss": 0.0537, "step": 207725 }, { "epoch": 3.0596014786232897, "grad_norm": 0.9111393690109253, "learning_rate": 8.629440295332413e-06, "loss": 0.054, "step": 207750 }, { "epoch": 3.0599696617133767, "grad_norm": 1.7573012113571167, "learning_rate": 8.62780392336543e-06, "loss": 0.054, "step": 207775 }, { "epoch": 3.0603378448034637, "grad_norm": 1.514750599861145, "learning_rate": 8.626167551398444e-06, "loss": 0.057, "step": 207800 }, { "epoch": 3.0607060278935507, "grad_norm": 1.525256872177124, "learning_rate": 8.62453117943146e-06, "loss": 0.0501, "step": 207825 }, { "epoch": 3.0610742109836377, "grad_norm": 1.2433980703353882, "learning_rate": 8.622894807464474e-06, "loss": 0.0526, "step": 207850 }, { "epoch": 3.061442394073725, "grad_norm": 0.9487237930297852, "learning_rate": 8.62125843549749e-06, "loss": 0.0523, "step": 207875 }, { "epoch": 3.061810577163812, "grad_norm": 1.2259085178375244, "learning_rate": 8.619622063530507e-06, "loss": 0.0578, "step": 207900 }, { "epoch": 3.0621787602538992, "grad_norm": 1.2869065999984741, "learning_rate": 8.617985691563521e-06, "loss": 0.0536, "step": 207925 }, { "epoch": 3.0625469433439862, "grad_norm": 1.5098904371261597, "learning_rate": 8.616349319596537e-06, "loss": 0.047, "step": 207950 }, { "epoch": 3.0629151264340733, "grad_norm": 1.5204015970230103, "learning_rate": 8.614712947629552e-06, "loss": 0.056, "step": 207975 }, { "epoch": 3.0632833095241603, "grad_norm": 1.3810440301895142, "learning_rate": 8.613076575662568e-06, "loss": 0.0656, "step": 208000 }, { "epoch": 3.0636514926142473, "grad_norm": 1.6976685523986816, "learning_rate": 8.611440203695584e-06, "loss": 0.0576, "step": 208025 }, { "epoch": 3.0640196757043343, "grad_norm": 1.728892207145691, "learning_rate": 8.609803831728598e-06, "loss": 0.0496, "step": 208050 }, { "epoch": 3.0643878587944213, "grad_norm": 1.0475125312805176, "learning_rate": 8.608167459761615e-06, "loss": 0.0546, "step": 208075 }, { "epoch": 3.0647560418845083, "grad_norm": 1.3587110042572021, "learning_rate": 8.606531087794629e-06, "loss": 0.0632, "step": 208100 }, { "epoch": 3.0651242249745954, "grad_norm": 1.8400919437408447, "learning_rate": 8.604894715827645e-06, "loss": 0.0621, "step": 208125 }, { "epoch": 3.0654924080646824, "grad_norm": 1.7645920515060425, "learning_rate": 8.60325834386066e-06, "loss": 0.0601, "step": 208150 }, { "epoch": 3.0658605911547694, "grad_norm": 1.384613037109375, "learning_rate": 8.601621971893676e-06, "loss": 0.054, "step": 208175 }, { "epoch": 3.0662287742448564, "grad_norm": 1.311444878578186, "learning_rate": 8.599985599926692e-06, "loss": 0.0647, "step": 208200 }, { "epoch": 3.0665969573349434, "grad_norm": 1.5473036766052246, "learning_rate": 8.598349227959706e-06, "loss": 0.061, "step": 208225 }, { "epoch": 3.0669651404250304, "grad_norm": 1.3374066352844238, "learning_rate": 8.596712855992723e-06, "loss": 0.0517, "step": 208250 }, { "epoch": 3.0673333235151174, "grad_norm": 1.1354910135269165, "learning_rate": 8.595076484025737e-06, "loss": 0.0548, "step": 208275 }, { "epoch": 3.0677015066052045, "grad_norm": 1.0074881315231323, "learning_rate": 8.593440112058753e-06, "loss": 0.0545, "step": 208300 }, { "epoch": 3.0680696896952915, "grad_norm": 1.469630241394043, "learning_rate": 8.59180374009177e-06, "loss": 0.0557, "step": 208325 }, { "epoch": 3.0684378727853785, "grad_norm": 1.3384222984313965, "learning_rate": 8.590167368124784e-06, "loss": 0.0515, "step": 208350 }, { "epoch": 3.068806055875466, "grad_norm": 1.2757782936096191, "learning_rate": 8.588530996157798e-06, "loss": 0.0561, "step": 208375 }, { "epoch": 3.069174238965553, "grad_norm": 1.3328989744186401, "learning_rate": 8.586894624190814e-06, "loss": 0.0518, "step": 208400 }, { "epoch": 3.06954242205564, "grad_norm": 0.8638681173324585, "learning_rate": 8.58525825222383e-06, "loss": 0.052, "step": 208425 }, { "epoch": 3.069910605145727, "grad_norm": 1.485156536102295, "learning_rate": 8.583621880256847e-06, "loss": 0.0546, "step": 208450 }, { "epoch": 3.070278788235814, "grad_norm": 1.0470079183578491, "learning_rate": 8.581985508289861e-06, "loss": 0.0529, "step": 208475 }, { "epoch": 3.070646971325901, "grad_norm": 1.503570795059204, "learning_rate": 8.580349136322876e-06, "loss": 0.0565, "step": 208500 }, { "epoch": 3.071015154415988, "grad_norm": 1.1559982299804688, "learning_rate": 8.578712764355892e-06, "loss": 0.057, "step": 208525 }, { "epoch": 3.071383337506075, "grad_norm": 1.3871711492538452, "learning_rate": 8.577076392388908e-06, "loss": 0.0565, "step": 208550 }, { "epoch": 3.071751520596162, "grad_norm": 1.815867304801941, "learning_rate": 8.575440020421922e-06, "loss": 0.0525, "step": 208575 }, { "epoch": 3.072119703686249, "grad_norm": 1.232560396194458, "learning_rate": 8.573803648454939e-06, "loss": 0.054, "step": 208600 }, { "epoch": 3.072487886776336, "grad_norm": 1.6233042478561401, "learning_rate": 8.572167276487953e-06, "loss": 0.0485, "step": 208625 }, { "epoch": 3.072856069866423, "grad_norm": 1.4989091157913208, "learning_rate": 8.570530904520969e-06, "loss": 0.0633, "step": 208650 }, { "epoch": 3.07322425295651, "grad_norm": 1.7640851736068726, "learning_rate": 8.568894532553984e-06, "loss": 0.0558, "step": 208675 }, { "epoch": 3.073592436046597, "grad_norm": 1.2119972705841064, "learning_rate": 8.567258160587e-06, "loss": 0.0493, "step": 208700 }, { "epoch": 3.073960619136684, "grad_norm": 1.429002046585083, "learning_rate": 8.565621788620016e-06, "loss": 0.0583, "step": 208725 }, { "epoch": 3.074328802226771, "grad_norm": 1.5290645360946655, "learning_rate": 8.56398541665303e-06, "loss": 0.0542, "step": 208750 }, { "epoch": 3.074696985316858, "grad_norm": 1.0177441835403442, "learning_rate": 8.562349044686046e-06, "loss": 0.0495, "step": 208775 }, { "epoch": 3.075065168406945, "grad_norm": 1.3806568384170532, "learning_rate": 8.560712672719061e-06, "loss": 0.0545, "step": 208800 }, { "epoch": 3.0754333514970322, "grad_norm": 0.9060839414596558, "learning_rate": 8.559076300752077e-06, "loss": 0.0508, "step": 208825 }, { "epoch": 3.0758015345871197, "grad_norm": 1.34939444065094, "learning_rate": 8.557439928785093e-06, "loss": 0.0624, "step": 208850 }, { "epoch": 3.0761697176772067, "grad_norm": 0.9613877534866333, "learning_rate": 8.55580355681811e-06, "loss": 0.0539, "step": 208875 }, { "epoch": 3.0765379007672937, "grad_norm": 1.4004034996032715, "learning_rate": 8.554167184851124e-06, "loss": 0.0499, "step": 208900 }, { "epoch": 3.0769060838573807, "grad_norm": 1.7539432048797607, "learning_rate": 8.552530812884138e-06, "loss": 0.0573, "step": 208925 }, { "epoch": 3.0772742669474678, "grad_norm": 1.5943663120269775, "learning_rate": 8.550894440917154e-06, "loss": 0.0537, "step": 208950 }, { "epoch": 3.0776424500375548, "grad_norm": 1.2340283393859863, "learning_rate": 8.54925806895017e-06, "loss": 0.0517, "step": 208975 }, { "epoch": 3.078010633127642, "grad_norm": 1.14974045753479, "learning_rate": 8.547621696983185e-06, "loss": 0.0515, "step": 209000 }, { "epoch": 3.078378816217729, "grad_norm": 1.496490478515625, "learning_rate": 8.545985325016201e-06, "loss": 0.0548, "step": 209025 }, { "epoch": 3.078746999307816, "grad_norm": 1.1287086009979248, "learning_rate": 8.544348953049216e-06, "loss": 0.0586, "step": 209050 }, { "epoch": 3.079115182397903, "grad_norm": 1.3333474397659302, "learning_rate": 8.542712581082232e-06, "loss": 0.0548, "step": 209075 }, { "epoch": 3.07948336548799, "grad_norm": 1.4309800863265991, "learning_rate": 8.541076209115246e-06, "loss": 0.0556, "step": 209100 }, { "epoch": 3.079851548578077, "grad_norm": 1.2619292736053467, "learning_rate": 8.539439837148262e-06, "loss": 0.0574, "step": 209125 }, { "epoch": 3.080219731668164, "grad_norm": 1.5033754110336304, "learning_rate": 8.537803465181279e-06, "loss": 0.0534, "step": 209150 }, { "epoch": 3.080587914758251, "grad_norm": 1.121462345123291, "learning_rate": 8.536167093214293e-06, "loss": 0.0618, "step": 209175 }, { "epoch": 3.080956097848338, "grad_norm": 1.0048859119415283, "learning_rate": 8.53453072124731e-06, "loss": 0.0501, "step": 209200 }, { "epoch": 3.081324280938425, "grad_norm": 1.148051381111145, "learning_rate": 8.532894349280324e-06, "loss": 0.048, "step": 209225 }, { "epoch": 3.081692464028512, "grad_norm": 1.5723342895507812, "learning_rate": 8.53125797731334e-06, "loss": 0.0574, "step": 209250 }, { "epoch": 3.082060647118599, "grad_norm": 0.7662875056266785, "learning_rate": 8.529621605346356e-06, "loss": 0.0475, "step": 209275 }, { "epoch": 3.082428830208686, "grad_norm": 0.8803878426551819, "learning_rate": 8.52798523337937e-06, "loss": 0.0483, "step": 209300 }, { "epoch": 3.0827970132987734, "grad_norm": 0.956765353679657, "learning_rate": 8.526348861412385e-06, "loss": 0.0566, "step": 209325 }, { "epoch": 3.0831651963888604, "grad_norm": 0.9687982797622681, "learning_rate": 8.524712489445401e-06, "loss": 0.0548, "step": 209350 }, { "epoch": 3.0835333794789475, "grad_norm": 1.46809720993042, "learning_rate": 8.523076117478417e-06, "loss": 0.0574, "step": 209375 }, { "epoch": 3.0839015625690345, "grad_norm": 1.3305907249450684, "learning_rate": 8.521439745511433e-06, "loss": 0.0555, "step": 209400 }, { "epoch": 3.0842697456591215, "grad_norm": 1.184473991394043, "learning_rate": 8.519803373544448e-06, "loss": 0.0628, "step": 209425 }, { "epoch": 3.0846379287492085, "grad_norm": 1.128836750984192, "learning_rate": 8.518167001577464e-06, "loss": 0.055, "step": 209450 }, { "epoch": 3.0850061118392955, "grad_norm": 0.7413156032562256, "learning_rate": 8.516530629610478e-06, "loss": 0.0486, "step": 209475 }, { "epoch": 3.0853742949293825, "grad_norm": 1.2718126773834229, "learning_rate": 8.514894257643495e-06, "loss": 0.0625, "step": 209500 }, { "epoch": 3.0857424780194695, "grad_norm": 1.653568148612976, "learning_rate": 8.513257885676509e-06, "loss": 0.0579, "step": 209525 }, { "epoch": 3.0861106611095566, "grad_norm": 1.485002875328064, "learning_rate": 8.511621513709525e-06, "loss": 0.0569, "step": 209550 }, { "epoch": 3.0864788441996436, "grad_norm": 1.282753586769104, "learning_rate": 8.509985141742541e-06, "loss": 0.0599, "step": 209575 }, { "epoch": 3.0868470272897306, "grad_norm": 1.3891710042953491, "learning_rate": 8.508348769775556e-06, "loss": 0.0532, "step": 209600 }, { "epoch": 3.0872152103798176, "grad_norm": 1.182939052581787, "learning_rate": 8.50671239780857e-06, "loss": 0.0592, "step": 209625 }, { "epoch": 3.0875833934699046, "grad_norm": 1.310529351234436, "learning_rate": 8.505076025841586e-06, "loss": 0.057, "step": 209650 }, { "epoch": 3.0879515765599916, "grad_norm": 1.4837828874588013, "learning_rate": 8.503439653874603e-06, "loss": 0.0526, "step": 209675 }, { "epoch": 3.0883197596500787, "grad_norm": 1.3674213886260986, "learning_rate": 8.501803281907619e-06, "loss": 0.0565, "step": 209700 }, { "epoch": 3.0886879427401657, "grad_norm": 2.05519962310791, "learning_rate": 8.500166909940633e-06, "loss": 0.0582, "step": 209725 }, { "epoch": 3.0890561258302527, "grad_norm": 1.9169014692306519, "learning_rate": 8.498530537973648e-06, "loss": 0.0563, "step": 209750 }, { "epoch": 3.0894243089203397, "grad_norm": 1.029412031173706, "learning_rate": 8.496894166006664e-06, "loss": 0.0565, "step": 209775 }, { "epoch": 3.089792492010427, "grad_norm": 1.0069315433502197, "learning_rate": 8.49525779403968e-06, "loss": 0.062, "step": 209800 }, { "epoch": 3.090160675100514, "grad_norm": 1.0237654447555542, "learning_rate": 8.493621422072696e-06, "loss": 0.0544, "step": 209825 }, { "epoch": 3.090528858190601, "grad_norm": 1.139911413192749, "learning_rate": 8.49198505010571e-06, "loss": 0.0542, "step": 209850 }, { "epoch": 3.090897041280688, "grad_norm": 1.4588353633880615, "learning_rate": 8.490348678138725e-06, "loss": 0.0542, "step": 209875 }, { "epoch": 3.091265224370775, "grad_norm": 1.0450915098190308, "learning_rate": 8.488712306171741e-06, "loss": 0.0559, "step": 209900 }, { "epoch": 3.0916334074608622, "grad_norm": 1.5998412370681763, "learning_rate": 8.487075934204757e-06, "loss": 0.0517, "step": 209925 }, { "epoch": 3.0920015905509493, "grad_norm": 1.2449195384979248, "learning_rate": 8.485439562237772e-06, "loss": 0.0509, "step": 209950 }, { "epoch": 3.0923697736410363, "grad_norm": 2.1324307918548584, "learning_rate": 8.483803190270788e-06, "loss": 0.0577, "step": 209975 }, { "epoch": 3.0927379567311233, "grad_norm": 1.532668113708496, "learning_rate": 8.482166818303802e-06, "loss": 0.0591, "step": 210000 }, { "epoch": 3.0931061398212103, "grad_norm": 1.3548530340194702, "learning_rate": 8.480530446336818e-06, "loss": 0.0496, "step": 210025 }, { "epoch": 3.0934743229112973, "grad_norm": 0.7626944780349731, "learning_rate": 8.478894074369833e-06, "loss": 0.0537, "step": 210050 }, { "epoch": 3.0938425060013843, "grad_norm": 1.320664644241333, "learning_rate": 8.477257702402849e-06, "loss": 0.0529, "step": 210075 }, { "epoch": 3.0942106890914713, "grad_norm": 2.0523860454559326, "learning_rate": 8.475621330435865e-06, "loss": 0.0531, "step": 210100 }, { "epoch": 3.0945788721815584, "grad_norm": 1.1885137557983398, "learning_rate": 8.47398495846888e-06, "loss": 0.0482, "step": 210125 }, { "epoch": 3.0949470552716454, "grad_norm": 1.2213784456253052, "learning_rate": 8.472414041380575e-06, "loss": 0.0639, "step": 210150 }, { "epoch": 3.0953152383617324, "grad_norm": 1.1088509559631348, "learning_rate": 8.47077766941359e-06, "loss": 0.0543, "step": 210175 }, { "epoch": 3.0956834214518194, "grad_norm": 1.1907039880752563, "learning_rate": 8.469141297446606e-06, "loss": 0.0512, "step": 210200 }, { "epoch": 3.0960516045419064, "grad_norm": 1.47086501121521, "learning_rate": 8.467504925479622e-06, "loss": 0.0511, "step": 210225 }, { "epoch": 3.0964197876319934, "grad_norm": 1.4203814268112183, "learning_rate": 8.465868553512636e-06, "loss": 0.0519, "step": 210250 }, { "epoch": 3.0967879707220805, "grad_norm": 1.4843871593475342, "learning_rate": 8.464232181545653e-06, "loss": 0.0471, "step": 210275 }, { "epoch": 3.097156153812168, "grad_norm": 1.4037944078445435, "learning_rate": 8.462595809578667e-06, "loss": 0.0571, "step": 210300 }, { "epoch": 3.097524336902255, "grad_norm": 1.4150410890579224, "learning_rate": 8.460959437611683e-06, "loss": 0.0548, "step": 210325 }, { "epoch": 3.097892519992342, "grad_norm": 1.319433569908142, "learning_rate": 8.4593230656447e-06, "loss": 0.0577, "step": 210350 }, { "epoch": 3.098260703082429, "grad_norm": 1.613512396812439, "learning_rate": 8.457686693677714e-06, "loss": 0.0595, "step": 210375 }, { "epoch": 3.098628886172516, "grad_norm": 0.9161540269851685, "learning_rate": 8.45605032171073e-06, "loss": 0.0516, "step": 210400 }, { "epoch": 3.098997069262603, "grad_norm": 1.211728811264038, "learning_rate": 8.454413949743744e-06, "loss": 0.0539, "step": 210425 }, { "epoch": 3.09936525235269, "grad_norm": 1.1488088369369507, "learning_rate": 8.45277757777676e-06, "loss": 0.0641, "step": 210450 }, { "epoch": 3.099733435442777, "grad_norm": 1.1961686611175537, "learning_rate": 8.451141205809775e-06, "loss": 0.0552, "step": 210475 }, { "epoch": 3.100101618532864, "grad_norm": 0.6367381811141968, "learning_rate": 8.449504833842791e-06, "loss": 0.0547, "step": 210500 }, { "epoch": 3.100469801622951, "grad_norm": 1.1959617137908936, "learning_rate": 8.447868461875807e-06, "loss": 0.0534, "step": 210525 }, { "epoch": 3.100837984713038, "grad_norm": 1.5913487672805786, "learning_rate": 8.446232089908822e-06, "loss": 0.0577, "step": 210550 }, { "epoch": 3.101206167803125, "grad_norm": 1.4227579832077026, "learning_rate": 8.444595717941836e-06, "loss": 0.0524, "step": 210575 }, { "epoch": 3.101574350893212, "grad_norm": 1.454413652420044, "learning_rate": 8.442959345974852e-06, "loss": 0.0584, "step": 210600 }, { "epoch": 3.101942533983299, "grad_norm": 0.9828397631645203, "learning_rate": 8.441322974007869e-06, "loss": 0.0547, "step": 210625 }, { "epoch": 3.102310717073386, "grad_norm": 1.0633606910705566, "learning_rate": 8.439686602040885e-06, "loss": 0.0526, "step": 210650 }, { "epoch": 3.102678900163473, "grad_norm": 1.8901829719543457, "learning_rate": 8.4380502300739e-06, "loss": 0.0582, "step": 210675 }, { "epoch": 3.10304708325356, "grad_norm": 1.520228385925293, "learning_rate": 8.436413858106914e-06, "loss": 0.0488, "step": 210700 }, { "epoch": 3.103415266343647, "grad_norm": 1.1276758909225464, "learning_rate": 8.43477748613993e-06, "loss": 0.0621, "step": 210725 }, { "epoch": 3.1037834494337346, "grad_norm": 0.8954167366027832, "learning_rate": 8.433141114172946e-06, "loss": 0.0591, "step": 210750 }, { "epoch": 3.1041516325238216, "grad_norm": 1.574902057647705, "learning_rate": 8.431504742205962e-06, "loss": 0.0574, "step": 210775 }, { "epoch": 3.1045198156139087, "grad_norm": 1.3450320959091187, "learning_rate": 8.429868370238977e-06, "loss": 0.0593, "step": 210800 }, { "epoch": 3.1048879987039957, "grad_norm": 1.4459121227264404, "learning_rate": 8.428231998271991e-06, "loss": 0.0454, "step": 210825 }, { "epoch": 3.1052561817940827, "grad_norm": 1.0224316120147705, "learning_rate": 8.426595626305007e-06, "loss": 0.0522, "step": 210850 }, { "epoch": 3.1056243648841697, "grad_norm": 1.315860629081726, "learning_rate": 8.424959254338023e-06, "loss": 0.0568, "step": 210875 }, { "epoch": 3.1059925479742567, "grad_norm": 0.7082265019416809, "learning_rate": 8.423322882371038e-06, "loss": 0.0608, "step": 210900 }, { "epoch": 3.1063607310643437, "grad_norm": 1.3857876062393188, "learning_rate": 8.421686510404054e-06, "loss": 0.0495, "step": 210925 }, { "epoch": 3.1067289141544308, "grad_norm": 1.9409323930740356, "learning_rate": 8.420050138437068e-06, "loss": 0.0554, "step": 210950 }, { "epoch": 3.1070970972445178, "grad_norm": 1.4113516807556152, "learning_rate": 8.418413766470085e-06, "loss": 0.052, "step": 210975 }, { "epoch": 3.107465280334605, "grad_norm": 1.1748491525650024, "learning_rate": 8.416777394503099e-06, "loss": 0.0519, "step": 211000 }, { "epoch": 3.107833463424692, "grad_norm": 1.3287670612335205, "learning_rate": 8.415141022536115e-06, "loss": 0.0539, "step": 211025 }, { "epoch": 3.108201646514779, "grad_norm": 1.2790247201919556, "learning_rate": 8.413504650569131e-06, "loss": 0.052, "step": 211050 }, { "epoch": 3.108569829604866, "grad_norm": 1.527503252029419, "learning_rate": 8.411868278602146e-06, "loss": 0.0527, "step": 211075 }, { "epoch": 3.108938012694953, "grad_norm": 1.5122714042663574, "learning_rate": 8.410231906635162e-06, "loss": 0.0568, "step": 211100 }, { "epoch": 3.10930619578504, "grad_norm": 1.2688982486724854, "learning_rate": 8.408595534668176e-06, "loss": 0.0517, "step": 211125 }, { "epoch": 3.109674378875127, "grad_norm": 0.7769589424133301, "learning_rate": 8.406959162701193e-06, "loss": 0.0525, "step": 211150 }, { "epoch": 3.110042561965214, "grad_norm": 1.918821096420288, "learning_rate": 8.405322790734209e-06, "loss": 0.0649, "step": 211175 }, { "epoch": 3.110410745055301, "grad_norm": 1.1520462036132812, "learning_rate": 8.403686418767225e-06, "loss": 0.0498, "step": 211200 }, { "epoch": 3.110778928145388, "grad_norm": 1.44691002368927, "learning_rate": 8.40205004680024e-06, "loss": 0.0519, "step": 211225 }, { "epoch": 3.1111471112354754, "grad_norm": 1.6217703819274902, "learning_rate": 8.400413674833254e-06, "loss": 0.053, "step": 211250 }, { "epoch": 3.1115152943255624, "grad_norm": 1.853597640991211, "learning_rate": 8.39877730286627e-06, "loss": 0.0582, "step": 211275 }, { "epoch": 3.1118834774156494, "grad_norm": 0.60493004322052, "learning_rate": 8.397140930899286e-06, "loss": 0.0543, "step": 211300 }, { "epoch": 3.1122516605057364, "grad_norm": 1.6704847812652588, "learning_rate": 8.3955045589323e-06, "loss": 0.0631, "step": 211325 }, { "epoch": 3.1126198435958234, "grad_norm": 1.2225840091705322, "learning_rate": 8.393868186965317e-06, "loss": 0.0611, "step": 211350 }, { "epoch": 3.1129880266859105, "grad_norm": 1.5563147068023682, "learning_rate": 8.392231814998331e-06, "loss": 0.0493, "step": 211375 }, { "epoch": 3.1133562097759975, "grad_norm": 1.3426176309585571, "learning_rate": 8.390595443031347e-06, "loss": 0.0558, "step": 211400 }, { "epoch": 3.1137243928660845, "grad_norm": 1.3655571937561035, "learning_rate": 8.388959071064362e-06, "loss": 0.0581, "step": 211425 }, { "epoch": 3.1140925759561715, "grad_norm": 1.1556378602981567, "learning_rate": 8.387322699097378e-06, "loss": 0.0551, "step": 211450 }, { "epoch": 3.1144607590462585, "grad_norm": 0.9666590094566345, "learning_rate": 8.385686327130394e-06, "loss": 0.0511, "step": 211475 }, { "epoch": 3.1148289421363455, "grad_norm": 1.3704558610916138, "learning_rate": 8.384049955163408e-06, "loss": 0.0543, "step": 211500 }, { "epoch": 3.1151971252264326, "grad_norm": 1.1888535022735596, "learning_rate": 8.382413583196423e-06, "loss": 0.0578, "step": 211525 }, { "epoch": 3.1155653083165196, "grad_norm": 1.8262602090835571, "learning_rate": 8.380777211229439e-06, "loss": 0.0535, "step": 211550 }, { "epoch": 3.1159334914066066, "grad_norm": 1.3509559631347656, "learning_rate": 8.379140839262455e-06, "loss": 0.0556, "step": 211575 }, { "epoch": 3.1163016744966936, "grad_norm": 1.738990068435669, "learning_rate": 8.377504467295471e-06, "loss": 0.0557, "step": 211600 }, { "epoch": 3.1166698575867806, "grad_norm": 1.7641172409057617, "learning_rate": 8.375868095328486e-06, "loss": 0.0554, "step": 211625 }, { "epoch": 3.1170380406768676, "grad_norm": 1.572106122970581, "learning_rate": 8.3742317233615e-06, "loss": 0.0591, "step": 211650 }, { "epoch": 3.1174062237669546, "grad_norm": 1.3219088315963745, "learning_rate": 8.372595351394516e-06, "loss": 0.0557, "step": 211675 }, { "epoch": 3.1177744068570417, "grad_norm": 1.7581079006195068, "learning_rate": 8.370958979427533e-06, "loss": 0.0602, "step": 211700 }, { "epoch": 3.118142589947129, "grad_norm": 1.3362586498260498, "learning_rate": 8.369322607460549e-06, "loss": 0.0508, "step": 211725 }, { "epoch": 3.118510773037216, "grad_norm": 1.5656627416610718, "learning_rate": 8.367686235493563e-06, "loss": 0.059, "step": 211750 }, { "epoch": 3.118878956127303, "grad_norm": 1.342461109161377, "learning_rate": 8.36604986352658e-06, "loss": 0.058, "step": 211775 }, { "epoch": 3.11924713921739, "grad_norm": 1.5516109466552734, "learning_rate": 8.364413491559594e-06, "loss": 0.0573, "step": 211800 }, { "epoch": 3.119615322307477, "grad_norm": 1.3761303424835205, "learning_rate": 8.36277711959261e-06, "loss": 0.0581, "step": 211825 }, { "epoch": 3.119983505397564, "grad_norm": 1.4315694570541382, "learning_rate": 8.361140747625624e-06, "loss": 0.0587, "step": 211850 }, { "epoch": 3.120351688487651, "grad_norm": 1.0159449577331543, "learning_rate": 8.35950437565864e-06, "loss": 0.0522, "step": 211875 }, { "epoch": 3.1207198715777382, "grad_norm": 1.3247125148773193, "learning_rate": 8.357868003691657e-06, "loss": 0.0552, "step": 211900 }, { "epoch": 3.1210880546678252, "grad_norm": 0.9327214360237122, "learning_rate": 8.356231631724671e-06, "loss": 0.0531, "step": 211925 }, { "epoch": 3.1214562377579123, "grad_norm": 1.2175869941711426, "learning_rate": 8.354595259757686e-06, "loss": 0.0562, "step": 211950 }, { "epoch": 3.1218244208479993, "grad_norm": 1.0258172750473022, "learning_rate": 8.352958887790702e-06, "loss": 0.0558, "step": 211975 }, { "epoch": 3.1221926039380863, "grad_norm": 1.298048973083496, "learning_rate": 8.351322515823718e-06, "loss": 0.047, "step": 212000 }, { "epoch": 3.1225607870281733, "grad_norm": 1.1345291137695312, "learning_rate": 8.349686143856734e-06, "loss": 0.0568, "step": 212025 }, { "epoch": 3.1229289701182603, "grad_norm": 1.2308984994888306, "learning_rate": 8.348049771889749e-06, "loss": 0.0558, "step": 212050 }, { "epoch": 3.1232971532083473, "grad_norm": 1.4277386665344238, "learning_rate": 8.346413399922763e-06, "loss": 0.0511, "step": 212075 }, { "epoch": 3.1236653362984343, "grad_norm": 1.3885278701782227, "learning_rate": 8.34477702795578e-06, "loss": 0.0613, "step": 212100 }, { "epoch": 3.1240335193885214, "grad_norm": 1.5365197658538818, "learning_rate": 8.343140655988795e-06, "loss": 0.0627, "step": 212125 }, { "epoch": 3.1244017024786084, "grad_norm": 1.2687935829162598, "learning_rate": 8.341504284021811e-06, "loss": 0.0484, "step": 212150 }, { "epoch": 3.1247698855686954, "grad_norm": 1.5429891347885132, "learning_rate": 8.339867912054826e-06, "loss": 0.0558, "step": 212175 }, { "epoch": 3.1251380686587824, "grad_norm": 1.4517898559570312, "learning_rate": 8.33823154008784e-06, "loss": 0.0509, "step": 212200 }, { "epoch": 3.12550625174887, "grad_norm": 1.2259372472763062, "learning_rate": 8.336595168120857e-06, "loss": 0.0584, "step": 212225 }, { "epoch": 3.125874434838957, "grad_norm": 1.4298137426376343, "learning_rate": 8.334958796153873e-06, "loss": 0.0597, "step": 212250 }, { "epoch": 3.126242617929044, "grad_norm": 0.9515334963798523, "learning_rate": 8.333322424186887e-06, "loss": 0.0475, "step": 212275 }, { "epoch": 3.126610801019131, "grad_norm": 1.2227177619934082, "learning_rate": 8.331686052219903e-06, "loss": 0.0557, "step": 212300 }, { "epoch": 3.126978984109218, "grad_norm": 1.1603729724884033, "learning_rate": 8.330049680252918e-06, "loss": 0.0525, "step": 212325 }, { "epoch": 3.127347167199305, "grad_norm": 1.310678243637085, "learning_rate": 8.328413308285934e-06, "loss": 0.0497, "step": 212350 }, { "epoch": 3.127715350289392, "grad_norm": 1.4992417097091675, "learning_rate": 8.326776936318948e-06, "loss": 0.0495, "step": 212375 }, { "epoch": 3.128083533379479, "grad_norm": 1.0512217283248901, "learning_rate": 8.325140564351965e-06, "loss": 0.06, "step": 212400 }, { "epoch": 3.128451716469566, "grad_norm": 1.5721431970596313, "learning_rate": 8.32350419238498e-06, "loss": 0.056, "step": 212425 }, { "epoch": 3.128819899559653, "grad_norm": 0.7191261649131775, "learning_rate": 8.321867820417995e-06, "loss": 0.049, "step": 212450 }, { "epoch": 3.12918808264974, "grad_norm": 1.365514874458313, "learning_rate": 8.320231448451011e-06, "loss": 0.057, "step": 212475 }, { "epoch": 3.129556265739827, "grad_norm": 1.22919499874115, "learning_rate": 8.318595076484026e-06, "loss": 0.0594, "step": 212500 }, { "epoch": 3.129924448829914, "grad_norm": 1.4038081169128418, "learning_rate": 8.316958704517042e-06, "loss": 0.0547, "step": 212525 }, { "epoch": 3.130292631920001, "grad_norm": 1.0304944515228271, "learning_rate": 8.315322332550058e-06, "loss": 0.056, "step": 212550 }, { "epoch": 3.130660815010088, "grad_norm": 1.1844600439071655, "learning_rate": 8.313685960583073e-06, "loss": 0.0554, "step": 212575 }, { "epoch": 3.131028998100175, "grad_norm": 1.4965578317642212, "learning_rate": 8.312115043494768e-06, "loss": 0.0565, "step": 212600 }, { "epoch": 3.131397181190262, "grad_norm": 1.496036410331726, "learning_rate": 8.310478671527783e-06, "loss": 0.0581, "step": 212625 }, { "epoch": 3.131765364280349, "grad_norm": 1.601948857307434, "learning_rate": 8.308842299560799e-06, "loss": 0.0638, "step": 212650 }, { "epoch": 3.1321335473704366, "grad_norm": 0.8362765908241272, "learning_rate": 8.307205927593815e-06, "loss": 0.046, "step": 212675 }, { "epoch": 3.1325017304605236, "grad_norm": 1.1488044261932373, "learning_rate": 8.30556955562683e-06, "loss": 0.0548, "step": 212700 }, { "epoch": 3.1328699135506106, "grad_norm": 1.212314486503601, "learning_rate": 8.303933183659845e-06, "loss": 0.0532, "step": 212725 }, { "epoch": 3.1332380966406976, "grad_norm": 1.6238683462142944, "learning_rate": 8.30229681169286e-06, "loss": 0.0572, "step": 212750 }, { "epoch": 3.1336062797307846, "grad_norm": 1.1864604949951172, "learning_rate": 8.300660439725876e-06, "loss": 0.0611, "step": 212775 }, { "epoch": 3.1339744628208717, "grad_norm": 1.8363687992095947, "learning_rate": 8.29902406775889e-06, "loss": 0.0588, "step": 212800 }, { "epoch": 3.1343426459109587, "grad_norm": 1.286948561668396, "learning_rate": 8.297387695791907e-06, "loss": 0.0578, "step": 212825 }, { "epoch": 3.1347108290010457, "grad_norm": 1.3574045896530151, "learning_rate": 8.295751323824923e-06, "loss": 0.05, "step": 212850 }, { "epoch": 3.1350790120911327, "grad_norm": 1.433182716369629, "learning_rate": 8.294114951857937e-06, "loss": 0.0588, "step": 212875 }, { "epoch": 3.1354471951812197, "grad_norm": 1.83694589138031, "learning_rate": 8.292478579890952e-06, "loss": 0.0622, "step": 212900 }, { "epoch": 3.1358153782713067, "grad_norm": 1.7200795412063599, "learning_rate": 8.290842207923968e-06, "loss": 0.0576, "step": 212925 }, { "epoch": 3.1361835613613938, "grad_norm": 1.1987969875335693, "learning_rate": 8.289205835956984e-06, "loss": 0.0565, "step": 212950 }, { "epoch": 3.1365517444514808, "grad_norm": 1.3952667713165283, "learning_rate": 8.28756946399e-06, "loss": 0.0581, "step": 212975 }, { "epoch": 3.136919927541568, "grad_norm": 1.2205032110214233, "learning_rate": 8.285933092023015e-06, "loss": 0.0637, "step": 213000 }, { "epoch": 3.137288110631655, "grad_norm": 1.66941499710083, "learning_rate": 8.284296720056029e-06, "loss": 0.0553, "step": 213025 }, { "epoch": 3.137656293721742, "grad_norm": 1.975106954574585, "learning_rate": 8.282660348089045e-06, "loss": 0.0505, "step": 213050 }, { "epoch": 3.138024476811829, "grad_norm": 1.55607271194458, "learning_rate": 8.281023976122061e-06, "loss": 0.0541, "step": 213075 }, { "epoch": 3.138392659901916, "grad_norm": 1.6888313293457031, "learning_rate": 8.279387604155078e-06, "loss": 0.0577, "step": 213100 }, { "epoch": 3.138760842992003, "grad_norm": 1.2630295753479004, "learning_rate": 8.277751232188092e-06, "loss": 0.0628, "step": 213125 }, { "epoch": 3.13912902608209, "grad_norm": 1.3446288108825684, "learning_rate": 8.276114860221106e-06, "loss": 0.0536, "step": 213150 }, { "epoch": 3.1394972091721773, "grad_norm": 0.8002458810806274, "learning_rate": 8.274478488254123e-06, "loss": 0.0549, "step": 213175 }, { "epoch": 3.1398653922622644, "grad_norm": 1.483613133430481, "learning_rate": 8.272842116287139e-06, "loss": 0.0573, "step": 213200 }, { "epoch": 3.1402335753523514, "grad_norm": 1.3966134786605835, "learning_rate": 8.271205744320153e-06, "loss": 0.0576, "step": 213225 }, { "epoch": 3.1406017584424384, "grad_norm": 1.3493943214416504, "learning_rate": 8.26956937235317e-06, "loss": 0.0559, "step": 213250 }, { "epoch": 3.1409699415325254, "grad_norm": 1.8125332593917847, "learning_rate": 8.267933000386184e-06, "loss": 0.0561, "step": 213275 }, { "epoch": 3.1413381246226124, "grad_norm": 1.1806167364120483, "learning_rate": 8.2662966284192e-06, "loss": 0.0573, "step": 213300 }, { "epoch": 3.1417063077126994, "grad_norm": 1.107303500175476, "learning_rate": 8.264660256452214e-06, "loss": 0.057, "step": 213325 }, { "epoch": 3.1420744908027864, "grad_norm": 1.1827845573425293, "learning_rate": 8.26302388448523e-06, "loss": 0.0553, "step": 213350 }, { "epoch": 3.1424426738928735, "grad_norm": 1.6559511423110962, "learning_rate": 8.261387512518247e-06, "loss": 0.0516, "step": 213375 }, { "epoch": 3.1428108569829605, "grad_norm": 0.8393713235855103, "learning_rate": 8.259751140551261e-06, "loss": 0.0527, "step": 213400 }, { "epoch": 3.1431790400730475, "grad_norm": 1.2095073461532593, "learning_rate": 8.258114768584277e-06, "loss": 0.052, "step": 213425 }, { "epoch": 3.1435472231631345, "grad_norm": 0.9588323831558228, "learning_rate": 8.256478396617292e-06, "loss": 0.0555, "step": 213450 }, { "epoch": 3.1439154062532215, "grad_norm": 1.6538848876953125, "learning_rate": 8.254842024650308e-06, "loss": 0.0639, "step": 213475 }, { "epoch": 3.1442835893433085, "grad_norm": 0.5957412123680115, "learning_rate": 8.253205652683324e-06, "loss": 0.0582, "step": 213500 }, { "epoch": 3.1446517724333956, "grad_norm": 1.141301155090332, "learning_rate": 8.251569280716339e-06, "loss": 0.0514, "step": 213525 }, { "epoch": 3.1450199555234826, "grad_norm": 1.6773204803466797, "learning_rate": 8.249932908749355e-06, "loss": 0.0594, "step": 213550 }, { "epoch": 3.1453881386135696, "grad_norm": 1.4257261753082275, "learning_rate": 8.24829653678237e-06, "loss": 0.0525, "step": 213575 }, { "epoch": 3.1457563217036566, "grad_norm": 1.009519338607788, "learning_rate": 8.246660164815385e-06, "loss": 0.0544, "step": 213600 }, { "epoch": 3.146124504793744, "grad_norm": 1.6525260210037231, "learning_rate": 8.245023792848401e-06, "loss": 0.061, "step": 213625 }, { "epoch": 3.146492687883831, "grad_norm": 1.2808359861373901, "learning_rate": 8.243387420881416e-06, "loss": 0.0521, "step": 213650 }, { "epoch": 3.146860870973918, "grad_norm": 1.2175997495651245, "learning_rate": 8.241751048914432e-06, "loss": 0.0582, "step": 213675 }, { "epoch": 3.147229054064005, "grad_norm": 1.5359950065612793, "learning_rate": 8.240114676947447e-06, "loss": 0.0647, "step": 213700 }, { "epoch": 3.147597237154092, "grad_norm": 1.3045196533203125, "learning_rate": 8.238478304980463e-06, "loss": 0.0558, "step": 213725 }, { "epoch": 3.147965420244179, "grad_norm": 1.7997868061065674, "learning_rate": 8.236841933013477e-06, "loss": 0.0585, "step": 213750 }, { "epoch": 3.148333603334266, "grad_norm": 1.3765270709991455, "learning_rate": 8.235205561046493e-06, "loss": 0.063, "step": 213775 }, { "epoch": 3.148701786424353, "grad_norm": 1.4333709478378296, "learning_rate": 8.23356918907951e-06, "loss": 0.0503, "step": 213800 }, { "epoch": 3.14906996951444, "grad_norm": 1.555262804031372, "learning_rate": 8.231932817112524e-06, "loss": 0.0584, "step": 213825 }, { "epoch": 3.149438152604527, "grad_norm": 0.9358018636703491, "learning_rate": 8.230296445145538e-06, "loss": 0.0581, "step": 213850 }, { "epoch": 3.149806335694614, "grad_norm": 0.9457089900970459, "learning_rate": 8.228660073178555e-06, "loss": 0.0563, "step": 213875 }, { "epoch": 3.1501745187847012, "grad_norm": 1.402043104171753, "learning_rate": 8.22702370121157e-06, "loss": 0.053, "step": 213900 }, { "epoch": 3.1505427018747882, "grad_norm": 1.2711849212646484, "learning_rate": 8.225387329244587e-06, "loss": 0.0588, "step": 213925 }, { "epoch": 3.1509108849648753, "grad_norm": 1.3364461660385132, "learning_rate": 8.223750957277601e-06, "loss": 0.059, "step": 213950 }, { "epoch": 3.1512790680549623, "grad_norm": 1.0449579954147339, "learning_rate": 8.222114585310616e-06, "loss": 0.0504, "step": 213975 }, { "epoch": 3.1516472511450493, "grad_norm": 1.3899604082107544, "learning_rate": 8.220478213343632e-06, "loss": 0.0499, "step": 214000 }, { "epoch": 3.1520154342351363, "grad_norm": 1.4784462451934814, "learning_rate": 8.218841841376648e-06, "loss": 0.0611, "step": 214025 }, { "epoch": 3.1523836173252233, "grad_norm": 0.9336026906967163, "learning_rate": 8.217205469409663e-06, "loss": 0.054, "step": 214050 }, { "epoch": 3.1527518004153103, "grad_norm": 1.2687602043151855, "learning_rate": 8.215569097442679e-06, "loss": 0.0548, "step": 214075 }, { "epoch": 3.1531199835053974, "grad_norm": 1.118599534034729, "learning_rate": 8.213932725475695e-06, "loss": 0.0546, "step": 214100 }, { "epoch": 3.153488166595485, "grad_norm": 1.5339264869689941, "learning_rate": 8.21229635350871e-06, "loss": 0.0502, "step": 214125 }, { "epoch": 3.153856349685572, "grad_norm": 0.9971723556518555, "learning_rate": 8.210659981541725e-06, "loss": 0.0533, "step": 214150 }, { "epoch": 3.154224532775659, "grad_norm": 1.572286605834961, "learning_rate": 8.20902360957474e-06, "loss": 0.0546, "step": 214175 }, { "epoch": 3.154592715865746, "grad_norm": 1.5081692934036255, "learning_rate": 8.207387237607756e-06, "loss": 0.055, "step": 214200 }, { "epoch": 3.154960898955833, "grad_norm": 1.7470306158065796, "learning_rate": 8.205750865640772e-06, "loss": 0.0585, "step": 214225 }, { "epoch": 3.15532908204592, "grad_norm": 1.0209217071533203, "learning_rate": 8.204114493673787e-06, "loss": 0.0559, "step": 214250 }, { "epoch": 3.155697265136007, "grad_norm": 0.9646037220954895, "learning_rate": 8.202478121706801e-06, "loss": 0.054, "step": 214275 }, { "epoch": 3.156065448226094, "grad_norm": 1.0945680141448975, "learning_rate": 8.200841749739817e-06, "loss": 0.054, "step": 214300 }, { "epoch": 3.156433631316181, "grad_norm": 1.498083472251892, "learning_rate": 8.199205377772833e-06, "loss": 0.055, "step": 214325 }, { "epoch": 3.156801814406268, "grad_norm": 1.2775437831878662, "learning_rate": 8.19756900580585e-06, "loss": 0.0579, "step": 214350 }, { "epoch": 3.157169997496355, "grad_norm": 1.5905368328094482, "learning_rate": 8.195932633838864e-06, "loss": 0.0576, "step": 214375 }, { "epoch": 3.157538180586442, "grad_norm": 1.3423746824264526, "learning_rate": 8.194296261871878e-06, "loss": 0.0574, "step": 214400 }, { "epoch": 3.157906363676529, "grad_norm": 1.3613702058792114, "learning_rate": 8.192659889904895e-06, "loss": 0.0546, "step": 214425 }, { "epoch": 3.158274546766616, "grad_norm": 1.3552147150039673, "learning_rate": 8.19102351793791e-06, "loss": 0.0554, "step": 214450 }, { "epoch": 3.158642729856703, "grad_norm": 1.9894176721572876, "learning_rate": 8.189387145970925e-06, "loss": 0.0618, "step": 214475 }, { "epoch": 3.15901091294679, "grad_norm": 1.0688800811767578, "learning_rate": 8.187750774003941e-06, "loss": 0.064, "step": 214500 }, { "epoch": 3.159379096036877, "grad_norm": 1.731858730316162, "learning_rate": 8.186114402036956e-06, "loss": 0.0568, "step": 214525 }, { "epoch": 3.159747279126964, "grad_norm": 1.0963916778564453, "learning_rate": 8.184478030069972e-06, "loss": 0.0572, "step": 214550 }, { "epoch": 3.1601154622170515, "grad_norm": 0.7976915240287781, "learning_rate": 8.182841658102988e-06, "loss": 0.0546, "step": 214575 }, { "epoch": 3.1604836453071385, "grad_norm": 1.5268316268920898, "learning_rate": 8.181205286136003e-06, "loss": 0.0531, "step": 214600 }, { "epoch": 3.1608518283972256, "grad_norm": 0.8235132694244385, "learning_rate": 8.179568914169019e-06, "loss": 0.0597, "step": 214625 }, { "epoch": 3.1612200114873126, "grad_norm": 1.3213927745819092, "learning_rate": 8.177932542202033e-06, "loss": 0.0587, "step": 214650 }, { "epoch": 3.1615881945773996, "grad_norm": 1.0174158811569214, "learning_rate": 8.17629617023505e-06, "loss": 0.0541, "step": 214675 }, { "epoch": 3.1619563776674866, "grad_norm": 1.4569728374481201, "learning_rate": 8.174659798268064e-06, "loss": 0.0572, "step": 214700 }, { "epoch": 3.1623245607575736, "grad_norm": 1.13290536403656, "learning_rate": 8.17302342630108e-06, "loss": 0.0521, "step": 214725 }, { "epoch": 3.1626927438476606, "grad_norm": 1.3021537065505981, "learning_rate": 8.171387054334096e-06, "loss": 0.0518, "step": 214750 }, { "epoch": 3.1630609269377477, "grad_norm": 1.4855173826217651, "learning_rate": 8.16975068236711e-06, "loss": 0.0572, "step": 214775 }, { "epoch": 3.1634291100278347, "grad_norm": 1.2575362920761108, "learning_rate": 8.168114310400127e-06, "loss": 0.0526, "step": 214800 }, { "epoch": 3.1637972931179217, "grad_norm": 1.4837589263916016, "learning_rate": 8.166477938433141e-06, "loss": 0.0501, "step": 214825 }, { "epoch": 3.1641654762080087, "grad_norm": 1.3201878070831299, "learning_rate": 8.164841566466157e-06, "loss": 0.0529, "step": 214850 }, { "epoch": 3.1645336592980957, "grad_norm": 0.935027539730072, "learning_rate": 8.163205194499173e-06, "loss": 0.0548, "step": 214875 }, { "epoch": 3.1649018423881827, "grad_norm": 1.7025067806243896, "learning_rate": 8.161568822532188e-06, "loss": 0.0552, "step": 214900 }, { "epoch": 3.1652700254782697, "grad_norm": 1.2447409629821777, "learning_rate": 8.159932450565204e-06, "loss": 0.0571, "step": 214925 }, { "epoch": 3.1656382085683568, "grad_norm": 1.6836856603622437, "learning_rate": 8.158296078598219e-06, "loss": 0.0515, "step": 214950 }, { "epoch": 3.1660063916584438, "grad_norm": 1.638223648071289, "learning_rate": 8.156659706631235e-06, "loss": 0.0545, "step": 214975 }, { "epoch": 3.166374574748531, "grad_norm": 1.0520964860916138, "learning_rate": 8.15502333466425e-06, "loss": 0.0565, "step": 215000 }, { "epoch": 3.166742757838618, "grad_norm": 1.174172043800354, "learning_rate": 8.153452417575945e-06, "loss": 0.0568, "step": 215025 }, { "epoch": 3.167110940928705, "grad_norm": 1.3834646940231323, "learning_rate": 8.151816045608961e-06, "loss": 0.0588, "step": 215050 }, { "epoch": 3.167479124018792, "grad_norm": 0.9561067223548889, "learning_rate": 8.150179673641975e-06, "loss": 0.058, "step": 215075 }, { "epoch": 3.1678473071088793, "grad_norm": 1.3745793104171753, "learning_rate": 8.148543301674992e-06, "loss": 0.0536, "step": 215100 }, { "epoch": 3.1682154901989663, "grad_norm": 1.4004675149917603, "learning_rate": 8.146906929708006e-06, "loss": 0.0561, "step": 215125 }, { "epoch": 3.1685836732890533, "grad_norm": 1.1246006488800049, "learning_rate": 8.145270557741022e-06, "loss": 0.0568, "step": 215150 }, { "epoch": 3.1689518563791403, "grad_norm": 1.1674959659576416, "learning_rate": 8.143634185774038e-06, "loss": 0.0515, "step": 215175 }, { "epoch": 3.1693200394692274, "grad_norm": 1.3476957082748413, "learning_rate": 8.141997813807053e-06, "loss": 0.0513, "step": 215200 }, { "epoch": 3.1696882225593144, "grad_norm": 1.0233887434005737, "learning_rate": 8.140361441840067e-06, "loss": 0.0562, "step": 215225 }, { "epoch": 3.1700564056494014, "grad_norm": 1.5655750036239624, "learning_rate": 8.138725069873083e-06, "loss": 0.0531, "step": 215250 }, { "epoch": 3.1704245887394884, "grad_norm": 1.1323137283325195, "learning_rate": 8.1370886979061e-06, "loss": 0.0572, "step": 215275 }, { "epoch": 3.1707927718295754, "grad_norm": 1.1090760231018066, "learning_rate": 8.135452325939116e-06, "loss": 0.0596, "step": 215300 }, { "epoch": 3.1711609549196624, "grad_norm": 1.6740914583206177, "learning_rate": 8.13381595397213e-06, "loss": 0.0502, "step": 215325 }, { "epoch": 3.1715291380097494, "grad_norm": 1.2162855863571167, "learning_rate": 8.132179582005145e-06, "loss": 0.0483, "step": 215350 }, { "epoch": 3.1718973210998365, "grad_norm": 1.3601698875427246, "learning_rate": 8.13054321003816e-06, "loss": 0.0549, "step": 215375 }, { "epoch": 3.1722655041899235, "grad_norm": 1.5169645547866821, "learning_rate": 8.128906838071177e-06, "loss": 0.048, "step": 215400 }, { "epoch": 3.1726336872800105, "grad_norm": 1.4372507333755493, "learning_rate": 8.127270466104191e-06, "loss": 0.0533, "step": 215425 }, { "epoch": 3.1730018703700975, "grad_norm": 1.2877602577209473, "learning_rate": 8.125634094137207e-06, "loss": 0.0597, "step": 215450 }, { "epoch": 3.1733700534601845, "grad_norm": 0.972962498664856, "learning_rate": 8.123997722170222e-06, "loss": 0.0514, "step": 215475 }, { "epoch": 3.1737382365502715, "grad_norm": 1.4570029973983765, "learning_rate": 8.122361350203238e-06, "loss": 0.053, "step": 215500 }, { "epoch": 3.1741064196403586, "grad_norm": 1.2261537313461304, "learning_rate": 8.120724978236254e-06, "loss": 0.055, "step": 215525 }, { "epoch": 3.174474602730446, "grad_norm": 1.1197057962417603, "learning_rate": 8.119088606269269e-06, "loss": 0.0616, "step": 215550 }, { "epoch": 3.174842785820533, "grad_norm": 1.6672215461730957, "learning_rate": 8.117452234302285e-06, "loss": 0.0549, "step": 215575 }, { "epoch": 3.17521096891062, "grad_norm": 1.498630404472351, "learning_rate": 8.1158158623353e-06, "loss": 0.0488, "step": 215600 }, { "epoch": 3.175579152000707, "grad_norm": 1.8891905546188354, "learning_rate": 8.114179490368315e-06, "loss": 0.056, "step": 215625 }, { "epoch": 3.175947335090794, "grad_norm": 1.3721314668655396, "learning_rate": 8.11254311840133e-06, "loss": 0.0583, "step": 215650 }, { "epoch": 3.176315518180881, "grad_norm": 1.6834851503372192, "learning_rate": 8.110906746434346e-06, "loss": 0.0589, "step": 215675 }, { "epoch": 3.176683701270968, "grad_norm": 1.2526143789291382, "learning_rate": 8.109270374467362e-06, "loss": 0.0529, "step": 215700 }, { "epoch": 3.177051884361055, "grad_norm": 1.2112129926681519, "learning_rate": 8.107634002500377e-06, "loss": 0.0486, "step": 215725 }, { "epoch": 3.177420067451142, "grad_norm": 1.319957971572876, "learning_rate": 8.105997630533393e-06, "loss": 0.0524, "step": 215750 }, { "epoch": 3.177788250541229, "grad_norm": 1.3872143030166626, "learning_rate": 8.104361258566407e-06, "loss": 0.0578, "step": 215775 }, { "epoch": 3.178156433631316, "grad_norm": 1.1528382301330566, "learning_rate": 8.102724886599423e-06, "loss": 0.0534, "step": 215800 }, { "epoch": 3.178524616721403, "grad_norm": 1.0561034679412842, "learning_rate": 8.10108851463244e-06, "loss": 0.0567, "step": 215825 }, { "epoch": 3.17889279981149, "grad_norm": 1.3813204765319824, "learning_rate": 8.099452142665454e-06, "loss": 0.0506, "step": 215850 }, { "epoch": 3.179260982901577, "grad_norm": 1.1070430278778076, "learning_rate": 8.09781577069847e-06, "loss": 0.0543, "step": 215875 }, { "epoch": 3.1796291659916642, "grad_norm": 1.2274774312973022, "learning_rate": 8.096179398731485e-06, "loss": 0.0532, "step": 215900 }, { "epoch": 3.1799973490817512, "grad_norm": 1.2431498765945435, "learning_rate": 8.0945430267645e-06, "loss": 0.0594, "step": 215925 }, { "epoch": 3.1803655321718383, "grad_norm": 1.5270816087722778, "learning_rate": 8.092906654797515e-06, "loss": 0.0535, "step": 215950 }, { "epoch": 3.1807337152619253, "grad_norm": 1.6544066667556763, "learning_rate": 8.091270282830531e-06, "loss": 0.0556, "step": 215975 }, { "epoch": 3.1811018983520123, "grad_norm": 1.0528546571731567, "learning_rate": 8.089633910863548e-06, "loss": 0.0546, "step": 216000 }, { "epoch": 3.1814700814420993, "grad_norm": 1.2223880290985107, "learning_rate": 8.087997538896562e-06, "loss": 0.0572, "step": 216025 }, { "epoch": 3.1818382645321868, "grad_norm": 1.477240800857544, "learning_rate": 8.086361166929578e-06, "loss": 0.0539, "step": 216050 }, { "epoch": 3.182206447622274, "grad_norm": 1.1323323249816895, "learning_rate": 8.084724794962593e-06, "loss": 0.056, "step": 216075 }, { "epoch": 3.182574630712361, "grad_norm": 1.3468780517578125, "learning_rate": 8.083088422995609e-06, "loss": 0.0596, "step": 216100 }, { "epoch": 3.182942813802448, "grad_norm": 1.3269598484039307, "learning_rate": 8.081452051028625e-06, "loss": 0.0544, "step": 216125 }, { "epoch": 3.183310996892535, "grad_norm": 1.4006699323654175, "learning_rate": 8.07981567906164e-06, "loss": 0.0544, "step": 216150 }, { "epoch": 3.183679179982622, "grad_norm": 1.5041946172714233, "learning_rate": 8.078179307094654e-06, "loss": 0.0598, "step": 216175 }, { "epoch": 3.184047363072709, "grad_norm": 1.2448440790176392, "learning_rate": 8.07654293512767e-06, "loss": 0.0508, "step": 216200 }, { "epoch": 3.184415546162796, "grad_norm": 1.682237148284912, "learning_rate": 8.074906563160686e-06, "loss": 0.054, "step": 216225 }, { "epoch": 3.184783729252883, "grad_norm": 1.3867461681365967, "learning_rate": 8.073270191193702e-06, "loss": 0.0534, "step": 216250 }, { "epoch": 3.18515191234297, "grad_norm": 1.4437001943588257, "learning_rate": 8.071633819226717e-06, "loss": 0.0593, "step": 216275 }, { "epoch": 3.185520095433057, "grad_norm": 1.1554878950119019, "learning_rate": 8.069997447259731e-06, "loss": 0.0556, "step": 216300 }, { "epoch": 3.185888278523144, "grad_norm": 1.4752306938171387, "learning_rate": 8.068361075292747e-06, "loss": 0.0628, "step": 216325 }, { "epoch": 3.186256461613231, "grad_norm": 1.122360110282898, "learning_rate": 8.066724703325763e-06, "loss": 0.054, "step": 216350 }, { "epoch": 3.186624644703318, "grad_norm": 1.4905937910079956, "learning_rate": 8.065088331358778e-06, "loss": 0.0588, "step": 216375 }, { "epoch": 3.186992827793405, "grad_norm": 1.376253366470337, "learning_rate": 8.063451959391794e-06, "loss": 0.052, "step": 216400 }, { "epoch": 3.187361010883492, "grad_norm": 1.725206732749939, "learning_rate": 8.061815587424809e-06, "loss": 0.0672, "step": 216425 }, { "epoch": 3.187729193973579, "grad_norm": 1.6516075134277344, "learning_rate": 8.060179215457825e-06, "loss": 0.0637, "step": 216450 }, { "epoch": 3.188097377063666, "grad_norm": 1.9658507108688354, "learning_rate": 8.05854284349084e-06, "loss": 0.0559, "step": 216475 }, { "epoch": 3.1884655601537535, "grad_norm": 1.446641206741333, "learning_rate": 8.056906471523855e-06, "loss": 0.0554, "step": 216500 }, { "epoch": 3.1888337432438405, "grad_norm": 1.4606941938400269, "learning_rate": 8.055270099556871e-06, "loss": 0.0561, "step": 216525 }, { "epoch": 3.1892019263339275, "grad_norm": 1.0025992393493652, "learning_rate": 8.053633727589888e-06, "loss": 0.0522, "step": 216550 }, { "epoch": 3.1895701094240145, "grad_norm": 1.394100308418274, "learning_rate": 8.051997355622902e-06, "loss": 0.0536, "step": 216575 }, { "epoch": 3.1899382925141015, "grad_norm": 1.3884831666946411, "learning_rate": 8.050360983655917e-06, "loss": 0.0563, "step": 216600 }, { "epoch": 3.1903064756041886, "grad_norm": 1.4022445678710938, "learning_rate": 8.048724611688933e-06, "loss": 0.056, "step": 216625 }, { "epoch": 3.1906746586942756, "grad_norm": 1.6145238876342773, "learning_rate": 8.047088239721949e-06, "loss": 0.058, "step": 216650 }, { "epoch": 3.1910428417843626, "grad_norm": 1.4306340217590332, "learning_rate": 8.045451867754965e-06, "loss": 0.0552, "step": 216675 }, { "epoch": 3.1914110248744496, "grad_norm": 0.9680589437484741, "learning_rate": 8.04381549578798e-06, "loss": 0.0551, "step": 216700 }, { "epoch": 3.1917792079645366, "grad_norm": 1.6016921997070312, "learning_rate": 8.042179123820994e-06, "loss": 0.0593, "step": 216725 }, { "epoch": 3.1921473910546236, "grad_norm": 1.6941205263137817, "learning_rate": 8.04054275185401e-06, "loss": 0.0565, "step": 216750 }, { "epoch": 3.1925155741447107, "grad_norm": 1.2751888036727905, "learning_rate": 8.038906379887026e-06, "loss": 0.0561, "step": 216775 }, { "epoch": 3.1928837572347977, "grad_norm": 1.2650781869888306, "learning_rate": 8.03727000792004e-06, "loss": 0.0521, "step": 216800 }, { "epoch": 3.1932519403248847, "grad_norm": 1.5958646535873413, "learning_rate": 8.035633635953057e-06, "loss": 0.0631, "step": 216825 }, { "epoch": 3.1936201234149717, "grad_norm": 1.2072230577468872, "learning_rate": 8.033997263986071e-06, "loss": 0.0508, "step": 216850 }, { "epoch": 3.1939883065050587, "grad_norm": 1.642998456954956, "learning_rate": 8.032360892019087e-06, "loss": 0.0588, "step": 216875 }, { "epoch": 3.1943564895951457, "grad_norm": 1.6794160604476929, "learning_rate": 8.030724520052102e-06, "loss": 0.053, "step": 216900 }, { "epoch": 3.1947246726852327, "grad_norm": 1.5724201202392578, "learning_rate": 8.029088148085118e-06, "loss": 0.0476, "step": 216925 }, { "epoch": 3.1950928557753198, "grad_norm": 1.1608883142471313, "learning_rate": 8.027451776118134e-06, "loss": 0.0615, "step": 216950 }, { "epoch": 3.1954610388654068, "grad_norm": 0.874894380569458, "learning_rate": 8.025815404151149e-06, "loss": 0.0574, "step": 216975 }, { "epoch": 3.195829221955494, "grad_norm": 1.4091254472732544, "learning_rate": 8.024179032184165e-06, "loss": 0.0551, "step": 217000 }, { "epoch": 3.1961974050455813, "grad_norm": 1.1484310626983643, "learning_rate": 8.02254266021718e-06, "loss": 0.0565, "step": 217025 }, { "epoch": 3.1965655881356683, "grad_norm": 1.597987174987793, "learning_rate": 8.020906288250195e-06, "loss": 0.0535, "step": 217050 }, { "epoch": 3.1969337712257553, "grad_norm": 1.5632587671279907, "learning_rate": 8.019335371161891e-06, "loss": 0.0541, "step": 217075 }, { "epoch": 3.1973019543158423, "grad_norm": 1.4962395429611206, "learning_rate": 8.017698999194905e-06, "loss": 0.0621, "step": 217100 }, { "epoch": 3.1976701374059293, "grad_norm": 1.2469425201416016, "learning_rate": 8.01606262722792e-06, "loss": 0.0533, "step": 217125 }, { "epoch": 3.1980383204960163, "grad_norm": 1.5091272592544556, "learning_rate": 8.014426255260936e-06, "loss": 0.057, "step": 217150 }, { "epoch": 3.1984065035861033, "grad_norm": 1.6714004278182983, "learning_rate": 8.012789883293952e-06, "loss": 0.0569, "step": 217175 }, { "epoch": 3.1987746866761904, "grad_norm": 1.188004493713379, "learning_rate": 8.011153511326968e-06, "loss": 0.0571, "step": 217200 }, { "epoch": 3.1991428697662774, "grad_norm": 1.5033217668533325, "learning_rate": 8.009517139359983e-06, "loss": 0.0617, "step": 217225 }, { "epoch": 3.1995110528563644, "grad_norm": 1.7540327310562134, "learning_rate": 8.007880767392999e-06, "loss": 0.0521, "step": 217250 }, { "epoch": 3.1998792359464514, "grad_norm": 1.2581640481948853, "learning_rate": 8.006244395426013e-06, "loss": 0.0459, "step": 217275 }, { "epoch": 3.2002474190365384, "grad_norm": 1.2538645267486572, "learning_rate": 8.00460802345903e-06, "loss": 0.0527, "step": 217300 }, { "epoch": 3.2006156021266254, "grad_norm": 1.5519791841506958, "learning_rate": 8.002971651492044e-06, "loss": 0.0586, "step": 217325 }, { "epoch": 3.2009837852167125, "grad_norm": 1.1620047092437744, "learning_rate": 8.00133527952506e-06, "loss": 0.0588, "step": 217350 }, { "epoch": 3.2013519683067995, "grad_norm": 1.399783730506897, "learning_rate": 7.999698907558076e-06, "loss": 0.0553, "step": 217375 }, { "epoch": 3.2017201513968865, "grad_norm": 1.1367632150650024, "learning_rate": 7.99806253559109e-06, "loss": 0.0571, "step": 217400 }, { "epoch": 3.2020883344869735, "grad_norm": 1.2106285095214844, "learning_rate": 7.996426163624105e-06, "loss": 0.0518, "step": 217425 }, { "epoch": 3.2024565175770605, "grad_norm": 1.589905858039856, "learning_rate": 7.994789791657121e-06, "loss": 0.0645, "step": 217450 }, { "epoch": 3.202824700667148, "grad_norm": 1.630958080291748, "learning_rate": 7.993153419690138e-06, "loss": 0.0537, "step": 217475 }, { "epoch": 3.203192883757235, "grad_norm": 1.2025760412216187, "learning_rate": 7.991517047723154e-06, "loss": 0.0534, "step": 217500 }, { "epoch": 3.203561066847322, "grad_norm": 1.398712158203125, "learning_rate": 7.989880675756168e-06, "loss": 0.0541, "step": 217525 }, { "epoch": 3.203929249937409, "grad_norm": 0.9454752802848816, "learning_rate": 7.988244303789183e-06, "loss": 0.0537, "step": 217550 }, { "epoch": 3.204297433027496, "grad_norm": 1.665662407875061, "learning_rate": 7.986607931822199e-06, "loss": 0.0614, "step": 217575 }, { "epoch": 3.204665616117583, "grad_norm": 1.098403811454773, "learning_rate": 7.984971559855215e-06, "loss": 0.0609, "step": 217600 }, { "epoch": 3.20503379920767, "grad_norm": 1.2247380018234253, "learning_rate": 7.983335187888231e-06, "loss": 0.0591, "step": 217625 }, { "epoch": 3.205401982297757, "grad_norm": 1.1882036924362183, "learning_rate": 7.981698815921246e-06, "loss": 0.0536, "step": 217650 }, { "epoch": 3.205770165387844, "grad_norm": 1.586959719657898, "learning_rate": 7.98006244395426e-06, "loss": 0.0545, "step": 217675 }, { "epoch": 3.206138348477931, "grad_norm": 1.1314295530319214, "learning_rate": 7.978426071987276e-06, "loss": 0.0591, "step": 217700 }, { "epoch": 3.206506531568018, "grad_norm": 1.2921476364135742, "learning_rate": 7.976789700020292e-06, "loss": 0.0577, "step": 217725 }, { "epoch": 3.206874714658105, "grad_norm": 0.9713476896286011, "learning_rate": 7.975153328053307e-06, "loss": 0.0491, "step": 217750 }, { "epoch": 3.207242897748192, "grad_norm": 1.641406536102295, "learning_rate": 7.973516956086323e-06, "loss": 0.059, "step": 217775 }, { "epoch": 3.207611080838279, "grad_norm": 1.4800699949264526, "learning_rate": 7.971880584119337e-06, "loss": 0.05, "step": 217800 }, { "epoch": 3.207979263928366, "grad_norm": 0.901652455329895, "learning_rate": 7.970244212152354e-06, "loss": 0.0564, "step": 217825 }, { "epoch": 3.208347447018453, "grad_norm": 1.2968181371688843, "learning_rate": 7.968607840185368e-06, "loss": 0.0571, "step": 217850 }, { "epoch": 3.20871563010854, "grad_norm": 1.3947423696517944, "learning_rate": 7.966971468218384e-06, "loss": 0.0585, "step": 217875 }, { "epoch": 3.2090838131986272, "grad_norm": 0.698746919631958, "learning_rate": 7.9653350962514e-06, "loss": 0.0577, "step": 217900 }, { "epoch": 3.2094519962887142, "grad_norm": 1.2787644863128662, "learning_rate": 7.963698724284415e-06, "loss": 0.0599, "step": 217925 }, { "epoch": 3.2098201793788013, "grad_norm": 1.4996448755264282, "learning_rate": 7.962062352317431e-06, "loss": 0.0571, "step": 217950 }, { "epoch": 3.2101883624688887, "grad_norm": 1.2406779527664185, "learning_rate": 7.960425980350445e-06, "loss": 0.0509, "step": 217975 }, { "epoch": 3.2105565455589757, "grad_norm": 2.0091066360473633, "learning_rate": 7.958789608383461e-06, "loss": 0.056, "step": 218000 }, { "epoch": 3.2109247286490628, "grad_norm": 1.932629942893982, "learning_rate": 7.957153236416478e-06, "loss": 0.0506, "step": 218025 }, { "epoch": 3.2112929117391498, "grad_norm": 1.0832786560058594, "learning_rate": 7.955516864449492e-06, "loss": 0.0602, "step": 218050 }, { "epoch": 3.211661094829237, "grad_norm": 1.1417902708053589, "learning_rate": 7.953880492482508e-06, "loss": 0.0529, "step": 218075 }, { "epoch": 3.212029277919324, "grad_norm": 1.7789398431777954, "learning_rate": 7.952244120515523e-06, "loss": 0.0589, "step": 218100 }, { "epoch": 3.212397461009411, "grad_norm": 1.2323665618896484, "learning_rate": 7.950607748548539e-06, "loss": 0.0561, "step": 218125 }, { "epoch": 3.212765644099498, "grad_norm": 1.3466113805770874, "learning_rate": 7.948971376581555e-06, "loss": 0.0542, "step": 218150 }, { "epoch": 3.213133827189585, "grad_norm": 1.055102825164795, "learning_rate": 7.94733500461457e-06, "loss": 0.054, "step": 218175 }, { "epoch": 3.213502010279672, "grad_norm": 1.2706629037857056, "learning_rate": 7.945698632647586e-06, "loss": 0.0529, "step": 218200 }, { "epoch": 3.213870193369759, "grad_norm": 1.5244171619415283, "learning_rate": 7.9440622606806e-06, "loss": 0.0519, "step": 218225 }, { "epoch": 3.214238376459846, "grad_norm": 1.0767817497253418, "learning_rate": 7.942425888713616e-06, "loss": 0.057, "step": 218250 }, { "epoch": 3.214606559549933, "grad_norm": 1.303810954093933, "learning_rate": 7.94078951674663e-06, "loss": 0.057, "step": 218275 }, { "epoch": 3.21497474264002, "grad_norm": 1.4361202716827393, "learning_rate": 7.939153144779647e-06, "loss": 0.0512, "step": 218300 }, { "epoch": 3.215342925730107, "grad_norm": 1.2506037950515747, "learning_rate": 7.937516772812663e-06, "loss": 0.0606, "step": 218325 }, { "epoch": 3.215711108820194, "grad_norm": 1.189992070198059, "learning_rate": 7.935880400845677e-06, "loss": 0.0563, "step": 218350 }, { "epoch": 3.216079291910281, "grad_norm": 1.3108410835266113, "learning_rate": 7.934244028878692e-06, "loss": 0.0478, "step": 218375 }, { "epoch": 3.216447475000368, "grad_norm": 1.2580293416976929, "learning_rate": 7.932607656911708e-06, "loss": 0.0583, "step": 218400 }, { "epoch": 3.2168156580904554, "grad_norm": 1.4991685152053833, "learning_rate": 7.930971284944724e-06, "loss": 0.0591, "step": 218425 }, { "epoch": 3.2171838411805425, "grad_norm": 1.218895435333252, "learning_rate": 7.92933491297774e-06, "loss": 0.0555, "step": 218450 }, { "epoch": 3.2175520242706295, "grad_norm": 1.5605872869491577, "learning_rate": 7.927698541010755e-06, "loss": 0.0492, "step": 218475 }, { "epoch": 3.2179202073607165, "grad_norm": 1.4264897108078003, "learning_rate": 7.92606216904377e-06, "loss": 0.0561, "step": 218500 }, { "epoch": 3.2182883904508035, "grad_norm": 1.3707656860351562, "learning_rate": 7.924425797076785e-06, "loss": 0.0539, "step": 218525 }, { "epoch": 3.2186565735408905, "grad_norm": 0.6610348224639893, "learning_rate": 7.922789425109802e-06, "loss": 0.0507, "step": 218550 }, { "epoch": 3.2190247566309775, "grad_norm": 1.1268141269683838, "learning_rate": 7.921153053142818e-06, "loss": 0.0576, "step": 218575 }, { "epoch": 3.2193929397210646, "grad_norm": 1.5163272619247437, "learning_rate": 7.919516681175832e-06, "loss": 0.0616, "step": 218600 }, { "epoch": 3.2197611228111516, "grad_norm": 1.032633900642395, "learning_rate": 7.917880309208847e-06, "loss": 0.0504, "step": 218625 }, { "epoch": 3.2201293059012386, "grad_norm": 1.1997606754302979, "learning_rate": 7.916243937241863e-06, "loss": 0.0589, "step": 218650 }, { "epoch": 3.2204974889913256, "grad_norm": 0.787285327911377, "learning_rate": 7.914607565274879e-06, "loss": 0.0579, "step": 218675 }, { "epoch": 3.2208656720814126, "grad_norm": 1.094734787940979, "learning_rate": 7.912971193307893e-06, "loss": 0.0512, "step": 218700 }, { "epoch": 3.2212338551714996, "grad_norm": 1.3936247825622559, "learning_rate": 7.91133482134091e-06, "loss": 0.0556, "step": 218725 }, { "epoch": 3.2216020382615866, "grad_norm": 0.9421206116676331, "learning_rate": 7.909698449373924e-06, "loss": 0.0539, "step": 218750 }, { "epoch": 3.2219702213516737, "grad_norm": 1.6949080228805542, "learning_rate": 7.90806207740694e-06, "loss": 0.0549, "step": 218775 }, { "epoch": 3.2223384044417607, "grad_norm": 1.5208719968795776, "learning_rate": 7.906425705439955e-06, "loss": 0.0615, "step": 218800 }, { "epoch": 3.2227065875318477, "grad_norm": 1.4702141284942627, "learning_rate": 7.90478933347297e-06, "loss": 0.0506, "step": 218825 }, { "epoch": 3.2230747706219347, "grad_norm": 1.4829256534576416, "learning_rate": 7.903152961505987e-06, "loss": 0.0545, "step": 218850 }, { "epoch": 3.2234429537120217, "grad_norm": 1.4444411993026733, "learning_rate": 7.901516589539003e-06, "loss": 0.0598, "step": 218875 }, { "epoch": 3.2238111368021087, "grad_norm": 1.365617275238037, "learning_rate": 7.899880217572018e-06, "loss": 0.0511, "step": 218900 }, { "epoch": 3.224179319892196, "grad_norm": 1.714347243309021, "learning_rate": 7.898243845605032e-06, "loss": 0.0561, "step": 218925 }, { "epoch": 3.224547502982283, "grad_norm": 0.9410088658332825, "learning_rate": 7.896607473638048e-06, "loss": 0.0564, "step": 218950 }, { "epoch": 3.2249156860723702, "grad_norm": 1.8370044231414795, "learning_rate": 7.894971101671064e-06, "loss": 0.0526, "step": 218975 }, { "epoch": 3.2252838691624572, "grad_norm": 1.2382044792175293, "learning_rate": 7.89333472970408e-06, "loss": 0.0573, "step": 219000 }, { "epoch": 3.2256520522525443, "grad_norm": 1.8614258766174316, "learning_rate": 7.891698357737095e-06, "loss": 0.0631, "step": 219025 }, { "epoch": 3.2260202353426313, "grad_norm": 1.6997545957565308, "learning_rate": 7.89006198577011e-06, "loss": 0.0569, "step": 219050 }, { "epoch": 3.2263884184327183, "grad_norm": 2.166200876235962, "learning_rate": 7.888425613803126e-06, "loss": 0.0611, "step": 219075 }, { "epoch": 3.2267566015228053, "grad_norm": 1.0739834308624268, "learning_rate": 7.886789241836142e-06, "loss": 0.0496, "step": 219100 }, { "epoch": 3.2271247846128923, "grad_norm": 1.9614357948303223, "learning_rate": 7.885152869869156e-06, "loss": 0.0512, "step": 219125 }, { "epoch": 3.2274929677029793, "grad_norm": 1.7242183685302734, "learning_rate": 7.883516497902172e-06, "loss": 0.0567, "step": 219150 }, { "epoch": 3.2278611507930663, "grad_norm": 1.8307344913482666, "learning_rate": 7.881880125935187e-06, "loss": 0.0564, "step": 219175 }, { "epoch": 3.2282293338831534, "grad_norm": 1.1590774059295654, "learning_rate": 7.880243753968203e-06, "loss": 0.051, "step": 219200 }, { "epoch": 3.2285975169732404, "grad_norm": 1.5442832708358765, "learning_rate": 7.878607382001217e-06, "loss": 0.0506, "step": 219225 }, { "epoch": 3.2289657000633274, "grad_norm": 0.9634265303611755, "learning_rate": 7.876971010034233e-06, "loss": 0.0601, "step": 219250 }, { "epoch": 3.2293338831534144, "grad_norm": 1.2625250816345215, "learning_rate": 7.87533463806725e-06, "loss": 0.0562, "step": 219275 }, { "epoch": 3.2297020662435014, "grad_norm": 1.1419134140014648, "learning_rate": 7.873698266100264e-06, "loss": 0.0611, "step": 219300 }, { "epoch": 3.2300702493335884, "grad_norm": 1.245415210723877, "learning_rate": 7.872061894133279e-06, "loss": 0.061, "step": 219325 }, { "epoch": 3.2304384324236755, "grad_norm": 1.7258957624435425, "learning_rate": 7.870425522166295e-06, "loss": 0.0547, "step": 219350 }, { "epoch": 3.230806615513763, "grad_norm": 1.766653299331665, "learning_rate": 7.868789150199311e-06, "loss": 0.0571, "step": 219375 }, { "epoch": 3.23117479860385, "grad_norm": 0.9274715781211853, "learning_rate": 7.867152778232327e-06, "loss": 0.0531, "step": 219400 }, { "epoch": 3.231542981693937, "grad_norm": 1.7551229000091553, "learning_rate": 7.865581861144021e-06, "loss": 0.053, "step": 219425 }, { "epoch": 3.231911164784024, "grad_norm": 1.257689118385315, "learning_rate": 7.863945489177035e-06, "loss": 0.0591, "step": 219450 }, { "epoch": 3.232279347874111, "grad_norm": 1.8327311277389526, "learning_rate": 7.862309117210051e-06, "loss": 0.0539, "step": 219475 }, { "epoch": 3.232647530964198, "grad_norm": 1.2686773538589478, "learning_rate": 7.860672745243068e-06, "loss": 0.0525, "step": 219500 }, { "epoch": 3.233015714054285, "grad_norm": 1.14503812789917, "learning_rate": 7.859036373276084e-06, "loss": 0.0586, "step": 219525 }, { "epoch": 3.233383897144372, "grad_norm": 1.4078103303909302, "learning_rate": 7.857400001309098e-06, "loss": 0.0493, "step": 219550 }, { "epoch": 3.233752080234459, "grad_norm": 0.8492733240127563, "learning_rate": 7.855763629342114e-06, "loss": 0.0533, "step": 219575 }, { "epoch": 3.234120263324546, "grad_norm": 1.3695659637451172, "learning_rate": 7.854127257375129e-06, "loss": 0.061, "step": 219600 }, { "epoch": 3.234488446414633, "grad_norm": 1.4010695219039917, "learning_rate": 7.852490885408145e-06, "loss": 0.0503, "step": 219625 }, { "epoch": 3.23485662950472, "grad_norm": 1.7307606935501099, "learning_rate": 7.85085451344116e-06, "loss": 0.0576, "step": 219650 }, { "epoch": 3.235224812594807, "grad_norm": 1.6170222759246826, "learning_rate": 7.849218141474176e-06, "loss": 0.0533, "step": 219675 }, { "epoch": 3.235592995684894, "grad_norm": 1.5843403339385986, "learning_rate": 7.847581769507192e-06, "loss": 0.053, "step": 219700 }, { "epoch": 3.235961178774981, "grad_norm": 1.438511848449707, "learning_rate": 7.845945397540206e-06, "loss": 0.0525, "step": 219725 }, { "epoch": 3.236329361865068, "grad_norm": 1.6156588792800903, "learning_rate": 7.84430902557322e-06, "loss": 0.0607, "step": 219750 }, { "epoch": 3.236697544955155, "grad_norm": 1.6585315465927124, "learning_rate": 7.842672653606237e-06, "loss": 0.0512, "step": 219775 }, { "epoch": 3.237065728045242, "grad_norm": 0.9308322072029114, "learning_rate": 7.841036281639253e-06, "loss": 0.051, "step": 219800 }, { "epoch": 3.237433911135329, "grad_norm": 0.7710166573524475, "learning_rate": 7.839399909672269e-06, "loss": 0.0578, "step": 219825 }, { "epoch": 3.237802094225416, "grad_norm": 1.2792218923568726, "learning_rate": 7.837763537705284e-06, "loss": 0.0548, "step": 219850 }, { "epoch": 3.238170277315503, "grad_norm": 0.8353171348571777, "learning_rate": 7.836127165738298e-06, "loss": 0.0533, "step": 219875 }, { "epoch": 3.2385384604055907, "grad_norm": 1.236276388168335, "learning_rate": 7.834490793771314e-06, "loss": 0.0612, "step": 219900 }, { "epoch": 3.2389066434956777, "grad_norm": 1.0535063743591309, "learning_rate": 7.83285442180433e-06, "loss": 0.0608, "step": 219925 }, { "epoch": 3.2392748265857647, "grad_norm": 1.2724214792251587, "learning_rate": 7.831218049837347e-06, "loss": 0.0556, "step": 219950 }, { "epoch": 3.2396430096758517, "grad_norm": 1.3391023874282837, "learning_rate": 7.829581677870361e-06, "loss": 0.0543, "step": 219975 }, { "epoch": 3.2400111927659387, "grad_norm": 1.8974146842956543, "learning_rate": 7.827945305903375e-06, "loss": 0.0621, "step": 220000 }, { "epoch": 3.2403793758560258, "grad_norm": 1.2320915460586548, "learning_rate": 7.826308933936392e-06, "loss": 0.0557, "step": 220025 }, { "epoch": 3.2407475589461128, "grad_norm": 1.5294811725616455, "learning_rate": 7.824672561969408e-06, "loss": 0.0521, "step": 220050 }, { "epoch": 3.2411157420362, "grad_norm": 1.2936004400253296, "learning_rate": 7.823036190002422e-06, "loss": 0.0531, "step": 220075 }, { "epoch": 3.241483925126287, "grad_norm": 1.2569042444229126, "learning_rate": 7.821399818035438e-06, "loss": 0.0474, "step": 220100 }, { "epoch": 3.241852108216374, "grad_norm": 1.6288968324661255, "learning_rate": 7.819763446068453e-06, "loss": 0.0529, "step": 220125 }, { "epoch": 3.242220291306461, "grad_norm": 1.7834948301315308, "learning_rate": 7.818127074101469e-06, "loss": 0.0606, "step": 220150 }, { "epoch": 3.242588474396548, "grad_norm": 1.0031009912490845, "learning_rate": 7.816490702134483e-06, "loss": 0.0532, "step": 220175 }, { "epoch": 3.242956657486635, "grad_norm": 1.7676444053649902, "learning_rate": 7.8148543301675e-06, "loss": 0.0574, "step": 220200 }, { "epoch": 3.243324840576722, "grad_norm": 1.1878416538238525, "learning_rate": 7.813217958200516e-06, "loss": 0.0529, "step": 220225 }, { "epoch": 3.243693023666809, "grad_norm": 1.1438179016113281, "learning_rate": 7.81158158623353e-06, "loss": 0.0555, "step": 220250 }, { "epoch": 3.244061206756896, "grad_norm": 1.3004798889160156, "learning_rate": 7.809945214266546e-06, "loss": 0.0545, "step": 220275 }, { "epoch": 3.244429389846983, "grad_norm": 1.0291725397109985, "learning_rate": 7.80830884229956e-06, "loss": 0.049, "step": 220300 }, { "epoch": 3.24479757293707, "grad_norm": 1.079509973526001, "learning_rate": 7.806672470332577e-06, "loss": 0.055, "step": 220325 }, { "epoch": 3.2451657560271574, "grad_norm": 1.3070451021194458, "learning_rate": 7.805036098365593e-06, "loss": 0.0477, "step": 220350 }, { "epoch": 3.2455339391172444, "grad_norm": 1.6348049640655518, "learning_rate": 7.803399726398608e-06, "loss": 0.0557, "step": 220375 }, { "epoch": 3.2459021222073314, "grad_norm": 1.5824440717697144, "learning_rate": 7.801763354431624e-06, "loss": 0.0544, "step": 220400 }, { "epoch": 3.2462703052974184, "grad_norm": 1.23793625831604, "learning_rate": 7.800126982464638e-06, "loss": 0.0522, "step": 220425 }, { "epoch": 3.2466384883875055, "grad_norm": 1.3498425483703613, "learning_rate": 7.798490610497654e-06, "loss": 0.0551, "step": 220450 }, { "epoch": 3.2470066714775925, "grad_norm": 1.681096076965332, "learning_rate": 7.79685423853067e-06, "loss": 0.0566, "step": 220475 }, { "epoch": 3.2473748545676795, "grad_norm": 1.1958085298538208, "learning_rate": 7.795217866563685e-06, "loss": 0.0551, "step": 220500 }, { "epoch": 3.2477430376577665, "grad_norm": 1.3471393585205078, "learning_rate": 7.793581494596701e-06, "loss": 0.059, "step": 220525 }, { "epoch": 3.2481112207478535, "grad_norm": 0.9900381565093994, "learning_rate": 7.791945122629716e-06, "loss": 0.0549, "step": 220550 }, { "epoch": 3.2484794038379405, "grad_norm": 1.659070372581482, "learning_rate": 7.790308750662732e-06, "loss": 0.0581, "step": 220575 }, { "epoch": 3.2488475869280276, "grad_norm": 1.5467759370803833, "learning_rate": 7.788672378695746e-06, "loss": 0.0526, "step": 220600 }, { "epoch": 3.2492157700181146, "grad_norm": 1.1430433988571167, "learning_rate": 7.787036006728762e-06, "loss": 0.0567, "step": 220625 }, { "epoch": 3.2495839531082016, "grad_norm": 1.223158597946167, "learning_rate": 7.785399634761778e-06, "loss": 0.0551, "step": 220650 }, { "epoch": 3.2499521361982886, "grad_norm": 1.2660014629364014, "learning_rate": 7.783763262794793e-06, "loss": 0.0553, "step": 220675 }, { "epoch": 3.2503203192883756, "grad_norm": 1.2932569980621338, "learning_rate": 7.782126890827807e-06, "loss": 0.0536, "step": 220700 }, { "epoch": 3.2506885023784626, "grad_norm": 1.11372971534729, "learning_rate": 7.780490518860823e-06, "loss": 0.0511, "step": 220725 }, { "epoch": 3.2510566854685496, "grad_norm": 1.1959940195083618, "learning_rate": 7.77885414689384e-06, "loss": 0.0541, "step": 220750 }, { "epoch": 3.2514248685586367, "grad_norm": 1.5139520168304443, "learning_rate": 7.777217774926856e-06, "loss": 0.0592, "step": 220775 }, { "epoch": 3.2517930516487237, "grad_norm": 1.145010232925415, "learning_rate": 7.77558140295987e-06, "loss": 0.0588, "step": 220800 }, { "epoch": 3.2521612347388107, "grad_norm": 0.9240149855613708, "learning_rate": 7.773945030992885e-06, "loss": 0.0485, "step": 220825 }, { "epoch": 3.2525294178288977, "grad_norm": 1.4335905313491821, "learning_rate": 7.772308659025901e-06, "loss": 0.0573, "step": 220850 }, { "epoch": 3.252897600918985, "grad_norm": 1.374603509902954, "learning_rate": 7.770672287058917e-06, "loss": 0.0491, "step": 220875 }, { "epoch": 3.253265784009072, "grad_norm": 1.4705373048782349, "learning_rate": 7.769035915091933e-06, "loss": 0.0537, "step": 220900 }, { "epoch": 3.253633967099159, "grad_norm": 1.4508769512176514, "learning_rate": 7.767399543124948e-06, "loss": 0.061, "step": 220925 }, { "epoch": 3.254002150189246, "grad_norm": 1.0141245126724243, "learning_rate": 7.765763171157962e-06, "loss": 0.0487, "step": 220950 }, { "epoch": 3.2543703332793332, "grad_norm": 1.3924833536148071, "learning_rate": 7.764126799190978e-06, "loss": 0.052, "step": 220975 }, { "epoch": 3.2547385163694202, "grad_norm": 1.394154667854309, "learning_rate": 7.762490427223994e-06, "loss": 0.0537, "step": 221000 }, { "epoch": 3.2551066994595073, "grad_norm": 1.1420148611068726, "learning_rate": 7.760854055257009e-06, "loss": 0.0542, "step": 221025 }, { "epoch": 3.2554748825495943, "grad_norm": 1.1677132844924927, "learning_rate": 7.759217683290025e-06, "loss": 0.0553, "step": 221050 }, { "epoch": 3.2558430656396813, "grad_norm": 1.5021580457687378, "learning_rate": 7.75758131132304e-06, "loss": 0.0505, "step": 221075 }, { "epoch": 3.2562112487297683, "grad_norm": 1.3404242992401123, "learning_rate": 7.755944939356056e-06, "loss": 0.0535, "step": 221100 }, { "epoch": 3.2565794318198553, "grad_norm": 1.3072950839996338, "learning_rate": 7.75430856738907e-06, "loss": 0.0597, "step": 221125 }, { "epoch": 3.2569476149099423, "grad_norm": 1.358812689781189, "learning_rate": 7.752672195422086e-06, "loss": 0.0521, "step": 221150 }, { "epoch": 3.2573157980000293, "grad_norm": 0.9844956398010254, "learning_rate": 7.751035823455102e-06, "loss": 0.0548, "step": 221175 }, { "epoch": 3.2576839810901164, "grad_norm": 0.9220600128173828, "learning_rate": 7.749399451488119e-06, "loss": 0.0549, "step": 221200 }, { "epoch": 3.2580521641802034, "grad_norm": 0.9293469190597534, "learning_rate": 7.747763079521133e-06, "loss": 0.0521, "step": 221225 }, { "epoch": 3.2584203472702904, "grad_norm": 2.073119878768921, "learning_rate": 7.746126707554147e-06, "loss": 0.0565, "step": 221250 }, { "epoch": 3.258788530360378, "grad_norm": 1.2151141166687012, "learning_rate": 7.744490335587164e-06, "loss": 0.0498, "step": 221275 }, { "epoch": 3.259156713450465, "grad_norm": 1.2879892587661743, "learning_rate": 7.74285396362018e-06, "loss": 0.0567, "step": 221300 }, { "epoch": 3.259524896540552, "grad_norm": 0.9492918252944946, "learning_rate": 7.741217591653194e-06, "loss": 0.0536, "step": 221325 }, { "epoch": 3.259893079630639, "grad_norm": 0.952765941619873, "learning_rate": 7.73958121968621e-06, "loss": 0.0495, "step": 221350 }, { "epoch": 3.260261262720726, "grad_norm": 1.2155275344848633, "learning_rate": 7.737944847719225e-06, "loss": 0.06, "step": 221375 }, { "epoch": 3.260629445810813, "grad_norm": 0.9193699359893799, "learning_rate": 7.736308475752241e-06, "loss": 0.0503, "step": 221400 }, { "epoch": 3.2609976289009, "grad_norm": 1.2600046396255493, "learning_rate": 7.734672103785257e-06, "loss": 0.053, "step": 221425 }, { "epoch": 3.261365811990987, "grad_norm": 1.485835075378418, "learning_rate": 7.733035731818272e-06, "loss": 0.0533, "step": 221450 }, { "epoch": 3.261733995081074, "grad_norm": 1.4178886413574219, "learning_rate": 7.731399359851288e-06, "loss": 0.0627, "step": 221475 }, { "epoch": 3.262102178171161, "grad_norm": 1.5273215770721436, "learning_rate": 7.729762987884302e-06, "loss": 0.053, "step": 221500 }, { "epoch": 3.262470361261248, "grad_norm": 1.0496892929077148, "learning_rate": 7.728126615917318e-06, "loss": 0.0589, "step": 221525 }, { "epoch": 3.262838544351335, "grad_norm": 1.0900356769561768, "learning_rate": 7.726555698829012e-06, "loss": 0.0566, "step": 221550 }, { "epoch": 3.263206727441422, "grad_norm": 1.4395400285720825, "learning_rate": 7.724919326862028e-06, "loss": 0.0497, "step": 221575 }, { "epoch": 3.263574910531509, "grad_norm": 1.1687188148498535, "learning_rate": 7.723282954895044e-06, "loss": 0.0484, "step": 221600 }, { "epoch": 3.263943093621596, "grad_norm": 1.5220705270767212, "learning_rate": 7.721646582928059e-06, "loss": 0.0531, "step": 221625 }, { "epoch": 3.264311276711683, "grad_norm": 1.4611964225769043, "learning_rate": 7.720010210961073e-06, "loss": 0.0574, "step": 221650 }, { "epoch": 3.26467945980177, "grad_norm": 1.3516196012496948, "learning_rate": 7.71837383899409e-06, "loss": 0.0591, "step": 221675 }, { "epoch": 3.265047642891857, "grad_norm": 1.595139503479004, "learning_rate": 7.716737467027106e-06, "loss": 0.0592, "step": 221700 }, { "epoch": 3.265415825981944, "grad_norm": 1.6821563243865967, "learning_rate": 7.715101095060122e-06, "loss": 0.0538, "step": 221725 }, { "epoch": 3.265784009072031, "grad_norm": 1.530360221862793, "learning_rate": 7.713464723093136e-06, "loss": 0.0529, "step": 221750 }, { "epoch": 3.266152192162118, "grad_norm": 1.4588305950164795, "learning_rate": 7.71182835112615e-06, "loss": 0.0556, "step": 221775 }, { "epoch": 3.266520375252205, "grad_norm": 1.5564113855361938, "learning_rate": 7.710191979159167e-06, "loss": 0.0526, "step": 221800 }, { "epoch": 3.2668885583422926, "grad_norm": 1.1926299333572388, "learning_rate": 7.708555607192183e-06, "loss": 0.0548, "step": 221825 }, { "epoch": 3.2672567414323797, "grad_norm": 0.8720672726631165, "learning_rate": 7.7069192352252e-06, "loss": 0.0579, "step": 221850 }, { "epoch": 3.2676249245224667, "grad_norm": 1.4807361364364624, "learning_rate": 7.705282863258214e-06, "loss": 0.0551, "step": 221875 }, { "epoch": 3.2679931076125537, "grad_norm": 0.8349344730377197, "learning_rate": 7.703646491291228e-06, "loss": 0.0611, "step": 221900 }, { "epoch": 3.2683612907026407, "grad_norm": 1.3450521230697632, "learning_rate": 7.702010119324244e-06, "loss": 0.0596, "step": 221925 }, { "epoch": 3.2687294737927277, "grad_norm": 1.3241890668869019, "learning_rate": 7.70037374735726e-06, "loss": 0.0576, "step": 221950 }, { "epoch": 3.2690976568828147, "grad_norm": 1.565602421760559, "learning_rate": 7.698737375390275e-06, "loss": 0.0546, "step": 221975 }, { "epoch": 3.2694658399729017, "grad_norm": 0.6321175694465637, "learning_rate": 7.697101003423291e-06, "loss": 0.0529, "step": 222000 }, { "epoch": 3.2698340230629888, "grad_norm": 0.8589783906936646, "learning_rate": 7.695464631456307e-06, "loss": 0.0505, "step": 222025 }, { "epoch": 3.2702022061530758, "grad_norm": 1.4351216554641724, "learning_rate": 7.693828259489322e-06, "loss": 0.0538, "step": 222050 }, { "epoch": 3.270570389243163, "grad_norm": 1.1147154569625854, "learning_rate": 7.692191887522336e-06, "loss": 0.0557, "step": 222075 }, { "epoch": 3.27093857233325, "grad_norm": 1.3109734058380127, "learning_rate": 7.690555515555352e-06, "loss": 0.0634, "step": 222100 }, { "epoch": 3.271306755423337, "grad_norm": 1.5449657440185547, "learning_rate": 7.688919143588368e-06, "loss": 0.05, "step": 222125 }, { "epoch": 3.271674938513424, "grad_norm": 1.3285760879516602, "learning_rate": 7.687282771621385e-06, "loss": 0.0539, "step": 222150 }, { "epoch": 3.272043121603511, "grad_norm": 1.5189287662506104, "learning_rate": 7.685646399654399e-06, "loss": 0.0515, "step": 222175 }, { "epoch": 3.272411304693598, "grad_norm": 0.8532837629318237, "learning_rate": 7.684010027687413e-06, "loss": 0.0499, "step": 222200 }, { "epoch": 3.272779487783685, "grad_norm": 1.6627578735351562, "learning_rate": 7.68237365572043e-06, "loss": 0.0526, "step": 222225 }, { "epoch": 3.2731476708737723, "grad_norm": 1.2204079627990723, "learning_rate": 7.680737283753446e-06, "loss": 0.0464, "step": 222250 }, { "epoch": 3.2735158539638594, "grad_norm": 1.2260034084320068, "learning_rate": 7.67910091178646e-06, "loss": 0.0608, "step": 222275 }, { "epoch": 3.2738840370539464, "grad_norm": 1.2964593172073364, "learning_rate": 7.677464539819476e-06, "loss": 0.051, "step": 222300 }, { "epoch": 3.2742522201440334, "grad_norm": 1.4139735698699951, "learning_rate": 7.675828167852491e-06, "loss": 0.0528, "step": 222325 }, { "epoch": 3.2746204032341204, "grad_norm": 0.9862179160118103, "learning_rate": 7.674191795885507e-06, "loss": 0.0529, "step": 222350 }, { "epoch": 3.2749885863242074, "grad_norm": 1.6559228897094727, "learning_rate": 7.672555423918523e-06, "loss": 0.0565, "step": 222375 }, { "epoch": 3.2753567694142944, "grad_norm": 1.298337697982788, "learning_rate": 7.670919051951538e-06, "loss": 0.0555, "step": 222400 }, { "epoch": 3.2757249525043814, "grad_norm": 1.1173251867294312, "learning_rate": 7.669282679984554e-06, "loss": 0.052, "step": 222425 }, { "epoch": 3.2760931355944685, "grad_norm": 1.8743599653244019, "learning_rate": 7.667646308017568e-06, "loss": 0.0544, "step": 222450 }, { "epoch": 3.2764613186845555, "grad_norm": 1.701632022857666, "learning_rate": 7.666009936050584e-06, "loss": 0.0546, "step": 222475 }, { "epoch": 3.2768295017746425, "grad_norm": 1.3790738582611084, "learning_rate": 7.664373564083599e-06, "loss": 0.0542, "step": 222500 }, { "epoch": 3.2771976848647295, "grad_norm": 1.1943485736846924, "learning_rate": 7.662737192116615e-06, "loss": 0.0577, "step": 222525 }, { "epoch": 3.2775658679548165, "grad_norm": 1.142665147781372, "learning_rate": 7.661100820149631e-06, "loss": 0.0598, "step": 222550 }, { "epoch": 3.2779340510449035, "grad_norm": 1.531950831413269, "learning_rate": 7.659464448182646e-06, "loss": 0.0549, "step": 222575 }, { "epoch": 3.2783022341349906, "grad_norm": 1.0448013544082642, "learning_rate": 7.657828076215662e-06, "loss": 0.0558, "step": 222600 }, { "epoch": 3.2786704172250776, "grad_norm": 1.3565757274627686, "learning_rate": 7.656191704248676e-06, "loss": 0.0508, "step": 222625 }, { "epoch": 3.2790386003151646, "grad_norm": 0.9218998551368713, "learning_rate": 7.654555332281692e-06, "loss": 0.054, "step": 222650 }, { "epoch": 3.2794067834052516, "grad_norm": 1.3896846771240234, "learning_rate": 7.652918960314709e-06, "loss": 0.0561, "step": 222675 }, { "epoch": 3.2797749664953386, "grad_norm": 1.2601679563522339, "learning_rate": 7.651282588347723e-06, "loss": 0.0566, "step": 222700 }, { "epoch": 3.2801431495854256, "grad_norm": 1.8283239603042603, "learning_rate": 7.649646216380739e-06, "loss": 0.0537, "step": 222725 }, { "epoch": 3.2805113326755126, "grad_norm": 1.262842059135437, "learning_rate": 7.648009844413754e-06, "loss": 0.0514, "step": 222750 }, { "epoch": 3.2808795157656, "grad_norm": 1.830797791481018, "learning_rate": 7.64637347244677e-06, "loss": 0.0597, "step": 222775 }, { "epoch": 3.281247698855687, "grad_norm": 1.9591783285140991, "learning_rate": 7.644737100479784e-06, "loss": 0.047, "step": 222800 }, { "epoch": 3.281615881945774, "grad_norm": 1.268332600593567, "learning_rate": 7.6431007285128e-06, "loss": 0.0482, "step": 222825 }, { "epoch": 3.281984065035861, "grad_norm": 1.3524540662765503, "learning_rate": 7.641464356545816e-06, "loss": 0.05, "step": 222850 }, { "epoch": 3.282352248125948, "grad_norm": 0.8394463062286377, "learning_rate": 7.639827984578831e-06, "loss": 0.05, "step": 222875 }, { "epoch": 3.282720431216035, "grad_norm": 0.9698912501335144, "learning_rate": 7.638191612611847e-06, "loss": 0.0587, "step": 222900 }, { "epoch": 3.283088614306122, "grad_norm": 1.1424485445022583, "learning_rate": 7.636555240644862e-06, "loss": 0.0528, "step": 222925 }, { "epoch": 3.283456797396209, "grad_norm": 1.3660345077514648, "learning_rate": 7.634918868677878e-06, "loss": 0.0519, "step": 222950 }, { "epoch": 3.2838249804862962, "grad_norm": 1.464294195175171, "learning_rate": 7.633282496710894e-06, "loss": 0.0526, "step": 222975 }, { "epoch": 3.2841931635763832, "grad_norm": 1.4236983060836792, "learning_rate": 7.631646124743908e-06, "loss": 0.0545, "step": 223000 }, { "epoch": 3.2845613466664703, "grad_norm": 1.4798903465270996, "learning_rate": 7.630009752776923e-06, "loss": 0.0556, "step": 223025 }, { "epoch": 3.2849295297565573, "grad_norm": 1.7093205451965332, "learning_rate": 7.628373380809939e-06, "loss": 0.0548, "step": 223050 }, { "epoch": 3.2852977128466443, "grad_norm": 1.6315637826919556, "learning_rate": 7.626737008842955e-06, "loss": 0.0522, "step": 223075 }, { "epoch": 3.2856658959367313, "grad_norm": 1.3786839246749878, "learning_rate": 7.62510063687597e-06, "loss": 0.0552, "step": 223100 }, { "epoch": 3.2860340790268183, "grad_norm": 1.337576985359192, "learning_rate": 7.623464264908985e-06, "loss": 0.0552, "step": 223125 }, { "epoch": 3.2864022621169053, "grad_norm": 1.6407849788665771, "learning_rate": 7.621827892942001e-06, "loss": 0.0538, "step": 223150 }, { "epoch": 3.2867704452069924, "grad_norm": 1.7331223487854004, "learning_rate": 7.620191520975016e-06, "loss": 0.0566, "step": 223175 }, { "epoch": 3.28713862829708, "grad_norm": 1.4217900037765503, "learning_rate": 7.6185551490080324e-06, "loss": 0.0609, "step": 223200 }, { "epoch": 3.287506811387167, "grad_norm": 1.0790061950683594, "learning_rate": 7.616918777041047e-06, "loss": 0.0574, "step": 223225 }, { "epoch": 3.287874994477254, "grad_norm": 1.3927891254425049, "learning_rate": 7.615282405074062e-06, "loss": 0.0535, "step": 223250 }, { "epoch": 3.288243177567341, "grad_norm": 1.0607072114944458, "learning_rate": 7.613646033107078e-06, "loss": 0.0543, "step": 223275 }, { "epoch": 3.288611360657428, "grad_norm": 1.1236480474472046, "learning_rate": 7.612009661140094e-06, "loss": 0.0567, "step": 223300 }, { "epoch": 3.288979543747515, "grad_norm": 1.216027855873108, "learning_rate": 7.61037328917311e-06, "loss": 0.0479, "step": 223325 }, { "epoch": 3.289347726837602, "grad_norm": 0.8837573528289795, "learning_rate": 7.608736917206124e-06, "loss": 0.0547, "step": 223350 }, { "epoch": 3.289715909927689, "grad_norm": 1.6418942213058472, "learning_rate": 7.60710054523914e-06, "loss": 0.0575, "step": 223375 }, { "epoch": 3.290084093017776, "grad_norm": 1.8690115213394165, "learning_rate": 7.605464173272156e-06, "loss": 0.0563, "step": 223400 }, { "epoch": 3.290452276107863, "grad_norm": 1.184334635734558, "learning_rate": 7.603827801305171e-06, "loss": 0.0504, "step": 223425 }, { "epoch": 3.29082045919795, "grad_norm": 1.2403236627578735, "learning_rate": 7.602191429338186e-06, "loss": 0.0543, "step": 223450 }, { "epoch": 3.291188642288037, "grad_norm": 1.2870265245437622, "learning_rate": 7.600555057371202e-06, "loss": 0.0514, "step": 223475 }, { "epoch": 3.291556825378124, "grad_norm": 1.1583762168884277, "learning_rate": 7.598918685404218e-06, "loss": 0.0509, "step": 223500 }, { "epoch": 3.291925008468211, "grad_norm": 1.5692108869552612, "learning_rate": 7.597282313437233e-06, "loss": 0.0639, "step": 223525 }, { "epoch": 3.292293191558298, "grad_norm": 1.4680817127227783, "learning_rate": 7.5956459414702476e-06, "loss": 0.0563, "step": 223550 }, { "epoch": 3.292661374648385, "grad_norm": 0.9752763509750366, "learning_rate": 7.594009569503264e-06, "loss": 0.0619, "step": 223575 }, { "epoch": 3.293029557738472, "grad_norm": 0.8564492464065552, "learning_rate": 7.592373197536279e-06, "loss": 0.0571, "step": 223600 }, { "epoch": 3.293397740828559, "grad_norm": 1.0177819728851318, "learning_rate": 7.590736825569295e-06, "loss": 0.0602, "step": 223625 }, { "epoch": 3.293765923918646, "grad_norm": 1.4920368194580078, "learning_rate": 7.58910045360231e-06, "loss": 0.0609, "step": 223650 }, { "epoch": 3.294134107008733, "grad_norm": 1.2452385425567627, "learning_rate": 7.587464081635325e-06, "loss": 0.0541, "step": 223675 }, { "epoch": 3.29450229009882, "grad_norm": 1.4523124694824219, "learning_rate": 7.585827709668341e-06, "loss": 0.0534, "step": 223700 }, { "epoch": 3.294870473188907, "grad_norm": 1.3547228574752808, "learning_rate": 7.584191337701356e-06, "loss": 0.053, "step": 223725 }, { "epoch": 3.2952386562789946, "grad_norm": 1.2725343704223633, "learning_rate": 7.582554965734371e-06, "loss": 0.048, "step": 223750 }, { "epoch": 3.2956068393690816, "grad_norm": 1.9595698118209839, "learning_rate": 7.580918593767387e-06, "loss": 0.0587, "step": 223775 }, { "epoch": 3.2959750224591686, "grad_norm": 1.6414002180099487, "learning_rate": 7.579282221800402e-06, "loss": 0.0557, "step": 223800 }, { "epoch": 3.2963432055492556, "grad_norm": 1.3708178997039795, "learning_rate": 7.5776458498334184e-06, "loss": 0.0546, "step": 223825 }, { "epoch": 3.2967113886393427, "grad_norm": 1.808488130569458, "learning_rate": 7.576009477866434e-06, "loss": 0.0584, "step": 223850 }, { "epoch": 3.2970795717294297, "grad_norm": 1.3050129413604736, "learning_rate": 7.574373105899448e-06, "loss": 0.0456, "step": 223875 }, { "epoch": 3.2974477548195167, "grad_norm": 0.8597230315208435, "learning_rate": 7.572736733932464e-06, "loss": 0.0521, "step": 223900 }, { "epoch": 3.2978159379096037, "grad_norm": 1.233884334564209, "learning_rate": 7.57110036196548e-06, "loss": 0.055, "step": 223925 }, { "epoch": 3.2981841209996907, "grad_norm": 1.0324082374572754, "learning_rate": 7.569463989998496e-06, "loss": 0.0531, "step": 223950 }, { "epoch": 3.2985523040897777, "grad_norm": 1.320130705833435, "learning_rate": 7.56782761803151e-06, "loss": 0.0556, "step": 223975 }, { "epoch": 3.2989204871798647, "grad_norm": 1.1132978200912476, "learning_rate": 7.566191246064526e-06, "loss": 0.0532, "step": 224000 }, { "epoch": 3.2992886702699518, "grad_norm": 0.8644164800643921, "learning_rate": 7.564554874097542e-06, "loss": 0.0527, "step": 224025 }, { "epoch": 3.2996568533600388, "grad_norm": 1.2989580631256104, "learning_rate": 7.562918502130557e-06, "loss": 0.0531, "step": 224050 }, { "epoch": 3.300025036450126, "grad_norm": 1.2543586492538452, "learning_rate": 7.561282130163572e-06, "loss": 0.0528, "step": 224075 }, { "epoch": 3.300393219540213, "grad_norm": 0.7723637819290161, "learning_rate": 7.559645758196588e-06, "loss": 0.0503, "step": 224100 }, { "epoch": 3.3007614026303, "grad_norm": 1.3822739124298096, "learning_rate": 7.558009386229603e-06, "loss": 0.0564, "step": 224125 }, { "epoch": 3.3011295857203873, "grad_norm": 1.8552175760269165, "learning_rate": 7.556373014262619e-06, "loss": 0.0587, "step": 224150 }, { "epoch": 3.3014977688104743, "grad_norm": 1.2169073820114136, "learning_rate": 7.5547366422956336e-06, "loss": 0.0583, "step": 224175 }, { "epoch": 3.3018659519005613, "grad_norm": 1.2105604410171509, "learning_rate": 7.55310027032865e-06, "loss": 0.0534, "step": 224200 }, { "epoch": 3.3022341349906483, "grad_norm": 0.9494795203208923, "learning_rate": 7.551463898361665e-06, "loss": 0.0528, "step": 224225 }, { "epoch": 3.3026023180807353, "grad_norm": 1.4743846654891968, "learning_rate": 7.549827526394681e-06, "loss": 0.0533, "step": 224250 }, { "epoch": 3.3029705011708224, "grad_norm": 1.7627805471420288, "learning_rate": 7.548191154427696e-06, "loss": 0.0564, "step": 224275 }, { "epoch": 3.3033386842609094, "grad_norm": 1.4553340673446655, "learning_rate": 7.546554782460711e-06, "loss": 0.0478, "step": 224300 }, { "epoch": 3.3037068673509964, "grad_norm": 1.1054805517196655, "learning_rate": 7.544918410493727e-06, "loss": 0.0603, "step": 224325 }, { "epoch": 3.3040750504410834, "grad_norm": 1.101176142692566, "learning_rate": 7.543282038526742e-06, "loss": 0.0599, "step": 224350 }, { "epoch": 3.3044432335311704, "grad_norm": 1.6016474962234497, "learning_rate": 7.5416456665597585e-06, "loss": 0.0506, "step": 224375 }, { "epoch": 3.3048114166212574, "grad_norm": 1.2569962739944458, "learning_rate": 7.540009294592773e-06, "loss": 0.052, "step": 224400 }, { "epoch": 3.3051795997113445, "grad_norm": 1.3738640546798706, "learning_rate": 7.538372922625788e-06, "loss": 0.0489, "step": 224425 }, { "epoch": 3.3055477828014315, "grad_norm": 0.9245179295539856, "learning_rate": 7.5367365506588044e-06, "loss": 0.0504, "step": 224450 }, { "epoch": 3.3059159658915185, "grad_norm": 1.9862191677093506, "learning_rate": 7.53510017869182e-06, "loss": 0.0522, "step": 224475 }, { "epoch": 3.3062841489816055, "grad_norm": 1.3871967792510986, "learning_rate": 7.533463806724834e-06, "loss": 0.0539, "step": 224500 }, { "epoch": 3.3066523320716925, "grad_norm": 1.0422518253326416, "learning_rate": 7.53182743475785e-06, "loss": 0.0535, "step": 224525 }, { "epoch": 3.3070205151617795, "grad_norm": 0.7369499802589417, "learning_rate": 7.530191062790866e-06, "loss": 0.0526, "step": 224550 }, { "epoch": 3.3073886982518665, "grad_norm": 1.549147605895996, "learning_rate": 7.528554690823882e-06, "loss": 0.0554, "step": 224575 }, { "epoch": 3.3077568813419536, "grad_norm": 1.5977694988250732, "learning_rate": 7.526918318856896e-06, "loss": 0.0555, "step": 224600 }, { "epoch": 3.3081250644320406, "grad_norm": 1.2408370971679688, "learning_rate": 7.525281946889912e-06, "loss": 0.0538, "step": 224625 }, { "epoch": 3.3084932475221276, "grad_norm": 1.279453992843628, "learning_rate": 7.523645574922928e-06, "loss": 0.051, "step": 224650 }, { "epoch": 3.3088614306122146, "grad_norm": 1.4527429342269897, "learning_rate": 7.522009202955943e-06, "loss": 0.0575, "step": 224675 }, { "epoch": 3.309229613702302, "grad_norm": 1.3381928205490112, "learning_rate": 7.5203728309889575e-06, "loss": 0.0541, "step": 224700 }, { "epoch": 3.309597796792389, "grad_norm": 0.5282140374183655, "learning_rate": 7.518736459021974e-06, "loss": 0.0549, "step": 224725 }, { "epoch": 3.309965979882476, "grad_norm": 1.9055311679840088, "learning_rate": 7.517100087054989e-06, "loss": 0.0529, "step": 224750 }, { "epoch": 3.310334162972563, "grad_norm": 0.9043813347816467, "learning_rate": 7.515463715088005e-06, "loss": 0.0584, "step": 224775 }, { "epoch": 3.31070234606265, "grad_norm": 1.080385446548462, "learning_rate": 7.51382734312102e-06, "loss": 0.0663, "step": 224800 }, { "epoch": 3.311070529152737, "grad_norm": 1.3524212837219238, "learning_rate": 7.512190971154036e-06, "loss": 0.0558, "step": 224825 }, { "epoch": 3.311438712242824, "grad_norm": 1.1148847341537476, "learning_rate": 7.510554599187051e-06, "loss": 0.0569, "step": 224850 }, { "epoch": 3.311806895332911, "grad_norm": 1.5092246532440186, "learning_rate": 7.508918227220066e-06, "loss": 0.0605, "step": 224875 }, { "epoch": 3.312175078422998, "grad_norm": 1.1484147310256958, "learning_rate": 7.5072818552530825e-06, "loss": 0.0458, "step": 224900 }, { "epoch": 3.312543261513085, "grad_norm": 1.0924971103668213, "learning_rate": 7.505645483286097e-06, "loss": 0.0524, "step": 224925 }, { "epoch": 3.312911444603172, "grad_norm": 1.2831860780715942, "learning_rate": 7.504009111319113e-06, "loss": 0.056, "step": 224950 }, { "epoch": 3.3132796276932592, "grad_norm": 1.0059807300567627, "learning_rate": 7.502372739352128e-06, "loss": 0.0485, "step": 224975 }, { "epoch": 3.3136478107833462, "grad_norm": 1.245866060256958, "learning_rate": 7.500736367385144e-06, "loss": 0.0537, "step": 225000 }, { "epoch": 3.3140159938734333, "grad_norm": 0.3531804084777832, "learning_rate": 7.499099995418159e-06, "loss": 0.0534, "step": 225025 }, { "epoch": 3.3143841769635203, "grad_norm": 1.1652743816375732, "learning_rate": 7.497463623451174e-06, "loss": 0.0563, "step": 225050 }, { "epoch": 3.3147523600536073, "grad_norm": 0.6103206276893616, "learning_rate": 7.4958272514841904e-06, "loss": 0.0512, "step": 225075 }, { "epoch": 3.3151205431436943, "grad_norm": 0.5627954006195068, "learning_rate": 7.494190879517206e-06, "loss": 0.0495, "step": 225100 }, { "epoch": 3.3154887262337818, "grad_norm": 1.6367796659469604, "learning_rate": 7.49255450755022e-06, "loss": 0.0554, "step": 225125 }, { "epoch": 3.315856909323869, "grad_norm": 1.0813084840774536, "learning_rate": 7.490918135583236e-06, "loss": 0.0508, "step": 225150 }, { "epoch": 3.316225092413956, "grad_norm": 0.8711093068122864, "learning_rate": 7.489281763616252e-06, "loss": 0.0558, "step": 225175 }, { "epoch": 3.316593275504043, "grad_norm": 1.6254273653030396, "learning_rate": 7.487645391649268e-06, "loss": 0.0511, "step": 225200 }, { "epoch": 3.31696145859413, "grad_norm": 1.4052600860595703, "learning_rate": 7.486009019682282e-06, "loss": 0.0543, "step": 225225 }, { "epoch": 3.317329641684217, "grad_norm": 1.354114294052124, "learning_rate": 7.4843726477152976e-06, "loss": 0.0582, "step": 225250 }, { "epoch": 3.317697824774304, "grad_norm": 1.7040506601333618, "learning_rate": 7.482736275748314e-06, "loss": 0.0528, "step": 225275 }, { "epoch": 3.318066007864391, "grad_norm": 1.3592195510864258, "learning_rate": 7.481099903781329e-06, "loss": 0.0515, "step": 225300 }, { "epoch": 3.318434190954478, "grad_norm": 1.500755786895752, "learning_rate": 7.479463531814345e-06, "loss": 0.0506, "step": 225325 }, { "epoch": 3.318802374044565, "grad_norm": 1.3757877349853516, "learning_rate": 7.47782715984736e-06, "loss": 0.0528, "step": 225350 }, { "epoch": 3.319170557134652, "grad_norm": 1.2323228120803833, "learning_rate": 7.476190787880375e-06, "loss": 0.0517, "step": 225375 }, { "epoch": 3.319538740224739, "grad_norm": 1.5338979959487915, "learning_rate": 7.474554415913391e-06, "loss": 0.0597, "step": 225400 }, { "epoch": 3.319906923314826, "grad_norm": 1.4789302349090576, "learning_rate": 7.472918043946406e-06, "loss": 0.0587, "step": 225425 }, { "epoch": 3.320275106404913, "grad_norm": 0.9365647435188293, "learning_rate": 7.471281671979421e-06, "loss": 0.0507, "step": 225450 }, { "epoch": 3.320643289495, "grad_norm": 1.2973297834396362, "learning_rate": 7.469645300012437e-06, "loss": 0.0533, "step": 225475 }, { "epoch": 3.321011472585087, "grad_norm": 1.6962288618087769, "learning_rate": 7.468008928045452e-06, "loss": 0.0508, "step": 225500 }, { "epoch": 3.321379655675174, "grad_norm": 1.545454978942871, "learning_rate": 7.4663725560784685e-06, "loss": 0.0553, "step": 225525 }, { "epoch": 3.321747838765261, "grad_norm": 1.3650883436203003, "learning_rate": 7.464801638990162e-06, "loss": 0.0533, "step": 225550 }, { "epoch": 3.322116021855348, "grad_norm": 1.0064499378204346, "learning_rate": 7.463165267023178e-06, "loss": 0.0446, "step": 225575 }, { "epoch": 3.322484204945435, "grad_norm": 1.7375867366790771, "learning_rate": 7.461528895056194e-06, "loss": 0.0571, "step": 225600 }, { "epoch": 3.322852388035522, "grad_norm": 1.4480394124984741, "learning_rate": 7.459892523089209e-06, "loss": 0.0604, "step": 225625 }, { "epoch": 3.3232205711256095, "grad_norm": 1.0826350450515747, "learning_rate": 7.458256151122224e-06, "loss": 0.0596, "step": 225650 }, { "epoch": 3.3235887542156966, "grad_norm": 1.8368470668792725, "learning_rate": 7.45661977915524e-06, "loss": 0.0588, "step": 225675 }, { "epoch": 3.3239569373057836, "grad_norm": 1.5385063886642456, "learning_rate": 7.454983407188255e-06, "loss": 0.052, "step": 225700 }, { "epoch": 3.3243251203958706, "grad_norm": 1.1620228290557861, "learning_rate": 7.453347035221271e-06, "loss": 0.0578, "step": 225725 }, { "epoch": 3.3246933034859576, "grad_norm": 0.9635811448097229, "learning_rate": 7.451710663254286e-06, "loss": 0.0536, "step": 225750 }, { "epoch": 3.3250614865760446, "grad_norm": 1.5155373811721802, "learning_rate": 7.450074291287302e-06, "loss": 0.0568, "step": 225775 }, { "epoch": 3.3254296696661316, "grad_norm": 1.4668232202529907, "learning_rate": 7.448437919320317e-06, "loss": 0.0574, "step": 225800 }, { "epoch": 3.3257978527562186, "grad_norm": 1.0898630619049072, "learning_rate": 7.446801547353333e-06, "loss": 0.0559, "step": 225825 }, { "epoch": 3.3261660358463057, "grad_norm": 1.5009483098983765, "learning_rate": 7.4451651753863485e-06, "loss": 0.0578, "step": 225850 }, { "epoch": 3.3265342189363927, "grad_norm": 1.4300683736801147, "learning_rate": 7.443528803419363e-06, "loss": 0.0567, "step": 225875 }, { "epoch": 3.3269024020264797, "grad_norm": 1.7828330993652344, "learning_rate": 7.441892431452379e-06, "loss": 0.0556, "step": 225900 }, { "epoch": 3.3272705851165667, "grad_norm": 1.6853739023208618, "learning_rate": 7.4402560594853945e-06, "loss": 0.0485, "step": 225925 }, { "epoch": 3.3276387682066537, "grad_norm": 1.880748987197876, "learning_rate": 7.438619687518411e-06, "loss": 0.0519, "step": 225950 }, { "epoch": 3.3280069512967407, "grad_norm": 1.1433602571487427, "learning_rate": 7.436983315551425e-06, "loss": 0.0534, "step": 225975 }, { "epoch": 3.3283751343868277, "grad_norm": 1.6150449514389038, "learning_rate": 7.43534694358444e-06, "loss": 0.0561, "step": 226000 }, { "epoch": 3.3287433174769148, "grad_norm": 1.256623387336731, "learning_rate": 7.4337105716174565e-06, "loss": 0.0546, "step": 226025 }, { "epoch": 3.329111500567002, "grad_norm": 1.558573603630066, "learning_rate": 7.432074199650472e-06, "loss": 0.0546, "step": 226050 }, { "epoch": 3.3294796836570892, "grad_norm": 1.0619813203811646, "learning_rate": 7.430437827683486e-06, "loss": 0.0486, "step": 226075 }, { "epoch": 3.3298478667471763, "grad_norm": 1.3027280569076538, "learning_rate": 7.4288014557165024e-06, "loss": 0.0578, "step": 226100 }, { "epoch": 3.3302160498372633, "grad_norm": 1.3097243309020996, "learning_rate": 7.427165083749518e-06, "loss": 0.0589, "step": 226125 }, { "epoch": 3.3305842329273503, "grad_norm": 1.410872220993042, "learning_rate": 7.425528711782534e-06, "loss": 0.0492, "step": 226150 }, { "epoch": 3.3309524160174373, "grad_norm": 1.0632275342941284, "learning_rate": 7.423892339815548e-06, "loss": 0.0482, "step": 226175 }, { "epoch": 3.3313205991075243, "grad_norm": 1.3405839204788208, "learning_rate": 7.422255967848564e-06, "loss": 0.0573, "step": 226200 }, { "epoch": 3.3316887821976113, "grad_norm": 1.5575217008590698, "learning_rate": 7.42061959588158e-06, "loss": 0.0601, "step": 226225 }, { "epoch": 3.3320569652876983, "grad_norm": 1.3362696170806885, "learning_rate": 7.418983223914595e-06, "loss": 0.0546, "step": 226250 }, { "epoch": 3.3324251483777854, "grad_norm": 1.7967804670333862, "learning_rate": 7.417346851947611e-06, "loss": 0.0626, "step": 226275 }, { "epoch": 3.3327933314678724, "grad_norm": 1.905088186264038, "learning_rate": 7.415710479980626e-06, "loss": 0.0691, "step": 226300 }, { "epoch": 3.3331615145579594, "grad_norm": 1.680367112159729, "learning_rate": 7.414074108013641e-06, "loss": 0.0545, "step": 226325 }, { "epoch": 3.3335296976480464, "grad_norm": 1.2259553670883179, "learning_rate": 7.412437736046657e-06, "loss": 0.0588, "step": 226350 }, { "epoch": 3.3338978807381334, "grad_norm": 1.1680890321731567, "learning_rate": 7.4108013640796725e-06, "loss": 0.054, "step": 226375 }, { "epoch": 3.3342660638282204, "grad_norm": 1.0716922283172607, "learning_rate": 7.409164992112688e-06, "loss": 0.0569, "step": 226400 }, { "epoch": 3.3346342469183075, "grad_norm": 1.0347439050674438, "learning_rate": 7.407528620145703e-06, "loss": 0.0546, "step": 226425 }, { "epoch": 3.3350024300083945, "grad_norm": 1.017482876777649, "learning_rate": 7.405892248178718e-06, "loss": 0.0581, "step": 226450 }, { "epoch": 3.3353706130984815, "grad_norm": 1.238437533378601, "learning_rate": 7.4042558762117345e-06, "loss": 0.0472, "step": 226475 }, { "epoch": 3.3357387961885685, "grad_norm": 1.1666393280029297, "learning_rate": 7.402619504244749e-06, "loss": 0.0502, "step": 226500 }, { "epoch": 3.3361069792786555, "grad_norm": 1.5748651027679443, "learning_rate": 7.400983132277765e-06, "loss": 0.0515, "step": 226525 }, { "epoch": 3.3364751623687425, "grad_norm": 1.6308422088623047, "learning_rate": 7.3993467603107805e-06, "loss": 0.056, "step": 226550 }, { "epoch": 3.3368433454588295, "grad_norm": 1.0964595079421997, "learning_rate": 7.397710388343796e-06, "loss": 0.0559, "step": 226575 }, { "epoch": 3.3372115285489166, "grad_norm": 1.0749626159667969, "learning_rate": 7.396074016376811e-06, "loss": 0.0528, "step": 226600 }, { "epoch": 3.337579711639004, "grad_norm": 1.2637847661972046, "learning_rate": 7.394437644409826e-06, "loss": 0.0524, "step": 226625 }, { "epoch": 3.337947894729091, "grad_norm": 1.138476014137268, "learning_rate": 7.3928012724428425e-06, "loss": 0.0512, "step": 226650 }, { "epoch": 3.338316077819178, "grad_norm": 1.6251795291900635, "learning_rate": 7.391164900475858e-06, "loss": 0.0581, "step": 226675 }, { "epoch": 3.338684260909265, "grad_norm": 1.6297962665557861, "learning_rate": 7.389528528508872e-06, "loss": 0.0478, "step": 226700 }, { "epoch": 3.339052443999352, "grad_norm": 1.0838072299957275, "learning_rate": 7.387957611420568e-06, "loss": 0.0564, "step": 226725 }, { "epoch": 3.339420627089439, "grad_norm": 1.4168065786361694, "learning_rate": 7.386321239453583e-06, "loss": 0.0536, "step": 226750 }, { "epoch": 3.339788810179526, "grad_norm": 1.5672047138214111, "learning_rate": 7.384684867486599e-06, "loss": 0.052, "step": 226775 }, { "epoch": 3.340156993269613, "grad_norm": 1.6075565814971924, "learning_rate": 7.383048495519615e-06, "loss": 0.0557, "step": 226800 }, { "epoch": 3.3405251763597, "grad_norm": 1.1818798780441284, "learning_rate": 7.381412123552629e-06, "loss": 0.061, "step": 226825 }, { "epoch": 3.340893359449787, "grad_norm": 1.5154752731323242, "learning_rate": 7.379775751585645e-06, "loss": 0.0622, "step": 226850 }, { "epoch": 3.341261542539874, "grad_norm": 1.3670189380645752, "learning_rate": 7.3781393796186605e-06, "loss": 0.0515, "step": 226875 }, { "epoch": 3.341629725629961, "grad_norm": 1.474528193473816, "learning_rate": 7.376503007651677e-06, "loss": 0.0562, "step": 226900 }, { "epoch": 3.341997908720048, "grad_norm": 1.8051512241363525, "learning_rate": 7.374866635684691e-06, "loss": 0.0547, "step": 226925 }, { "epoch": 3.342366091810135, "grad_norm": 1.8945709466934204, "learning_rate": 7.3732302637177064e-06, "loss": 0.058, "step": 226950 }, { "epoch": 3.3427342749002222, "grad_norm": 1.313443660736084, "learning_rate": 7.371593891750723e-06, "loss": 0.0502, "step": 226975 }, { "epoch": 3.3431024579903093, "grad_norm": 1.4861992597579956, "learning_rate": 7.369957519783738e-06, "loss": 0.0553, "step": 227000 }, { "epoch": 3.3434706410803963, "grad_norm": 1.0212277173995972, "learning_rate": 7.368321147816752e-06, "loss": 0.0526, "step": 227025 }, { "epoch": 3.3438388241704837, "grad_norm": 1.5201823711395264, "learning_rate": 7.3666847758497685e-06, "loss": 0.0617, "step": 227050 }, { "epoch": 3.3442070072605707, "grad_norm": 0.9945889711380005, "learning_rate": 7.365048403882784e-06, "loss": 0.0525, "step": 227075 }, { "epoch": 3.3445751903506578, "grad_norm": 0.9937597513198853, "learning_rate": 7.3634120319158e-06, "loss": 0.0508, "step": 227100 }, { "epoch": 3.3449433734407448, "grad_norm": 1.2471506595611572, "learning_rate": 7.361775659948814e-06, "loss": 0.0567, "step": 227125 }, { "epoch": 3.345311556530832, "grad_norm": 1.2965233325958252, "learning_rate": 7.36013928798183e-06, "loss": 0.0561, "step": 227150 }, { "epoch": 3.345679739620919, "grad_norm": 1.4884933233261108, "learning_rate": 7.358502916014846e-06, "loss": 0.0543, "step": 227175 }, { "epoch": 3.346047922711006, "grad_norm": 0.7720845937728882, "learning_rate": 7.356866544047861e-06, "loss": 0.0554, "step": 227200 }, { "epoch": 3.346416105801093, "grad_norm": 1.334080457687378, "learning_rate": 7.355230172080877e-06, "loss": 0.0561, "step": 227225 }, { "epoch": 3.34678428889118, "grad_norm": 1.4637643098831177, "learning_rate": 7.353593800113892e-06, "loss": 0.0572, "step": 227250 }, { "epoch": 3.347152471981267, "grad_norm": 1.6052371263504028, "learning_rate": 7.351957428146907e-06, "loss": 0.0499, "step": 227275 }, { "epoch": 3.347520655071354, "grad_norm": 1.3566983938217163, "learning_rate": 7.350321056179923e-06, "loss": 0.0542, "step": 227300 }, { "epoch": 3.347888838161441, "grad_norm": 1.2417707443237305, "learning_rate": 7.3486846842129385e-06, "loss": 0.0595, "step": 227325 }, { "epoch": 3.348257021251528, "grad_norm": 1.0273051261901855, "learning_rate": 7.347048312245954e-06, "loss": 0.0545, "step": 227350 }, { "epoch": 3.348625204341615, "grad_norm": 1.5369634628295898, "learning_rate": 7.345411940278969e-06, "loss": 0.049, "step": 227375 }, { "epoch": 3.348993387431702, "grad_norm": 0.9035630822181702, "learning_rate": 7.343775568311985e-06, "loss": 0.0541, "step": 227400 }, { "epoch": 3.349361570521789, "grad_norm": 1.1546244621276855, "learning_rate": 7.342139196345001e-06, "loss": 0.0559, "step": 227425 }, { "epoch": 3.349729753611876, "grad_norm": 1.3052910566329956, "learning_rate": 7.340502824378015e-06, "loss": 0.0568, "step": 227450 }, { "epoch": 3.350097936701963, "grad_norm": 1.5565319061279297, "learning_rate": 7.338866452411031e-06, "loss": 0.0546, "step": 227475 }, { "epoch": 3.35046611979205, "grad_norm": 1.9359947443008423, "learning_rate": 7.3372300804440465e-06, "loss": 0.0556, "step": 227500 }, { "epoch": 3.350834302882137, "grad_norm": 1.249371886253357, "learning_rate": 7.335593708477063e-06, "loss": 0.0524, "step": 227525 }, { "epoch": 3.351202485972224, "grad_norm": 1.4207818508148193, "learning_rate": 7.333957336510077e-06, "loss": 0.0494, "step": 227550 }, { "epoch": 3.3515706690623115, "grad_norm": 1.0526865720748901, "learning_rate": 7.3323209645430924e-06, "loss": 0.057, "step": 227575 }, { "epoch": 3.3519388521523985, "grad_norm": 1.489443063735962, "learning_rate": 7.330684592576109e-06, "loss": 0.0591, "step": 227600 }, { "epoch": 3.3523070352424855, "grad_norm": 0.8755283355712891, "learning_rate": 7.329048220609124e-06, "loss": 0.0584, "step": 227625 }, { "epoch": 3.3526752183325725, "grad_norm": 1.4304816722869873, "learning_rate": 7.327411848642138e-06, "loss": 0.0529, "step": 227650 }, { "epoch": 3.3530434014226596, "grad_norm": 0.8949004411697388, "learning_rate": 7.3257754766751545e-06, "loss": 0.0477, "step": 227675 }, { "epoch": 3.3534115845127466, "grad_norm": 1.4937458038330078, "learning_rate": 7.32413910470817e-06, "loss": 0.0555, "step": 227700 }, { "epoch": 3.3537797676028336, "grad_norm": 1.59857976436615, "learning_rate": 7.322502732741186e-06, "loss": 0.0613, "step": 227725 }, { "epoch": 3.3541479506929206, "grad_norm": 1.2917006015777588, "learning_rate": 7.320866360774201e-06, "loss": 0.0516, "step": 227750 }, { "epoch": 3.3545161337830076, "grad_norm": 1.10425865650177, "learning_rate": 7.319229988807216e-06, "loss": 0.0515, "step": 227775 }, { "epoch": 3.3548843168730946, "grad_norm": 1.6088265180587769, "learning_rate": 7.317593616840232e-06, "loss": 0.0585, "step": 227800 }, { "epoch": 3.3552524999631816, "grad_norm": 1.1360076665878296, "learning_rate": 7.315957244873247e-06, "loss": 0.056, "step": 227825 }, { "epoch": 3.3556206830532687, "grad_norm": 1.2189058065414429, "learning_rate": 7.314320872906263e-06, "loss": 0.0539, "step": 227850 }, { "epoch": 3.3559888661433557, "grad_norm": 0.9098104238510132, "learning_rate": 7.312684500939278e-06, "loss": 0.0617, "step": 227875 }, { "epoch": 3.3563570492334427, "grad_norm": 1.2700965404510498, "learning_rate": 7.311048128972293e-06, "loss": 0.0516, "step": 227900 }, { "epoch": 3.3567252323235297, "grad_norm": 0.8122754693031311, "learning_rate": 7.309411757005309e-06, "loss": 0.0552, "step": 227925 }, { "epoch": 3.3570934154136167, "grad_norm": 1.3640238046646118, "learning_rate": 7.3077753850383245e-06, "loss": 0.054, "step": 227950 }, { "epoch": 3.3574615985037037, "grad_norm": 1.0851056575775146, "learning_rate": 7.30613901307134e-06, "loss": 0.05, "step": 227975 }, { "epoch": 3.357829781593791, "grad_norm": 1.0979098081588745, "learning_rate": 7.304502641104355e-06, "loss": 0.0485, "step": 228000 }, { "epoch": 3.358197964683878, "grad_norm": 1.2680208683013916, "learning_rate": 7.3028662691373705e-06, "loss": 0.0552, "step": 228025 }, { "epoch": 3.3585661477739652, "grad_norm": 1.0618665218353271, "learning_rate": 7.301229897170387e-06, "loss": 0.0533, "step": 228050 }, { "epoch": 3.3589343308640522, "grad_norm": 1.4491623640060425, "learning_rate": 7.299593525203401e-06, "loss": 0.0573, "step": 228075 }, { "epoch": 3.3593025139541393, "grad_norm": 1.700790524482727, "learning_rate": 7.297957153236417e-06, "loss": 0.0519, "step": 228100 }, { "epoch": 3.3596706970442263, "grad_norm": 1.0543335676193237, "learning_rate": 7.2963207812694325e-06, "loss": 0.062, "step": 228125 }, { "epoch": 3.3600388801343133, "grad_norm": 1.0651147365570068, "learning_rate": 7.294684409302449e-06, "loss": 0.0483, "step": 228150 }, { "epoch": 3.3604070632244003, "grad_norm": 1.6682809591293335, "learning_rate": 7.293048037335463e-06, "loss": 0.0528, "step": 228175 }, { "epoch": 3.3607752463144873, "grad_norm": 1.1392656564712524, "learning_rate": 7.2914116653684784e-06, "loss": 0.0465, "step": 228200 }, { "epoch": 3.3611434294045743, "grad_norm": 1.4422379732131958, "learning_rate": 7.289775293401495e-06, "loss": 0.0562, "step": 228225 }, { "epoch": 3.3615116124946613, "grad_norm": 1.3540560007095337, "learning_rate": 7.28813892143451e-06, "loss": 0.0502, "step": 228250 }, { "epoch": 3.3618797955847484, "grad_norm": 1.4447195529937744, "learning_rate": 7.286502549467526e-06, "loss": 0.0609, "step": 228275 }, { "epoch": 3.3622479786748354, "grad_norm": 1.2681094408035278, "learning_rate": 7.2848661775005405e-06, "loss": 0.0583, "step": 228300 }, { "epoch": 3.3626161617649224, "grad_norm": 1.6062184572219849, "learning_rate": 7.283229805533556e-06, "loss": 0.0527, "step": 228325 }, { "epoch": 3.3629843448550094, "grad_norm": 1.0012041330337524, "learning_rate": 7.281593433566572e-06, "loss": 0.06, "step": 228350 }, { "epoch": 3.3633525279450964, "grad_norm": 1.3650450706481934, "learning_rate": 7.279957061599587e-06, "loss": 0.06, "step": 228375 }, { "epoch": 3.3637207110351834, "grad_norm": 1.7142605781555176, "learning_rate": 7.278320689632602e-06, "loss": 0.0503, "step": 228400 }, { "epoch": 3.3640888941252705, "grad_norm": 1.285312294960022, "learning_rate": 7.276684317665618e-06, "loss": 0.0504, "step": 228425 }, { "epoch": 3.3644570772153575, "grad_norm": 1.5487838983535767, "learning_rate": 7.275047945698633e-06, "loss": 0.0513, "step": 228450 }, { "epoch": 3.3648252603054445, "grad_norm": 0.7486552596092224, "learning_rate": 7.273411573731649e-06, "loss": 0.0554, "step": 228475 }, { "epoch": 3.3651934433955315, "grad_norm": 0.9599723219871521, "learning_rate": 7.271775201764664e-06, "loss": 0.0551, "step": 228500 }, { "epoch": 3.365561626485619, "grad_norm": 1.0872753858566284, "learning_rate": 7.270138829797679e-06, "loss": 0.0523, "step": 228525 }, { "epoch": 3.365929809575706, "grad_norm": 1.714653491973877, "learning_rate": 7.268502457830695e-06, "loss": 0.0519, "step": 228550 }, { "epoch": 3.366297992665793, "grad_norm": 0.8374559879302979, "learning_rate": 7.2668660858637105e-06, "loss": 0.0543, "step": 228575 }, { "epoch": 3.36666617575588, "grad_norm": 1.2753009796142578, "learning_rate": 7.265229713896725e-06, "loss": 0.0565, "step": 228600 }, { "epoch": 3.367034358845967, "grad_norm": 1.3866500854492188, "learning_rate": 7.263593341929741e-06, "loss": 0.0528, "step": 228625 }, { "epoch": 3.367402541936054, "grad_norm": 1.2218362092971802, "learning_rate": 7.2619569699627565e-06, "loss": 0.0536, "step": 228650 }, { "epoch": 3.367770725026141, "grad_norm": 1.0911751985549927, "learning_rate": 7.260320597995773e-06, "loss": 0.0455, "step": 228675 }, { "epoch": 3.368138908116228, "grad_norm": 1.4038351774215698, "learning_rate": 7.258684226028788e-06, "loss": 0.049, "step": 228700 }, { "epoch": 3.368507091206315, "grad_norm": 1.325892448425293, "learning_rate": 7.257047854061802e-06, "loss": 0.0555, "step": 228725 }, { "epoch": 3.368875274296402, "grad_norm": 1.778721809387207, "learning_rate": 7.2554114820948185e-06, "loss": 0.0554, "step": 228750 }, { "epoch": 3.369243457386489, "grad_norm": 1.1900590658187866, "learning_rate": 7.253775110127834e-06, "loss": 0.0571, "step": 228775 }, { "epoch": 3.369611640476576, "grad_norm": 1.177790880203247, "learning_rate": 7.25213873816085e-06, "loss": 0.055, "step": 228800 }, { "epoch": 3.369979823566663, "grad_norm": 1.2539998292922974, "learning_rate": 7.2505023661938644e-06, "loss": 0.0593, "step": 228825 }, { "epoch": 3.37034800665675, "grad_norm": 1.387547492980957, "learning_rate": 7.248865994226881e-06, "loss": 0.0464, "step": 228850 }, { "epoch": 3.370716189746837, "grad_norm": 1.865710973739624, "learning_rate": 7.247229622259896e-06, "loss": 0.0544, "step": 228875 }, { "epoch": 3.371084372836924, "grad_norm": 1.3285530805587769, "learning_rate": 7.245593250292911e-06, "loss": 0.056, "step": 228900 }, { "epoch": 3.371452555927011, "grad_norm": 0.9275400638580322, "learning_rate": 7.2439568783259265e-06, "loss": 0.0556, "step": 228925 }, { "epoch": 3.3718207390170987, "grad_norm": 1.1684253215789795, "learning_rate": 7.242320506358942e-06, "loss": 0.051, "step": 228950 }, { "epoch": 3.3721889221071857, "grad_norm": 1.2132936716079712, "learning_rate": 7.240684134391958e-06, "loss": 0.0516, "step": 228975 }, { "epoch": 3.3725571051972727, "grad_norm": 1.4304368495941162, "learning_rate": 7.239047762424973e-06, "loss": 0.0638, "step": 229000 }, { "epoch": 3.3729252882873597, "grad_norm": 1.1957331895828247, "learning_rate": 7.237411390457988e-06, "loss": 0.0504, "step": 229025 }, { "epoch": 3.3732934713774467, "grad_norm": 1.043702244758606, "learning_rate": 7.235775018491004e-06, "loss": 0.0557, "step": 229050 }, { "epoch": 3.3736616544675337, "grad_norm": 1.3535196781158447, "learning_rate": 7.234138646524019e-06, "loss": 0.054, "step": 229075 }, { "epoch": 3.3740298375576208, "grad_norm": 1.148145318031311, "learning_rate": 7.232502274557035e-06, "loss": 0.0523, "step": 229100 }, { "epoch": 3.3743980206477078, "grad_norm": 0.9355340003967285, "learning_rate": 7.23086590259005e-06, "loss": 0.0522, "step": 229125 }, { "epoch": 3.374766203737795, "grad_norm": 1.1868191957473755, "learning_rate": 7.229229530623065e-06, "loss": 0.0605, "step": 229150 }, { "epoch": 3.375134386827882, "grad_norm": 1.4411094188690186, "learning_rate": 7.227593158656081e-06, "loss": 0.0529, "step": 229175 }, { "epoch": 3.375502569917969, "grad_norm": 1.2888022661209106, "learning_rate": 7.2259567866890965e-06, "loss": 0.0558, "step": 229200 }, { "epoch": 3.375870753008056, "grad_norm": 1.340494155883789, "learning_rate": 7.224320414722113e-06, "loss": 0.0562, "step": 229225 }, { "epoch": 3.376238936098143, "grad_norm": 1.0565993785858154, "learning_rate": 7.222684042755127e-06, "loss": 0.0558, "step": 229250 }, { "epoch": 3.37660711918823, "grad_norm": 1.1885859966278076, "learning_rate": 7.2210476707881425e-06, "loss": 0.0568, "step": 229275 }, { "epoch": 3.376975302278317, "grad_norm": 1.6759604215621948, "learning_rate": 7.219411298821159e-06, "loss": 0.0489, "step": 229300 }, { "epoch": 3.377343485368404, "grad_norm": 1.3945995569229126, "learning_rate": 7.217774926854174e-06, "loss": 0.0625, "step": 229325 }, { "epoch": 3.377711668458491, "grad_norm": 1.4647125005722046, "learning_rate": 7.216138554887188e-06, "loss": 0.0607, "step": 229350 }, { "epoch": 3.378079851548578, "grad_norm": 1.204837679862976, "learning_rate": 7.2145021829202045e-06, "loss": 0.0607, "step": 229375 }, { "epoch": 3.378448034638665, "grad_norm": 0.9089778661727905, "learning_rate": 7.21286581095322e-06, "loss": 0.051, "step": 229400 }, { "epoch": 3.378816217728752, "grad_norm": 1.097982406616211, "learning_rate": 7.211229438986236e-06, "loss": 0.0529, "step": 229425 }, { "epoch": 3.379184400818839, "grad_norm": 0.9045083522796631, "learning_rate": 7.2095930670192504e-06, "loss": 0.0499, "step": 229450 }, { "epoch": 3.379552583908926, "grad_norm": 1.2514936923980713, "learning_rate": 7.207956695052266e-06, "loss": 0.0509, "step": 229475 }, { "epoch": 3.3799207669990134, "grad_norm": 1.1990993022918701, "learning_rate": 7.206320323085282e-06, "loss": 0.0557, "step": 229500 }, { "epoch": 3.3802889500891005, "grad_norm": 1.3038480281829834, "learning_rate": 7.204683951118297e-06, "loss": 0.0618, "step": 229525 }, { "epoch": 3.3806571331791875, "grad_norm": 0.908562421798706, "learning_rate": 7.2030475791513125e-06, "loss": 0.0572, "step": 229550 }, { "epoch": 3.3810253162692745, "grad_norm": 1.503128170967102, "learning_rate": 7.201411207184328e-06, "loss": 0.0548, "step": 229575 }, { "epoch": 3.3813934993593615, "grad_norm": 1.3939077854156494, "learning_rate": 7.199774835217344e-06, "loss": 0.0553, "step": 229600 }, { "epoch": 3.3817616824494485, "grad_norm": 1.074191927909851, "learning_rate": 7.198138463250359e-06, "loss": 0.0559, "step": 229625 }, { "epoch": 3.3821298655395355, "grad_norm": 1.2822694778442383, "learning_rate": 7.196502091283374e-06, "loss": 0.0501, "step": 229650 }, { "epoch": 3.3824980486296226, "grad_norm": 1.416955590248108, "learning_rate": 7.19486571931639e-06, "loss": 0.0548, "step": 229675 }, { "epoch": 3.3828662317197096, "grad_norm": 1.4085408449172974, "learning_rate": 7.193229347349405e-06, "loss": 0.0484, "step": 229700 }, { "epoch": 3.3832344148097966, "grad_norm": 1.1416951417922974, "learning_rate": 7.191592975382421e-06, "loss": 0.0538, "step": 229725 }, { "epoch": 3.3836025978998836, "grad_norm": 1.8507288694381714, "learning_rate": 7.189956603415437e-06, "loss": 0.0531, "step": 229750 }, { "epoch": 3.3839707809899706, "grad_norm": 1.6278001070022583, "learning_rate": 7.188320231448451e-06, "loss": 0.063, "step": 229775 }, { "epoch": 3.3843389640800576, "grad_norm": 1.0278292894363403, "learning_rate": 7.186683859481467e-06, "loss": 0.0528, "step": 229800 }, { "epoch": 3.3847071471701446, "grad_norm": 0.9253020286560059, "learning_rate": 7.1850474875144825e-06, "loss": 0.0529, "step": 229825 }, { "epoch": 3.3850753302602317, "grad_norm": 1.118381142616272, "learning_rate": 7.183411115547499e-06, "loss": 0.0489, "step": 229850 }, { "epoch": 3.3854435133503187, "grad_norm": 1.3826417922973633, "learning_rate": 7.181774743580513e-06, "loss": 0.0564, "step": 229875 }, { "epoch": 3.3858116964404057, "grad_norm": 1.6267495155334473, "learning_rate": 7.1801383716135285e-06, "loss": 0.0496, "step": 229900 }, { "epoch": 3.386179879530493, "grad_norm": 1.306330919265747, "learning_rate": 7.178501999646545e-06, "loss": 0.0495, "step": 229925 }, { "epoch": 3.38654806262058, "grad_norm": 1.3307965993881226, "learning_rate": 7.17686562767956e-06, "loss": 0.0487, "step": 229950 }, { "epoch": 3.386916245710667, "grad_norm": 0.4965097904205322, "learning_rate": 7.175229255712574e-06, "loss": 0.0506, "step": 229975 }, { "epoch": 3.387284428800754, "grad_norm": 1.696635127067566, "learning_rate": 7.1735928837455905e-06, "loss": 0.0596, "step": 230000 }, { "epoch": 3.387652611890841, "grad_norm": 1.322624683380127, "learning_rate": 7.171956511778606e-06, "loss": 0.0526, "step": 230025 }, { "epoch": 3.3880207949809282, "grad_norm": 1.0035916566848755, "learning_rate": 7.170320139811622e-06, "loss": 0.0582, "step": 230050 }, { "epoch": 3.3883889780710152, "grad_norm": 0.8434239625930786, "learning_rate": 7.1686837678446364e-06, "loss": 0.0513, "step": 230075 }, { "epoch": 3.3887571611611023, "grad_norm": 1.328847050666809, "learning_rate": 7.167047395877652e-06, "loss": 0.0535, "step": 230100 }, { "epoch": 3.3891253442511893, "grad_norm": 1.5471599102020264, "learning_rate": 7.165411023910668e-06, "loss": 0.0571, "step": 230125 }, { "epoch": 3.3894935273412763, "grad_norm": 1.2296767234802246, "learning_rate": 7.163774651943683e-06, "loss": 0.0613, "step": 230150 }, { "epoch": 3.3898617104313633, "grad_norm": 1.2912365198135376, "learning_rate": 7.162138279976699e-06, "loss": 0.063, "step": 230175 }, { "epoch": 3.3902298935214503, "grad_norm": 1.0140345096588135, "learning_rate": 7.160501908009714e-06, "loss": 0.0514, "step": 230200 }, { "epoch": 3.3905980766115373, "grad_norm": 1.5158782005310059, "learning_rate": 7.158865536042729e-06, "loss": 0.0495, "step": 230225 }, { "epoch": 3.3909662597016244, "grad_norm": 1.2484372854232788, "learning_rate": 7.157229164075745e-06, "loss": 0.0536, "step": 230250 }, { "epoch": 3.3913344427917114, "grad_norm": 1.3410732746124268, "learning_rate": 7.1555927921087606e-06, "loss": 0.0509, "step": 230275 }, { "epoch": 3.3917026258817984, "grad_norm": 1.0487955808639526, "learning_rate": 7.153956420141776e-06, "loss": 0.0489, "step": 230300 }, { "epoch": 3.3920708089718854, "grad_norm": 1.2039780616760254, "learning_rate": 7.152320048174791e-06, "loss": 0.0549, "step": 230325 }, { "epoch": 3.3924389920619724, "grad_norm": 1.4711581468582153, "learning_rate": 7.150683676207807e-06, "loss": 0.0601, "step": 230350 }, { "epoch": 3.3928071751520594, "grad_norm": 1.4990562200546265, "learning_rate": 7.149047304240823e-06, "loss": 0.058, "step": 230375 }, { "epoch": 3.3931753582421464, "grad_norm": 1.0785878896713257, "learning_rate": 7.147410932273837e-06, "loss": 0.0601, "step": 230400 }, { "epoch": 3.3935435413322335, "grad_norm": 1.5416322946548462, "learning_rate": 7.145774560306853e-06, "loss": 0.054, "step": 230425 }, { "epoch": 3.393911724422321, "grad_norm": 1.7337473630905151, "learning_rate": 7.1441381883398685e-06, "loss": 0.0535, "step": 230450 }, { "epoch": 3.394279907512408, "grad_norm": 1.2901270389556885, "learning_rate": 7.142501816372885e-06, "loss": 0.0589, "step": 230475 }, { "epoch": 3.394648090602495, "grad_norm": 1.308258056640625, "learning_rate": 7.140865444405899e-06, "loss": 0.0627, "step": 230500 }, { "epoch": 3.395016273692582, "grad_norm": 0.9888496398925781, "learning_rate": 7.1392290724389145e-06, "loss": 0.0524, "step": 230525 }, { "epoch": 3.395384456782669, "grad_norm": 1.0918967723846436, "learning_rate": 7.137592700471931e-06, "loss": 0.0499, "step": 230550 }, { "epoch": 3.395752639872756, "grad_norm": 1.1751959323883057, "learning_rate": 7.135956328504946e-06, "loss": 0.0532, "step": 230575 }, { "epoch": 3.396120822962843, "grad_norm": 1.5641647577285767, "learning_rate": 7.13431995653796e-06, "loss": 0.0499, "step": 230600 }, { "epoch": 3.39648900605293, "grad_norm": 1.6345034837722778, "learning_rate": 7.1326835845709765e-06, "loss": 0.0551, "step": 230625 }, { "epoch": 3.396857189143017, "grad_norm": 1.2135564088821411, "learning_rate": 7.131047212603992e-06, "loss": 0.0487, "step": 230650 }, { "epoch": 3.397225372233104, "grad_norm": 1.112851619720459, "learning_rate": 7.129410840637008e-06, "loss": 0.0578, "step": 230675 }, { "epoch": 3.397593555323191, "grad_norm": 1.4080071449279785, "learning_rate": 7.127774468670023e-06, "loss": 0.0576, "step": 230700 }, { "epoch": 3.397961738413278, "grad_norm": 1.3997987508773804, "learning_rate": 7.126203551581717e-06, "loss": 0.0562, "step": 230725 }, { "epoch": 3.398329921503365, "grad_norm": 1.2018193006515503, "learning_rate": 7.124567179614733e-06, "loss": 0.0567, "step": 230750 }, { "epoch": 3.398698104593452, "grad_norm": 1.1015087366104126, "learning_rate": 7.122930807647749e-06, "loss": 0.0507, "step": 230775 }, { "epoch": 3.399066287683539, "grad_norm": 1.5534456968307495, "learning_rate": 7.121294435680765e-06, "loss": 0.0549, "step": 230800 }, { "epoch": 3.399434470773626, "grad_norm": 1.714211106300354, "learning_rate": 7.119658063713779e-06, "loss": 0.0549, "step": 230825 }, { "epoch": 3.399802653863713, "grad_norm": 0.5852401256561279, "learning_rate": 7.1180216917467945e-06, "loss": 0.0458, "step": 230850 }, { "epoch": 3.4001708369538006, "grad_norm": 1.0412797927856445, "learning_rate": 7.116385319779811e-06, "loss": 0.0514, "step": 230875 }, { "epoch": 3.4005390200438876, "grad_norm": 1.4167077541351318, "learning_rate": 7.114748947812826e-06, "loss": 0.0541, "step": 230900 }, { "epoch": 3.4009072031339747, "grad_norm": 1.3110685348510742, "learning_rate": 7.1131125758458404e-06, "loss": 0.0557, "step": 230925 }, { "epoch": 3.4012753862240617, "grad_norm": 0.5840967893600464, "learning_rate": 7.111476203878857e-06, "loss": 0.054, "step": 230950 }, { "epoch": 3.4016435693141487, "grad_norm": 1.4448232650756836, "learning_rate": 7.109839831911872e-06, "loss": 0.0494, "step": 230975 }, { "epoch": 3.4020117524042357, "grad_norm": 1.3682793378829956, "learning_rate": 7.108203459944888e-06, "loss": 0.0531, "step": 231000 }, { "epoch": 3.4023799354943227, "grad_norm": 1.5229613780975342, "learning_rate": 7.1065670879779025e-06, "loss": 0.059, "step": 231025 }, { "epoch": 3.4027481185844097, "grad_norm": 1.8227739334106445, "learning_rate": 7.104930716010918e-06, "loss": 0.0553, "step": 231050 }, { "epoch": 3.4031163016744967, "grad_norm": 1.4641097784042358, "learning_rate": 7.103294344043934e-06, "loss": 0.0518, "step": 231075 }, { "epoch": 3.4034844847645838, "grad_norm": 1.3100905418395996, "learning_rate": 7.101657972076949e-06, "loss": 0.0601, "step": 231100 }, { "epoch": 3.4038526678546708, "grad_norm": 1.0178974866867065, "learning_rate": 7.100021600109965e-06, "loss": 0.0556, "step": 231125 }, { "epoch": 3.404220850944758, "grad_norm": 1.049385666847229, "learning_rate": 7.09838522814298e-06, "loss": 0.0534, "step": 231150 }, { "epoch": 3.404589034034845, "grad_norm": 1.3061366081237793, "learning_rate": 7.096748856175996e-06, "loss": 0.055, "step": 231175 }, { "epoch": 3.404957217124932, "grad_norm": 1.6441543102264404, "learning_rate": 7.095112484209011e-06, "loss": 0.0492, "step": 231200 }, { "epoch": 3.405325400215019, "grad_norm": 1.2504527568817139, "learning_rate": 7.093476112242027e-06, "loss": 0.0524, "step": 231225 }, { "epoch": 3.405693583305106, "grad_norm": 1.9197078943252563, "learning_rate": 7.091839740275042e-06, "loss": 0.055, "step": 231250 }, { "epoch": 3.406061766395193, "grad_norm": 1.5676006078720093, "learning_rate": 7.090203368308057e-06, "loss": 0.0501, "step": 231275 }, { "epoch": 3.40642994948528, "grad_norm": 1.5482673645019531, "learning_rate": 7.088566996341073e-06, "loss": 0.0593, "step": 231300 }, { "epoch": 3.406798132575367, "grad_norm": 1.7814412117004395, "learning_rate": 7.086930624374089e-06, "loss": 0.053, "step": 231325 }, { "epoch": 3.407166315665454, "grad_norm": 1.2105423212051392, "learning_rate": 7.085294252407103e-06, "loss": 0.0518, "step": 231350 }, { "epoch": 3.407534498755541, "grad_norm": 1.210797667503357, "learning_rate": 7.083657880440119e-06, "loss": 0.0525, "step": 231375 }, { "epoch": 3.407902681845628, "grad_norm": 1.19452702999115, "learning_rate": 7.082021508473135e-06, "loss": 0.0474, "step": 231400 }, { "epoch": 3.4082708649357154, "grad_norm": 1.1709809303283691, "learning_rate": 7.080385136506151e-06, "loss": 0.0544, "step": 231425 }, { "epoch": 3.4086390480258024, "grad_norm": 0.8450188040733337, "learning_rate": 7.078748764539165e-06, "loss": 0.0562, "step": 231450 }, { "epoch": 3.4090072311158894, "grad_norm": 1.330223560333252, "learning_rate": 7.0771123925721805e-06, "loss": 0.0541, "step": 231475 }, { "epoch": 3.4093754142059765, "grad_norm": 1.1931023597717285, "learning_rate": 7.075476020605197e-06, "loss": 0.0479, "step": 231500 }, { "epoch": 3.4097435972960635, "grad_norm": 1.5765554904937744, "learning_rate": 7.073839648638212e-06, "loss": 0.0518, "step": 231525 }, { "epoch": 3.4101117803861505, "grad_norm": 1.6435822248458862, "learning_rate": 7.0722032766712264e-06, "loss": 0.0589, "step": 231550 }, { "epoch": 3.4104799634762375, "grad_norm": 0.9774888753890991, "learning_rate": 7.070566904704243e-06, "loss": 0.0564, "step": 231575 }, { "epoch": 3.4108481465663245, "grad_norm": 1.090767502784729, "learning_rate": 7.068930532737258e-06, "loss": 0.0587, "step": 231600 }, { "epoch": 3.4112163296564115, "grad_norm": 1.4417084455490112, "learning_rate": 7.067294160770274e-06, "loss": 0.059, "step": 231625 }, { "epoch": 3.4115845127464985, "grad_norm": 1.5252876281738281, "learning_rate": 7.065657788803289e-06, "loss": 0.0558, "step": 231650 }, { "epoch": 3.4119526958365856, "grad_norm": 1.1169487237930298, "learning_rate": 7.064021416836304e-06, "loss": 0.0562, "step": 231675 }, { "epoch": 3.4123208789266726, "grad_norm": 1.637233853340149, "learning_rate": 7.06238504486932e-06, "loss": 0.0585, "step": 231700 }, { "epoch": 3.4126890620167596, "grad_norm": 1.1862943172454834, "learning_rate": 7.060748672902335e-06, "loss": 0.0576, "step": 231725 }, { "epoch": 3.4130572451068466, "grad_norm": 1.5288313627243042, "learning_rate": 7.059112300935351e-06, "loss": 0.057, "step": 231750 }, { "epoch": 3.4134254281969336, "grad_norm": 1.928776502609253, "learning_rate": 7.057475928968366e-06, "loss": 0.059, "step": 231775 }, { "epoch": 3.4137936112870206, "grad_norm": 1.1547839641571045, "learning_rate": 7.055839557001381e-06, "loss": 0.0514, "step": 231800 }, { "epoch": 3.414161794377108, "grad_norm": 1.022402048110962, "learning_rate": 7.054203185034397e-06, "loss": 0.0555, "step": 231825 }, { "epoch": 3.414529977467195, "grad_norm": 0.7116663455963135, "learning_rate": 7.052566813067413e-06, "loss": 0.0595, "step": 231850 }, { "epoch": 3.414898160557282, "grad_norm": 1.7710940837860107, "learning_rate": 7.050930441100428e-06, "loss": 0.0579, "step": 231875 }, { "epoch": 3.415266343647369, "grad_norm": 1.4213794469833374, "learning_rate": 7.049294069133443e-06, "loss": 0.0519, "step": 231900 }, { "epoch": 3.415634526737456, "grad_norm": 1.1554690599441528, "learning_rate": 7.047657697166459e-06, "loss": 0.0537, "step": 231925 }, { "epoch": 3.416002709827543, "grad_norm": 1.626205325126648, "learning_rate": 7.046021325199475e-06, "loss": 0.0541, "step": 231950 }, { "epoch": 3.41637089291763, "grad_norm": 1.1741199493408203, "learning_rate": 7.044384953232489e-06, "loss": 0.0551, "step": 231975 }, { "epoch": 3.416739076007717, "grad_norm": 1.0156229734420776, "learning_rate": 7.042748581265505e-06, "loss": 0.0501, "step": 232000 }, { "epoch": 3.417107259097804, "grad_norm": 1.2444950342178345, "learning_rate": 7.041112209298521e-06, "loss": 0.0503, "step": 232025 }, { "epoch": 3.4174754421878912, "grad_norm": 1.3393328189849854, "learning_rate": 7.039475837331537e-06, "loss": 0.0603, "step": 232050 }, { "epoch": 3.4178436252779782, "grad_norm": 0.9660595655441284, "learning_rate": 7.037839465364551e-06, "loss": 0.058, "step": 232075 }, { "epoch": 3.4182118083680653, "grad_norm": 1.1569231748580933, "learning_rate": 7.0362030933975665e-06, "loss": 0.0473, "step": 232100 }, { "epoch": 3.4185799914581523, "grad_norm": 1.8087416887283325, "learning_rate": 7.034566721430583e-06, "loss": 0.0591, "step": 232125 }, { "epoch": 3.4189481745482393, "grad_norm": 1.3858063220977783, "learning_rate": 7.032930349463598e-06, "loss": 0.0535, "step": 232150 }, { "epoch": 3.4193163576383263, "grad_norm": 1.208710789680481, "learning_rate": 7.031293977496614e-06, "loss": 0.0552, "step": 232175 }, { "epoch": 3.4196845407284133, "grad_norm": 1.5608861446380615, "learning_rate": 7.029657605529629e-06, "loss": 0.0545, "step": 232200 }, { "epoch": 3.4200527238185003, "grad_norm": 1.5005319118499756, "learning_rate": 7.028021233562644e-06, "loss": 0.057, "step": 232225 }, { "epoch": 3.4204209069085874, "grad_norm": 1.1500242948532104, "learning_rate": 7.02638486159566e-06, "loss": 0.0565, "step": 232250 }, { "epoch": 3.4207890899986744, "grad_norm": 1.6404985189437866, "learning_rate": 7.024748489628675e-06, "loss": 0.0511, "step": 232275 }, { "epoch": 3.4211572730887614, "grad_norm": 1.4335654973983765, "learning_rate": 7.02311211766169e-06, "loss": 0.0556, "step": 232300 }, { "epoch": 3.4215254561788484, "grad_norm": 1.354269027709961, "learning_rate": 7.021475745694706e-06, "loss": 0.0563, "step": 232325 }, { "epoch": 3.4218936392689354, "grad_norm": 1.8647336959838867, "learning_rate": 7.019839373727721e-06, "loss": 0.0564, "step": 232350 }, { "epoch": 3.422261822359023, "grad_norm": 1.2038220167160034, "learning_rate": 7.018203001760737e-06, "loss": 0.0572, "step": 232375 }, { "epoch": 3.42263000544911, "grad_norm": 1.2991139888763428, "learning_rate": 7.016566629793752e-06, "loss": 0.0508, "step": 232400 }, { "epoch": 3.422998188539197, "grad_norm": 1.2577146291732788, "learning_rate": 7.014930257826767e-06, "loss": 0.0518, "step": 232425 }, { "epoch": 3.423366371629284, "grad_norm": 1.1498643159866333, "learning_rate": 7.013293885859783e-06, "loss": 0.0535, "step": 232450 }, { "epoch": 3.423734554719371, "grad_norm": 0.9440375566482544, "learning_rate": 7.011657513892799e-06, "loss": 0.0528, "step": 232475 }, { "epoch": 3.424102737809458, "grad_norm": 1.3459793329238892, "learning_rate": 7.010021141925813e-06, "loss": 0.0563, "step": 232500 }, { "epoch": 3.424470920899545, "grad_norm": 1.218077301979065, "learning_rate": 7.008384769958829e-06, "loss": 0.0562, "step": 232525 }, { "epoch": 3.424839103989632, "grad_norm": 1.4415664672851562, "learning_rate": 7.0067483979918445e-06, "loss": 0.0511, "step": 232550 }, { "epoch": 3.425207287079719, "grad_norm": 1.043233871459961, "learning_rate": 7.005112026024861e-06, "loss": 0.0574, "step": 232575 }, { "epoch": 3.425575470169806, "grad_norm": 1.5398774147033691, "learning_rate": 7.003475654057876e-06, "loss": 0.0539, "step": 232600 }, { "epoch": 3.425943653259893, "grad_norm": 1.1564363241195679, "learning_rate": 7.001839282090891e-06, "loss": 0.0582, "step": 232625 }, { "epoch": 3.42631183634998, "grad_norm": 1.0484740734100342, "learning_rate": 7.000202910123907e-06, "loss": 0.0545, "step": 232650 }, { "epoch": 3.426680019440067, "grad_norm": 0.7289891839027405, "learning_rate": 6.998566538156922e-06, "loss": 0.0551, "step": 232675 }, { "epoch": 3.427048202530154, "grad_norm": 1.0609796047210693, "learning_rate": 6.996930166189938e-06, "loss": 0.0612, "step": 232700 }, { "epoch": 3.427416385620241, "grad_norm": 1.637547254562378, "learning_rate": 6.9952937942229525e-06, "loss": 0.0547, "step": 232725 }, { "epoch": 3.427784568710328, "grad_norm": 1.0543664693832397, "learning_rate": 6.993722877134648e-06, "loss": 0.0573, "step": 232750 }, { "epoch": 3.428152751800415, "grad_norm": 0.9847107529640198, "learning_rate": 6.992086505167663e-06, "loss": 0.0516, "step": 232775 }, { "epoch": 3.4285209348905026, "grad_norm": 1.437808632850647, "learning_rate": 6.990450133200679e-06, "loss": 0.0529, "step": 232800 }, { "epoch": 3.4288891179805896, "grad_norm": 0.8354343175888062, "learning_rate": 6.988813761233694e-06, "loss": 0.0568, "step": 232825 }, { "epoch": 3.4292573010706766, "grad_norm": 1.4230772256851196, "learning_rate": 6.987177389266709e-06, "loss": 0.0568, "step": 232850 }, { "epoch": 3.4296254841607636, "grad_norm": 1.260440707206726, "learning_rate": 6.9855410172997255e-06, "loss": 0.0534, "step": 232875 }, { "epoch": 3.4299936672508506, "grad_norm": 1.4289093017578125, "learning_rate": 6.983904645332741e-06, "loss": 0.0605, "step": 232900 }, { "epoch": 3.4303618503409377, "grad_norm": 1.2366105318069458, "learning_rate": 6.982268273365755e-06, "loss": 0.0523, "step": 232925 }, { "epoch": 3.4307300334310247, "grad_norm": 1.1550147533416748, "learning_rate": 6.980631901398771e-06, "loss": 0.0574, "step": 232950 }, { "epoch": 3.4310982165211117, "grad_norm": 1.513540506362915, "learning_rate": 6.978995529431787e-06, "loss": 0.0592, "step": 232975 }, { "epoch": 3.4314663996111987, "grad_norm": 1.570636510848999, "learning_rate": 6.977359157464803e-06, "loss": 0.0549, "step": 233000 }, { "epoch": 3.4318345827012857, "grad_norm": 1.3403128385543823, "learning_rate": 6.975722785497817e-06, "loss": 0.0487, "step": 233025 }, { "epoch": 3.4322027657913727, "grad_norm": 1.6959725618362427, "learning_rate": 6.974086413530833e-06, "loss": 0.0494, "step": 233050 }, { "epoch": 3.4325709488814597, "grad_norm": 1.3732138872146606, "learning_rate": 6.972450041563849e-06, "loss": 0.0578, "step": 233075 }, { "epoch": 3.4329391319715468, "grad_norm": 1.0029683113098145, "learning_rate": 6.970813669596864e-06, "loss": 0.0648, "step": 233100 }, { "epoch": 3.433307315061634, "grad_norm": 1.0218791961669922, "learning_rate": 6.96917729762988e-06, "loss": 0.0514, "step": 233125 }, { "epoch": 3.433675498151721, "grad_norm": 1.437760829925537, "learning_rate": 6.967540925662895e-06, "loss": 0.057, "step": 233150 }, { "epoch": 3.434043681241808, "grad_norm": 1.7047245502471924, "learning_rate": 6.96590455369591e-06, "loss": 0.0621, "step": 233175 }, { "epoch": 3.434411864331895, "grad_norm": 1.3571045398712158, "learning_rate": 6.964268181728926e-06, "loss": 0.0571, "step": 233200 }, { "epoch": 3.434780047421982, "grad_norm": 1.0178911685943604, "learning_rate": 6.962631809761941e-06, "loss": 0.0593, "step": 233225 }, { "epoch": 3.435148230512069, "grad_norm": 1.3114897012710571, "learning_rate": 6.960995437794956e-06, "loss": 0.0526, "step": 233250 }, { "epoch": 3.435516413602156, "grad_norm": 1.376524567604065, "learning_rate": 6.959359065827972e-06, "loss": 0.0567, "step": 233275 }, { "epoch": 3.435884596692243, "grad_norm": 0.8352702260017395, "learning_rate": 6.957722693860987e-06, "loss": 0.0568, "step": 233300 }, { "epoch": 3.4362527797823303, "grad_norm": 1.4313902854919434, "learning_rate": 6.9560863218940035e-06, "loss": 0.0513, "step": 233325 }, { "epoch": 3.4366209628724174, "grad_norm": 1.2757450342178345, "learning_rate": 6.954449949927018e-06, "loss": 0.0535, "step": 233350 }, { "epoch": 3.4369891459625044, "grad_norm": 1.646310567855835, "learning_rate": 6.952813577960033e-06, "loss": 0.0617, "step": 233375 }, { "epoch": 3.4373573290525914, "grad_norm": 1.7936944961547852, "learning_rate": 6.951177205993049e-06, "loss": 0.0546, "step": 233400 }, { "epoch": 3.4377255121426784, "grad_norm": 0.933220624923706, "learning_rate": 6.949540834026065e-06, "loss": 0.0603, "step": 233425 }, { "epoch": 3.4380936952327654, "grad_norm": 1.5044418573379517, "learning_rate": 6.94790446205908e-06, "loss": 0.0494, "step": 233450 }, { "epoch": 3.4384618783228524, "grad_norm": 1.3826124668121338, "learning_rate": 6.946268090092095e-06, "loss": 0.0528, "step": 233475 }, { "epoch": 3.4388300614129395, "grad_norm": 1.0711140632629395, "learning_rate": 6.9446317181251115e-06, "loss": 0.0508, "step": 233500 }, { "epoch": 3.4391982445030265, "grad_norm": 1.4220081567764282, "learning_rate": 6.942995346158127e-06, "loss": 0.0602, "step": 233525 }, { "epoch": 3.4395664275931135, "grad_norm": 1.2843989133834839, "learning_rate": 6.941358974191141e-06, "loss": 0.0485, "step": 233550 }, { "epoch": 3.4399346106832005, "grad_norm": 1.3126015663146973, "learning_rate": 6.939722602224157e-06, "loss": 0.0592, "step": 233575 }, { "epoch": 3.4403027937732875, "grad_norm": 1.6128236055374146, "learning_rate": 6.938086230257173e-06, "loss": 0.0537, "step": 233600 }, { "epoch": 3.4406709768633745, "grad_norm": 0.8044310808181763, "learning_rate": 6.936449858290189e-06, "loss": 0.049, "step": 233625 }, { "epoch": 3.4410391599534615, "grad_norm": 1.4766666889190674, "learning_rate": 6.9348789412018836e-06, "loss": 0.0547, "step": 233650 }, { "epoch": 3.4414073430435486, "grad_norm": 2.1881563663482666, "learning_rate": 6.933242569234898e-06, "loss": 0.0575, "step": 233675 }, { "epoch": 3.4417755261336356, "grad_norm": 1.0944713354110718, "learning_rate": 6.931606197267914e-06, "loss": 0.0571, "step": 233700 }, { "epoch": 3.4421437092237226, "grad_norm": 1.5938423871994019, "learning_rate": 6.9299698253009295e-06, "loss": 0.0666, "step": 233725 }, { "epoch": 3.44251189231381, "grad_norm": 0.8704099655151367, "learning_rate": 6.928333453333946e-06, "loss": 0.0514, "step": 233750 }, { "epoch": 3.442880075403897, "grad_norm": 1.607376217842102, "learning_rate": 6.92669708136696e-06, "loss": 0.0601, "step": 233775 }, { "epoch": 3.443248258493984, "grad_norm": 1.2371896505355835, "learning_rate": 6.925060709399975e-06, "loss": 0.0601, "step": 233800 }, { "epoch": 3.443616441584071, "grad_norm": 1.1890604496002197, "learning_rate": 6.9234243374329915e-06, "loss": 0.0523, "step": 233825 }, { "epoch": 3.443984624674158, "grad_norm": 0.8377152681350708, "learning_rate": 6.921787965466007e-06, "loss": 0.055, "step": 233850 }, { "epoch": 3.444352807764245, "grad_norm": 1.3999760150909424, "learning_rate": 6.920151593499021e-06, "loss": 0.0543, "step": 233875 }, { "epoch": 3.444720990854332, "grad_norm": 1.1638010740280151, "learning_rate": 6.9185152215320375e-06, "loss": 0.0593, "step": 233900 }, { "epoch": 3.445089173944419, "grad_norm": 1.4614380598068237, "learning_rate": 6.916878849565053e-06, "loss": 0.0526, "step": 233925 }, { "epoch": 3.445457357034506, "grad_norm": 1.6576213836669922, "learning_rate": 6.915242477598069e-06, "loss": 0.0557, "step": 233950 }, { "epoch": 3.445825540124593, "grad_norm": 1.156779170036316, "learning_rate": 6.913606105631083e-06, "loss": 0.0528, "step": 233975 }, { "epoch": 3.44619372321468, "grad_norm": 1.4911894798278809, "learning_rate": 6.911969733664099e-06, "loss": 0.0565, "step": 234000 }, { "epoch": 3.446561906304767, "grad_norm": 1.1513633728027344, "learning_rate": 6.910333361697115e-06, "loss": 0.0526, "step": 234025 }, { "epoch": 3.4469300893948542, "grad_norm": 1.9557138681411743, "learning_rate": 6.90869698973013e-06, "loss": 0.0601, "step": 234050 }, { "epoch": 3.4472982724849413, "grad_norm": 1.3454968929290771, "learning_rate": 6.907060617763146e-06, "loss": 0.0518, "step": 234075 }, { "epoch": 3.4476664555750283, "grad_norm": 1.2555756568908691, "learning_rate": 6.905424245796161e-06, "loss": 0.0492, "step": 234100 }, { "epoch": 3.4480346386651153, "grad_norm": 1.12812340259552, "learning_rate": 6.903787873829176e-06, "loss": 0.055, "step": 234125 }, { "epoch": 3.4484028217552023, "grad_norm": 1.457519292831421, "learning_rate": 6.902151501862192e-06, "loss": 0.0505, "step": 234150 }, { "epoch": 3.4487710048452893, "grad_norm": 1.1737881898880005, "learning_rate": 6.9005151298952075e-06, "loss": 0.0526, "step": 234175 }, { "epoch": 3.4491391879353763, "grad_norm": 1.009140133857727, "learning_rate": 6.898878757928222e-06, "loss": 0.059, "step": 234200 }, { "epoch": 3.4495073710254633, "grad_norm": 1.4208605289459229, "learning_rate": 6.897242385961238e-06, "loss": 0.0505, "step": 234225 }, { "epoch": 3.4498755541155504, "grad_norm": 1.17922043800354, "learning_rate": 6.895606013994253e-06, "loss": 0.0506, "step": 234250 }, { "epoch": 3.4502437372056374, "grad_norm": 1.5503113269805908, "learning_rate": 6.8939696420272696e-06, "loss": 0.053, "step": 234275 }, { "epoch": 3.450611920295725, "grad_norm": 1.4906063079833984, "learning_rate": 6.892333270060284e-06, "loss": 0.0575, "step": 234300 }, { "epoch": 3.450980103385812, "grad_norm": 2.207207441329956, "learning_rate": 6.8906968980933e-06, "loss": 0.0526, "step": 234325 }, { "epoch": 3.451348286475899, "grad_norm": 1.1196998357772827, "learning_rate": 6.8890605261263155e-06, "loss": 0.0545, "step": 234350 }, { "epoch": 3.451716469565986, "grad_norm": 1.8272604942321777, "learning_rate": 6.887424154159331e-06, "loss": 0.0558, "step": 234375 }, { "epoch": 3.452084652656073, "grad_norm": 0.8513809442520142, "learning_rate": 6.885787782192346e-06, "loss": 0.0489, "step": 234400 }, { "epoch": 3.45245283574616, "grad_norm": 1.5188515186309814, "learning_rate": 6.884151410225361e-06, "loss": 0.0515, "step": 234425 }, { "epoch": 3.452821018836247, "grad_norm": 1.2470561265945435, "learning_rate": 6.8825150382583775e-06, "loss": 0.052, "step": 234450 }, { "epoch": 3.453189201926334, "grad_norm": 1.6135742664337158, "learning_rate": 6.880878666291393e-06, "loss": 0.0545, "step": 234475 }, { "epoch": 3.453557385016421, "grad_norm": 1.383779764175415, "learning_rate": 6.879242294324407e-06, "loss": 0.0564, "step": 234500 }, { "epoch": 3.453925568106508, "grad_norm": 1.1124207973480225, "learning_rate": 6.8776059223574235e-06, "loss": 0.05, "step": 234525 }, { "epoch": 3.454293751196595, "grad_norm": 1.3962866067886353, "learning_rate": 6.875969550390439e-06, "loss": 0.0515, "step": 234550 }, { "epoch": 3.454661934286682, "grad_norm": 1.374168038368225, "learning_rate": 6.874333178423455e-06, "loss": 0.0505, "step": 234575 }, { "epoch": 3.455030117376769, "grad_norm": 1.6096205711364746, "learning_rate": 6.87269680645647e-06, "loss": 0.0594, "step": 234600 }, { "epoch": 3.455398300466856, "grad_norm": 1.3983277082443237, "learning_rate": 6.871060434489485e-06, "loss": 0.0562, "step": 234625 }, { "epoch": 3.455766483556943, "grad_norm": 1.3306199312210083, "learning_rate": 6.869424062522501e-06, "loss": 0.0512, "step": 234650 }, { "epoch": 3.45613466664703, "grad_norm": 1.9823153018951416, "learning_rate": 6.867787690555516e-06, "loss": 0.0684, "step": 234675 }, { "epoch": 3.4565028497371175, "grad_norm": 1.5583148002624512, "learning_rate": 6.866151318588532e-06, "loss": 0.0573, "step": 234700 }, { "epoch": 3.4568710328272045, "grad_norm": 1.6550252437591553, "learning_rate": 6.864514946621547e-06, "loss": 0.0523, "step": 234725 }, { "epoch": 3.4572392159172916, "grad_norm": 1.2345621585845947, "learning_rate": 6.862878574654562e-06, "loss": 0.0519, "step": 234750 }, { "epoch": 3.4576073990073786, "grad_norm": 1.5852617025375366, "learning_rate": 6.861242202687578e-06, "loss": 0.0532, "step": 234775 }, { "epoch": 3.4579755820974656, "grad_norm": 1.7967497110366821, "learning_rate": 6.8596058307205935e-06, "loss": 0.0597, "step": 234800 }, { "epoch": 3.4583437651875526, "grad_norm": 1.4641871452331543, "learning_rate": 6.857969458753608e-06, "loss": 0.0578, "step": 234825 }, { "epoch": 3.4587119482776396, "grad_norm": 1.144559383392334, "learning_rate": 6.856333086786624e-06, "loss": 0.0468, "step": 234850 }, { "epoch": 3.4590801313677266, "grad_norm": 1.3692418336868286, "learning_rate": 6.854696714819639e-06, "loss": 0.0619, "step": 234875 }, { "epoch": 3.4594483144578136, "grad_norm": 1.4208827018737793, "learning_rate": 6.8530603428526556e-06, "loss": 0.0591, "step": 234900 }, { "epoch": 3.4598164975479007, "grad_norm": 1.0685988664627075, "learning_rate": 6.85142397088567e-06, "loss": 0.0564, "step": 234925 }, { "epoch": 3.4601846806379877, "grad_norm": 1.3136134147644043, "learning_rate": 6.849787598918685e-06, "loss": 0.0557, "step": 234950 }, { "epoch": 3.4605528637280747, "grad_norm": 1.1276713609695435, "learning_rate": 6.8481512269517015e-06, "loss": 0.0462, "step": 234975 }, { "epoch": 3.4609210468181617, "grad_norm": 1.357824444770813, "learning_rate": 6.846514854984717e-06, "loss": 0.0562, "step": 235000 }, { "epoch": 3.4612892299082487, "grad_norm": 1.4743969440460205, "learning_rate": 6.844878483017733e-06, "loss": 0.0569, "step": 235025 }, { "epoch": 3.4616574129983357, "grad_norm": 1.5803524255752563, "learning_rate": 6.843242111050747e-06, "loss": 0.0541, "step": 235050 }, { "epoch": 3.4620255960884228, "grad_norm": 1.5557721853256226, "learning_rate": 6.8416057390837635e-06, "loss": 0.0562, "step": 235075 }, { "epoch": 3.4623937791785098, "grad_norm": 1.3310322761535645, "learning_rate": 6.839969367116779e-06, "loss": 0.0584, "step": 235100 }, { "epoch": 3.462761962268597, "grad_norm": 1.1937814950942993, "learning_rate": 6.838332995149794e-06, "loss": 0.0591, "step": 235125 }, { "epoch": 3.463130145358684, "grad_norm": 0.9994338750839233, "learning_rate": 6.8366966231828094e-06, "loss": 0.0531, "step": 235150 }, { "epoch": 3.463498328448771, "grad_norm": 1.452646017074585, "learning_rate": 6.835060251215825e-06, "loss": 0.051, "step": 235175 }, { "epoch": 3.463866511538858, "grad_norm": 1.3941121101379395, "learning_rate": 6.833423879248841e-06, "loss": 0.0566, "step": 235200 }, { "epoch": 3.464234694628945, "grad_norm": 0.8465602993965149, "learning_rate": 6.831787507281856e-06, "loss": 0.0569, "step": 235225 }, { "epoch": 3.4646028777190323, "grad_norm": 1.2280335426330566, "learning_rate": 6.830151135314871e-06, "loss": 0.0599, "step": 235250 }, { "epoch": 3.4649710608091193, "grad_norm": 1.3824517726898193, "learning_rate": 6.828514763347887e-06, "loss": 0.0534, "step": 235275 }, { "epoch": 3.4653392438992063, "grad_norm": 1.3251019716262817, "learning_rate": 6.826878391380902e-06, "loss": 0.0491, "step": 235300 }, { "epoch": 3.4657074269892933, "grad_norm": 0.8468957543373108, "learning_rate": 6.825242019413918e-06, "loss": 0.0526, "step": 235325 }, { "epoch": 3.4660756100793804, "grad_norm": 0.9797238111495972, "learning_rate": 6.823605647446933e-06, "loss": 0.0514, "step": 235350 }, { "epoch": 3.4664437931694674, "grad_norm": 1.248177170753479, "learning_rate": 6.821969275479948e-06, "loss": 0.0521, "step": 235375 }, { "epoch": 3.4668119762595544, "grad_norm": 1.6597363948822021, "learning_rate": 6.820332903512964e-06, "loss": 0.0578, "step": 235400 }, { "epoch": 3.4671801593496414, "grad_norm": 1.5750609636306763, "learning_rate": 6.8186965315459795e-06, "loss": 0.0555, "step": 235425 }, { "epoch": 3.4675483424397284, "grad_norm": 2.0586562156677246, "learning_rate": 6.817060159578994e-06, "loss": 0.0505, "step": 235450 }, { "epoch": 3.4679165255298154, "grad_norm": 1.3287327289581299, "learning_rate": 6.81542378761201e-06, "loss": 0.0532, "step": 235475 }, { "epoch": 3.4682847086199025, "grad_norm": 1.8020696640014648, "learning_rate": 6.813787415645025e-06, "loss": 0.0554, "step": 235500 }, { "epoch": 3.4686528917099895, "grad_norm": 1.4357680082321167, "learning_rate": 6.8121510436780416e-06, "loss": 0.0547, "step": 235525 }, { "epoch": 3.4690210748000765, "grad_norm": 1.7434290647506714, "learning_rate": 6.810514671711057e-06, "loss": 0.0505, "step": 235550 }, { "epoch": 3.4693892578901635, "grad_norm": 1.6416116952896118, "learning_rate": 6.808878299744071e-06, "loss": 0.0586, "step": 235575 }, { "epoch": 3.4697574409802505, "grad_norm": 1.632177472114563, "learning_rate": 6.8072419277770875e-06, "loss": 0.0547, "step": 235600 }, { "epoch": 3.4701256240703375, "grad_norm": 1.4830220937728882, "learning_rate": 6.805605555810103e-06, "loss": 0.0541, "step": 235625 }, { "epoch": 3.4704938071604245, "grad_norm": 1.9248520135879517, "learning_rate": 6.803969183843119e-06, "loss": 0.0521, "step": 235650 }, { "epoch": 3.470861990250512, "grad_norm": 1.7081308364868164, "learning_rate": 6.802332811876133e-06, "loss": 0.0511, "step": 235675 }, { "epoch": 3.471230173340599, "grad_norm": 0.9971753358840942, "learning_rate": 6.800696439909149e-06, "loss": 0.0474, "step": 235700 }, { "epoch": 3.471598356430686, "grad_norm": 1.6527999639511108, "learning_rate": 6.799060067942165e-06, "loss": 0.0574, "step": 235725 }, { "epoch": 3.471966539520773, "grad_norm": 1.3607879877090454, "learning_rate": 6.79742369597518e-06, "loss": 0.055, "step": 235750 }, { "epoch": 3.47233472261086, "grad_norm": 1.4134190082550049, "learning_rate": 6.7957873240081954e-06, "loss": 0.0532, "step": 235775 }, { "epoch": 3.472702905700947, "grad_norm": 0.9669596552848816, "learning_rate": 6.794150952041211e-06, "loss": 0.062, "step": 235800 }, { "epoch": 3.473071088791034, "grad_norm": 0.6366238594055176, "learning_rate": 6.792514580074227e-06, "loss": 0.054, "step": 235825 }, { "epoch": 3.473439271881121, "grad_norm": 1.2645734548568726, "learning_rate": 6.790878208107242e-06, "loss": 0.0629, "step": 235850 }, { "epoch": 3.473807454971208, "grad_norm": 1.1953133344650269, "learning_rate": 6.789241836140257e-06, "loss": 0.053, "step": 235875 }, { "epoch": 3.474175638061295, "grad_norm": 1.2158385515213013, "learning_rate": 6.787605464173273e-06, "loss": 0.0557, "step": 235900 }, { "epoch": 3.474543821151382, "grad_norm": 0.9192224740982056, "learning_rate": 6.785969092206288e-06, "loss": 0.0551, "step": 235925 }, { "epoch": 3.474912004241469, "grad_norm": 1.5386022329330444, "learning_rate": 6.784332720239304e-06, "loss": 0.0591, "step": 235950 }, { "epoch": 3.475280187331556, "grad_norm": 1.353676438331604, "learning_rate": 6.782696348272319e-06, "loss": 0.0535, "step": 235975 }, { "epoch": 3.475648370421643, "grad_norm": 1.5959298610687256, "learning_rate": 6.781059976305334e-06, "loss": 0.052, "step": 236000 }, { "epoch": 3.4760165535117302, "grad_norm": 1.2088018655776978, "learning_rate": 6.77942360433835e-06, "loss": 0.0528, "step": 236025 }, { "epoch": 3.4763847366018172, "grad_norm": 1.6774694919586182, "learning_rate": 6.7777872323713655e-06, "loss": 0.0543, "step": 236050 }, { "epoch": 3.4767529196919043, "grad_norm": 1.322643756866455, "learning_rate": 6.776150860404382e-06, "loss": 0.061, "step": 236075 }, { "epoch": 3.4771211027819913, "grad_norm": 1.6292881965637207, "learning_rate": 6.774514488437396e-06, "loss": 0.0585, "step": 236100 }, { "epoch": 3.4774892858720783, "grad_norm": 1.2966444492340088, "learning_rate": 6.772878116470411e-06, "loss": 0.0526, "step": 236125 }, { "epoch": 3.4778574689621653, "grad_norm": 1.436832308769226, "learning_rate": 6.7712417445034276e-06, "loss": 0.0562, "step": 236150 }, { "epoch": 3.4782256520522523, "grad_norm": 1.6515053510665894, "learning_rate": 6.769605372536443e-06, "loss": 0.0509, "step": 236175 }, { "epoch": 3.4785938351423398, "grad_norm": 1.1039272546768188, "learning_rate": 6.767969000569457e-06, "loss": 0.0535, "step": 236200 }, { "epoch": 3.478962018232427, "grad_norm": 1.2447068691253662, "learning_rate": 6.7663326286024735e-06, "loss": 0.0556, "step": 236225 }, { "epoch": 3.479330201322514, "grad_norm": 1.0118263959884644, "learning_rate": 6.764696256635489e-06, "loss": 0.0563, "step": 236250 }, { "epoch": 3.479698384412601, "grad_norm": 1.262508511543274, "learning_rate": 6.763059884668505e-06, "loss": 0.0511, "step": 236275 }, { "epoch": 3.480066567502688, "grad_norm": 1.5445690155029297, "learning_rate": 6.761423512701519e-06, "loss": 0.0547, "step": 236300 }, { "epoch": 3.480434750592775, "grad_norm": 0.6799076199531555, "learning_rate": 6.759787140734535e-06, "loss": 0.0556, "step": 236325 }, { "epoch": 3.480802933682862, "grad_norm": 0.9605331420898438, "learning_rate": 6.758150768767551e-06, "loss": 0.0499, "step": 236350 }, { "epoch": 3.481171116772949, "grad_norm": 0.9172486662864685, "learning_rate": 6.756514396800566e-06, "loss": 0.0504, "step": 236375 }, { "epoch": 3.481539299863036, "grad_norm": 1.4085545539855957, "learning_rate": 6.754878024833581e-06, "loss": 0.0539, "step": 236400 }, { "epoch": 3.481907482953123, "grad_norm": 1.6286646127700806, "learning_rate": 6.753241652866597e-06, "loss": 0.0559, "step": 236425 }, { "epoch": 3.48227566604321, "grad_norm": 1.5250663757324219, "learning_rate": 6.751605280899612e-06, "loss": 0.0565, "step": 236450 }, { "epoch": 3.482643849133297, "grad_norm": 1.1588718891143799, "learning_rate": 6.749968908932628e-06, "loss": 0.0576, "step": 236475 }, { "epoch": 3.483012032223384, "grad_norm": 1.209121584892273, "learning_rate": 6.7483325369656435e-06, "loss": 0.0554, "step": 236500 }, { "epoch": 3.483380215313471, "grad_norm": 1.5117765665054321, "learning_rate": 6.746696164998659e-06, "loss": 0.0532, "step": 236525 }, { "epoch": 3.483748398403558, "grad_norm": 1.2792174816131592, "learning_rate": 6.745059793031674e-06, "loss": 0.0592, "step": 236550 }, { "epoch": 3.484116581493645, "grad_norm": 0.8825824856758118, "learning_rate": 6.7434234210646894e-06, "loss": 0.0546, "step": 236575 }, { "epoch": 3.484484764583732, "grad_norm": 1.230028510093689, "learning_rate": 6.741787049097706e-06, "loss": 0.054, "step": 236600 }, { "epoch": 3.4848529476738195, "grad_norm": 1.2112821340560913, "learning_rate": 6.74015067713072e-06, "loss": 0.0623, "step": 236625 }, { "epoch": 3.4852211307639065, "grad_norm": 1.7815861701965332, "learning_rate": 6.738514305163736e-06, "loss": 0.0515, "step": 236650 }, { "epoch": 3.4855893138539935, "grad_norm": 1.379918098449707, "learning_rate": 6.7368779331967515e-06, "loss": 0.0598, "step": 236675 }, { "epoch": 3.4859574969440805, "grad_norm": 1.303359031677246, "learning_rate": 6.735241561229768e-06, "loss": 0.0574, "step": 236700 }, { "epoch": 3.4863256800341675, "grad_norm": 1.2450717687606812, "learning_rate": 6.733605189262782e-06, "loss": 0.0552, "step": 236725 }, { "epoch": 3.4866938631242546, "grad_norm": 1.3029987812042236, "learning_rate": 6.731968817295797e-06, "loss": 0.0528, "step": 236750 }, { "epoch": 3.4870620462143416, "grad_norm": 1.0911694765090942, "learning_rate": 6.7303324453288136e-06, "loss": 0.0572, "step": 236775 }, { "epoch": 3.4874302293044286, "grad_norm": 1.6581034660339355, "learning_rate": 6.728696073361829e-06, "loss": 0.0532, "step": 236800 }, { "epoch": 3.4877984123945156, "grad_norm": 1.311486005783081, "learning_rate": 6.727059701394843e-06, "loss": 0.054, "step": 236825 }, { "epoch": 3.4881665954846026, "grad_norm": 1.219870924949646, "learning_rate": 6.7254233294278595e-06, "loss": 0.0646, "step": 236850 }, { "epoch": 3.4885347785746896, "grad_norm": 1.3772090673446655, "learning_rate": 6.723786957460875e-06, "loss": 0.0509, "step": 236875 }, { "epoch": 3.4889029616647766, "grad_norm": 1.1852127313613892, "learning_rate": 6.722150585493891e-06, "loss": 0.0579, "step": 236900 }, { "epoch": 3.4892711447548637, "grad_norm": 1.2634761333465576, "learning_rate": 6.720514213526905e-06, "loss": 0.0578, "step": 236925 }, { "epoch": 3.4896393278449507, "grad_norm": 1.434631586074829, "learning_rate": 6.718877841559921e-06, "loss": 0.0497, "step": 236950 }, { "epoch": 3.4900075109350377, "grad_norm": 1.5747640132904053, "learning_rate": 6.717241469592937e-06, "loss": 0.0523, "step": 236975 }, { "epoch": 3.4903756940251247, "grad_norm": 1.2704558372497559, "learning_rate": 6.715605097625952e-06, "loss": 0.0645, "step": 237000 }, { "epoch": 3.4907438771152117, "grad_norm": 1.4632608890533447, "learning_rate": 6.713968725658968e-06, "loss": 0.0531, "step": 237025 }, { "epoch": 3.4911120602052987, "grad_norm": 1.9516022205352783, "learning_rate": 6.712332353691983e-06, "loss": 0.0571, "step": 237050 }, { "epoch": 3.4914802432953858, "grad_norm": 1.6178995370864868, "learning_rate": 6.710695981724998e-06, "loss": 0.0499, "step": 237075 }, { "epoch": 3.4918484263854728, "grad_norm": 1.7712401151657104, "learning_rate": 6.709059609758014e-06, "loss": 0.0619, "step": 237100 }, { "epoch": 3.49221660947556, "grad_norm": 1.120568871498108, "learning_rate": 6.707488692669709e-06, "loss": 0.0592, "step": 237125 }, { "epoch": 3.492584792565647, "grad_norm": 1.1097540855407715, "learning_rate": 6.705852320702723e-06, "loss": 0.0571, "step": 237150 }, { "epoch": 3.4929529756557343, "grad_norm": 1.0537636280059814, "learning_rate": 6.7042159487357395e-06, "loss": 0.0554, "step": 237175 }, { "epoch": 3.4933211587458213, "grad_norm": 1.3439337015151978, "learning_rate": 6.702579576768755e-06, "loss": 0.0588, "step": 237200 }, { "epoch": 3.4936893418359083, "grad_norm": 1.8948816061019897, "learning_rate": 6.700943204801771e-06, "loss": 0.0539, "step": 237225 }, { "epoch": 3.4940575249259953, "grad_norm": 1.3520216941833496, "learning_rate": 6.6993068328347855e-06, "loss": 0.0602, "step": 237250 }, { "epoch": 3.4944257080160823, "grad_norm": 1.1100796461105347, "learning_rate": 6.697670460867801e-06, "loss": 0.0537, "step": 237275 }, { "epoch": 3.4947938911061693, "grad_norm": 1.159147024154663, "learning_rate": 6.696034088900817e-06, "loss": 0.0618, "step": 237300 }, { "epoch": 3.4951620741962564, "grad_norm": 1.400046467781067, "learning_rate": 6.694397716933832e-06, "loss": 0.0524, "step": 237325 }, { "epoch": 3.4955302572863434, "grad_norm": 1.0062354803085327, "learning_rate": 6.6927613449668475e-06, "loss": 0.0512, "step": 237350 }, { "epoch": 3.4958984403764304, "grad_norm": 1.3053812980651855, "learning_rate": 6.691124972999863e-06, "loss": 0.0503, "step": 237375 }, { "epoch": 3.4962666234665174, "grad_norm": 1.707844853401184, "learning_rate": 6.689488601032879e-06, "loss": 0.0567, "step": 237400 }, { "epoch": 3.4966348065566044, "grad_norm": 1.378909707069397, "learning_rate": 6.687852229065894e-06, "loss": 0.0561, "step": 237425 }, { "epoch": 3.4970029896466914, "grad_norm": 1.0125243663787842, "learning_rate": 6.68621585709891e-06, "loss": 0.0493, "step": 237450 }, { "epoch": 3.4973711727367784, "grad_norm": 1.4544315338134766, "learning_rate": 6.684579485131925e-06, "loss": 0.0628, "step": 237475 }, { "epoch": 3.4977393558268655, "grad_norm": 1.692452073097229, "learning_rate": 6.68294311316494e-06, "loss": 0.0542, "step": 237500 }, { "epoch": 3.4981075389169525, "grad_norm": 1.0853323936462402, "learning_rate": 6.681306741197956e-06, "loss": 0.0563, "step": 237525 }, { "epoch": 3.4984757220070395, "grad_norm": 1.0275168418884277, "learning_rate": 6.679670369230972e-06, "loss": 0.0589, "step": 237550 }, { "epoch": 3.4988439050971265, "grad_norm": 2.008805274963379, "learning_rate": 6.678033997263986e-06, "loss": 0.0589, "step": 237575 }, { "epoch": 3.499212088187214, "grad_norm": 1.8501317501068115, "learning_rate": 6.676397625297002e-06, "loss": 0.0563, "step": 237600 }, { "epoch": 3.499580271277301, "grad_norm": 1.7143025398254395, "learning_rate": 6.6747612533300176e-06, "loss": 0.0553, "step": 237625 }, { "epoch": 3.499948454367388, "grad_norm": 1.7455675601959229, "learning_rate": 6.673124881363034e-06, "loss": 0.0596, "step": 237650 }, { "epoch": 3.500316637457475, "grad_norm": 1.641186237335205, "learning_rate": 6.671488509396048e-06, "loss": 0.0578, "step": 237675 }, { "epoch": 3.500684820547562, "grad_norm": 1.3982226848602295, "learning_rate": 6.6698521374290635e-06, "loss": 0.0524, "step": 237700 }, { "epoch": 3.501053003637649, "grad_norm": 1.5200787782669067, "learning_rate": 6.66821576546208e-06, "loss": 0.0542, "step": 237725 }, { "epoch": 3.501421186727736, "grad_norm": 1.4995650053024292, "learning_rate": 6.666579393495095e-06, "loss": 0.0483, "step": 237750 }, { "epoch": 3.501789369817823, "grad_norm": 1.4795305728912354, "learning_rate": 6.664943021528109e-06, "loss": 0.0564, "step": 237775 }, { "epoch": 3.50215755290791, "grad_norm": 1.0468206405639648, "learning_rate": 6.6633066495611255e-06, "loss": 0.0516, "step": 237800 }, { "epoch": 3.502525735997997, "grad_norm": 1.4110145568847656, "learning_rate": 6.661670277594141e-06, "loss": 0.0537, "step": 237825 }, { "epoch": 3.502893919088084, "grad_norm": 1.5681344270706177, "learning_rate": 6.660033905627157e-06, "loss": 0.0568, "step": 237850 }, { "epoch": 3.503262102178171, "grad_norm": 1.2710176706314087, "learning_rate": 6.6583975336601715e-06, "loss": 0.0527, "step": 237875 }, { "epoch": 3.503630285268258, "grad_norm": 1.0221668481826782, "learning_rate": 6.656761161693187e-06, "loss": 0.0497, "step": 237900 }, { "epoch": 3.503998468358345, "grad_norm": 1.3735289573669434, "learning_rate": 6.655124789726203e-06, "loss": 0.0543, "step": 237925 }, { "epoch": 3.504366651448432, "grad_norm": 1.8345123529434204, "learning_rate": 6.653488417759218e-06, "loss": 0.0564, "step": 237950 }, { "epoch": 3.504734834538519, "grad_norm": 1.3793649673461914, "learning_rate": 6.651852045792234e-06, "loss": 0.0598, "step": 237975 }, { "epoch": 3.505103017628606, "grad_norm": 0.896697998046875, "learning_rate": 6.650215673825249e-06, "loss": 0.0555, "step": 238000 }, { "epoch": 3.5054712007186932, "grad_norm": 1.7305998802185059, "learning_rate": 6.648579301858264e-06, "loss": 0.0489, "step": 238025 }, { "epoch": 3.5058393838087802, "grad_norm": 1.2747411727905273, "learning_rate": 6.64694292989128e-06, "loss": 0.0592, "step": 238050 }, { "epoch": 3.5062075668988673, "grad_norm": 1.5657886266708374, "learning_rate": 6.645306557924296e-06, "loss": 0.0609, "step": 238075 }, { "epoch": 3.5065757499889543, "grad_norm": 1.0818257331848145, "learning_rate": 6.643670185957311e-06, "loss": 0.0587, "step": 238100 }, { "epoch": 3.5069439330790413, "grad_norm": 1.3494561910629272, "learning_rate": 6.642033813990326e-06, "loss": 0.0607, "step": 238125 }, { "epoch": 3.5073121161691287, "grad_norm": 1.1361825466156006, "learning_rate": 6.6403974420233415e-06, "loss": 0.0543, "step": 238150 }, { "epoch": 3.5076802992592158, "grad_norm": 1.1191574335098267, "learning_rate": 6.638761070056358e-06, "loss": 0.0476, "step": 238175 }, { "epoch": 3.5080484823493028, "grad_norm": 1.2232915163040161, "learning_rate": 6.637124698089372e-06, "loss": 0.0529, "step": 238200 }, { "epoch": 3.50841666543939, "grad_norm": 0.8323943614959717, "learning_rate": 6.635488326122388e-06, "loss": 0.0472, "step": 238225 }, { "epoch": 3.508784848529477, "grad_norm": 1.4062432050704956, "learning_rate": 6.6338519541554036e-06, "loss": 0.0613, "step": 238250 }, { "epoch": 3.509153031619564, "grad_norm": 1.2244796752929688, "learning_rate": 6.63221558218842e-06, "loss": 0.0579, "step": 238275 }, { "epoch": 3.509521214709651, "grad_norm": 1.6239651441574097, "learning_rate": 6.630579210221434e-06, "loss": 0.0549, "step": 238300 }, { "epoch": 3.509889397799738, "grad_norm": 1.101367473602295, "learning_rate": 6.6289428382544495e-06, "loss": 0.0517, "step": 238325 }, { "epoch": 3.510257580889825, "grad_norm": 1.7837995290756226, "learning_rate": 6.627306466287466e-06, "loss": 0.0539, "step": 238350 }, { "epoch": 3.510625763979912, "grad_norm": 1.1036487817764282, "learning_rate": 6.625670094320481e-06, "loss": 0.0531, "step": 238375 }, { "epoch": 3.510993947069999, "grad_norm": 1.0700322389602661, "learning_rate": 6.624033722353495e-06, "loss": 0.0537, "step": 238400 }, { "epoch": 3.511362130160086, "grad_norm": 1.2576203346252441, "learning_rate": 6.6223973503865115e-06, "loss": 0.0534, "step": 238425 }, { "epoch": 3.511730313250173, "grad_norm": 0.8141782879829407, "learning_rate": 6.620760978419527e-06, "loss": 0.0484, "step": 238450 }, { "epoch": 3.51209849634026, "grad_norm": 1.3504191637039185, "learning_rate": 6.619124606452543e-06, "loss": 0.0566, "step": 238475 }, { "epoch": 3.512466679430347, "grad_norm": 1.2083675861358643, "learning_rate": 6.617488234485558e-06, "loss": 0.0545, "step": 238500 }, { "epoch": 3.5128348625204344, "grad_norm": 1.0957146883010864, "learning_rate": 6.615851862518573e-06, "loss": 0.055, "step": 238525 }, { "epoch": 3.5132030456105214, "grad_norm": 1.055159568786621, "learning_rate": 6.614215490551589e-06, "loss": 0.0535, "step": 238550 }, { "epoch": 3.5135712287006085, "grad_norm": 1.4225109815597534, "learning_rate": 6.612579118584604e-06, "loss": 0.0529, "step": 238575 }, { "epoch": 3.5139394117906955, "grad_norm": 1.0083485841751099, "learning_rate": 6.61094274661762e-06, "loss": 0.0585, "step": 238600 }, { "epoch": 3.5143075948807825, "grad_norm": 0.8519147038459778, "learning_rate": 6.609306374650635e-06, "loss": 0.0564, "step": 238625 }, { "epoch": 3.5146757779708695, "grad_norm": 1.3270843029022217, "learning_rate": 6.60767000268365e-06, "loss": 0.0536, "step": 238650 }, { "epoch": 3.5150439610609565, "grad_norm": 1.1831995248794556, "learning_rate": 6.606033630716666e-06, "loss": 0.0545, "step": 238675 }, { "epoch": 3.5154121441510435, "grad_norm": 2.0273499488830566, "learning_rate": 6.604397258749682e-06, "loss": 0.0581, "step": 238700 }, { "epoch": 3.5157803272411305, "grad_norm": 1.365114688873291, "learning_rate": 6.602760886782696e-06, "loss": 0.0555, "step": 238725 }, { "epoch": 3.5161485103312176, "grad_norm": 1.4722472429275513, "learning_rate": 6.601124514815712e-06, "loss": 0.0558, "step": 238750 }, { "epoch": 3.5165166934213046, "grad_norm": 1.4480513334274292, "learning_rate": 6.5994881428487275e-06, "loss": 0.0493, "step": 238775 }, { "epoch": 3.5168848765113916, "grad_norm": 1.4381823539733887, "learning_rate": 6.597851770881744e-06, "loss": 0.0626, "step": 238800 }, { "epoch": 3.5172530596014786, "grad_norm": 0.95949786901474, "learning_rate": 6.596215398914758e-06, "loss": 0.0631, "step": 238825 }, { "epoch": 3.5176212426915656, "grad_norm": 1.0725449323654175, "learning_rate": 6.594579026947774e-06, "loss": 0.052, "step": 238850 }, { "epoch": 3.5179894257816526, "grad_norm": 1.4058222770690918, "learning_rate": 6.5929426549807896e-06, "loss": 0.0502, "step": 238875 }, { "epoch": 3.5183576088717397, "grad_norm": 1.219465732574463, "learning_rate": 6.591306283013805e-06, "loss": 0.0506, "step": 238900 }, { "epoch": 3.5187257919618267, "grad_norm": 1.4697891473770142, "learning_rate": 6.589669911046821e-06, "loss": 0.0483, "step": 238925 }, { "epoch": 3.5190939750519137, "grad_norm": 1.4222756624221802, "learning_rate": 6.5880335390798355e-06, "loss": 0.0543, "step": 238950 }, { "epoch": 3.5194621581420007, "grad_norm": 1.1545531749725342, "learning_rate": 6.586397167112852e-06, "loss": 0.0513, "step": 238975 }, { "epoch": 3.5198303412320877, "grad_norm": 1.1878083944320679, "learning_rate": 6.584760795145867e-06, "loss": 0.0504, "step": 239000 }, { "epoch": 3.5201985243221747, "grad_norm": 1.4583110809326172, "learning_rate": 6.583124423178883e-06, "loss": 0.0561, "step": 239025 }, { "epoch": 3.5205667074122617, "grad_norm": 1.1590267419815063, "learning_rate": 6.5814880512118975e-06, "loss": 0.0593, "step": 239050 }, { "epoch": 3.5209348905023488, "grad_norm": 1.353603482246399, "learning_rate": 6.579851679244913e-06, "loss": 0.0522, "step": 239075 }, { "epoch": 3.5213030735924358, "grad_norm": 1.6820868253707886, "learning_rate": 6.578215307277929e-06, "loss": 0.058, "step": 239100 }, { "epoch": 3.5216712566825232, "grad_norm": 1.0427802801132202, "learning_rate": 6.576578935310944e-06, "loss": 0.0495, "step": 239125 }, { "epoch": 3.5220394397726102, "grad_norm": 1.4165109395980835, "learning_rate": 6.574942563343959e-06, "loss": 0.0517, "step": 239150 }, { "epoch": 3.5224076228626973, "grad_norm": 1.5403512716293335, "learning_rate": 6.573306191376975e-06, "loss": 0.0607, "step": 239175 }, { "epoch": 3.5227758059527843, "grad_norm": 1.482560634613037, "learning_rate": 6.57166981940999e-06, "loss": 0.058, "step": 239200 }, { "epoch": 3.5231439890428713, "grad_norm": 1.7138524055480957, "learning_rate": 6.570033447443006e-06, "loss": 0.0581, "step": 239225 }, { "epoch": 3.5235121721329583, "grad_norm": 1.4876329898834229, "learning_rate": 6.568397075476021e-06, "loss": 0.0556, "step": 239250 }, { "epoch": 3.5238803552230453, "grad_norm": 1.2482026815414429, "learning_rate": 6.566760703509036e-06, "loss": 0.0603, "step": 239275 }, { "epoch": 3.5242485383131323, "grad_norm": 1.320499062538147, "learning_rate": 6.565124331542052e-06, "loss": 0.0525, "step": 239300 }, { "epoch": 3.5246167214032194, "grad_norm": 1.2832379341125488, "learning_rate": 6.563553414453747e-06, "loss": 0.0659, "step": 239325 }, { "epoch": 3.5249849044933064, "grad_norm": 1.1847057342529297, "learning_rate": 6.5619170424867615e-06, "loss": 0.0519, "step": 239350 }, { "epoch": 3.5253530875833934, "grad_norm": 1.2214913368225098, "learning_rate": 6.560280670519778e-06, "loss": 0.06, "step": 239375 }, { "epoch": 3.5257212706734804, "grad_norm": 1.207951307296753, "learning_rate": 6.558644298552793e-06, "loss": 0.0469, "step": 239400 }, { "epoch": 3.5260894537635674, "grad_norm": 1.6714341640472412, "learning_rate": 6.557007926585809e-06, "loss": 0.0599, "step": 239425 }, { "epoch": 3.5264576368536544, "grad_norm": 1.8564172983169556, "learning_rate": 6.555371554618824e-06, "loss": 0.0528, "step": 239450 }, { "epoch": 3.526825819943742, "grad_norm": 1.068618655204773, "learning_rate": 6.553735182651839e-06, "loss": 0.051, "step": 239475 }, { "epoch": 3.527194003033829, "grad_norm": 1.001126766204834, "learning_rate": 6.552098810684855e-06, "loss": 0.0622, "step": 239500 }, { "epoch": 3.527562186123916, "grad_norm": 1.6345113515853882, "learning_rate": 6.55046243871787e-06, "loss": 0.0506, "step": 239525 }, { "epoch": 3.527930369214003, "grad_norm": 1.3190492391586304, "learning_rate": 6.5488260667508864e-06, "loss": 0.0554, "step": 239550 }, { "epoch": 3.52829855230409, "grad_norm": 1.4491008520126343, "learning_rate": 6.547189694783901e-06, "loss": 0.0598, "step": 239575 }, { "epoch": 3.528666735394177, "grad_norm": 1.3110566139221191, "learning_rate": 6.545553322816916e-06, "loss": 0.0538, "step": 239600 }, { "epoch": 3.529034918484264, "grad_norm": 1.1355332136154175, "learning_rate": 6.543916950849932e-06, "loss": 0.0545, "step": 239625 }, { "epoch": 3.529403101574351, "grad_norm": 0.5813958644866943, "learning_rate": 6.542280578882948e-06, "loss": 0.0528, "step": 239650 }, { "epoch": 3.529771284664438, "grad_norm": 1.1517421007156372, "learning_rate": 6.540644206915963e-06, "loss": 0.0488, "step": 239675 }, { "epoch": 3.530139467754525, "grad_norm": 1.0873079299926758, "learning_rate": 6.539007834948978e-06, "loss": 0.0519, "step": 239700 }, { "epoch": 3.530507650844612, "grad_norm": 1.235602855682373, "learning_rate": 6.5373714629819936e-06, "loss": 0.0562, "step": 239725 }, { "epoch": 3.530875833934699, "grad_norm": 1.028826117515564, "learning_rate": 6.53573509101501e-06, "loss": 0.0544, "step": 239750 }, { "epoch": 3.531244017024786, "grad_norm": 1.5451363325119019, "learning_rate": 6.534098719048024e-06, "loss": 0.0528, "step": 239775 }, { "epoch": 3.531612200114873, "grad_norm": 1.3391128778457642, "learning_rate": 6.53246234708104e-06, "loss": 0.0519, "step": 239800 }, { "epoch": 3.53198038320496, "grad_norm": 1.2500059604644775, "learning_rate": 6.530825975114056e-06, "loss": 0.0533, "step": 239825 }, { "epoch": 3.532348566295047, "grad_norm": 1.6858341693878174, "learning_rate": 6.529189603147072e-06, "loss": 0.0555, "step": 239850 }, { "epoch": 3.532716749385134, "grad_norm": 1.4265520572662354, "learning_rate": 6.527553231180086e-06, "loss": 0.0515, "step": 239875 }, { "epoch": 3.533084932475221, "grad_norm": 1.5402430295944214, "learning_rate": 6.5259168592131015e-06, "loss": 0.0594, "step": 239900 }, { "epoch": 3.533453115565308, "grad_norm": 1.5277717113494873, "learning_rate": 6.524280487246118e-06, "loss": 0.05, "step": 239925 }, { "epoch": 3.533821298655395, "grad_norm": 1.59786057472229, "learning_rate": 6.522644115279133e-06, "loss": 0.0512, "step": 239950 }, { "epoch": 3.534189481745482, "grad_norm": 1.531436562538147, "learning_rate": 6.521007743312149e-06, "loss": 0.0573, "step": 239975 }, { "epoch": 3.534557664835569, "grad_norm": 1.2132772207260132, "learning_rate": 6.519371371345164e-06, "loss": 0.0581, "step": 240000 }, { "epoch": 3.5349258479256562, "grad_norm": 0.9268856048583984, "learning_rate": 6.517734999378179e-06, "loss": 0.0573, "step": 240025 }, { "epoch": 3.5352940310157432, "grad_norm": 1.2429592609405518, "learning_rate": 6.516098627411195e-06, "loss": 0.0555, "step": 240050 }, { "epoch": 3.5356622141058307, "grad_norm": 1.0812588930130005, "learning_rate": 6.51446225544421e-06, "loss": 0.0512, "step": 240075 }, { "epoch": 3.5360303971959177, "grad_norm": 1.0637778043746948, "learning_rate": 6.512825883477225e-06, "loss": 0.0523, "step": 240100 }, { "epoch": 3.5363985802860047, "grad_norm": 1.4924452304840088, "learning_rate": 6.511189511510241e-06, "loss": 0.057, "step": 240125 }, { "epoch": 3.5367667633760917, "grad_norm": 1.6891506910324097, "learning_rate": 6.509553139543256e-06, "loss": 0.0567, "step": 240150 }, { "epoch": 3.5371349464661788, "grad_norm": 1.0667810440063477, "learning_rate": 6.5079167675762724e-06, "loss": 0.0528, "step": 240175 }, { "epoch": 3.537503129556266, "grad_norm": 1.1576346158981323, "learning_rate": 6.506280395609287e-06, "loss": 0.0541, "step": 240200 }, { "epoch": 3.537871312646353, "grad_norm": 1.3905154466629028, "learning_rate": 6.504644023642302e-06, "loss": 0.0487, "step": 240225 }, { "epoch": 3.53823949573644, "grad_norm": 1.4382922649383545, "learning_rate": 6.503007651675318e-06, "loss": 0.0548, "step": 240250 }, { "epoch": 3.538607678826527, "grad_norm": 1.3392270803451538, "learning_rate": 6.501371279708334e-06, "loss": 0.0572, "step": 240275 }, { "epoch": 3.538975861916614, "grad_norm": 1.4199312925338745, "learning_rate": 6.499734907741348e-06, "loss": 0.0554, "step": 240300 }, { "epoch": 3.539344045006701, "grad_norm": 1.397796630859375, "learning_rate": 6.498098535774364e-06, "loss": 0.0556, "step": 240325 }, { "epoch": 3.539712228096788, "grad_norm": 1.3609561920166016, "learning_rate": 6.4964621638073796e-06, "loss": 0.0484, "step": 240350 }, { "epoch": 3.540080411186875, "grad_norm": 1.6534154415130615, "learning_rate": 6.494825791840396e-06, "loss": 0.0479, "step": 240375 }, { "epoch": 3.540448594276962, "grad_norm": 1.191688895225525, "learning_rate": 6.493189419873411e-06, "loss": 0.0503, "step": 240400 }, { "epoch": 3.5408167773670494, "grad_norm": 1.6409965753555298, "learning_rate": 6.491553047906426e-06, "loss": 0.0526, "step": 240425 }, { "epoch": 3.5411849604571364, "grad_norm": 1.658474087715149, "learning_rate": 6.489916675939442e-06, "loss": 0.0573, "step": 240450 }, { "epoch": 3.5415531435472234, "grad_norm": 1.2004069089889526, "learning_rate": 6.488280303972457e-06, "loss": 0.055, "step": 240475 }, { "epoch": 3.5419213266373104, "grad_norm": 1.3615010976791382, "learning_rate": 6.486643932005473e-06, "loss": 0.0617, "step": 240500 }, { "epoch": 3.5422895097273974, "grad_norm": 1.157260775566101, "learning_rate": 6.4850075600384875e-06, "loss": 0.0551, "step": 240525 }, { "epoch": 3.5426576928174844, "grad_norm": 1.0880311727523804, "learning_rate": 6.483371188071504e-06, "loss": 0.0488, "step": 240550 }, { "epoch": 3.5430258759075715, "grad_norm": 1.7445656061172485, "learning_rate": 6.481734816104519e-06, "loss": 0.0542, "step": 240575 }, { "epoch": 3.5433940589976585, "grad_norm": 1.377503514289856, "learning_rate": 6.480098444137535e-06, "loss": 0.0509, "step": 240600 }, { "epoch": 3.5437622420877455, "grad_norm": 1.4644700288772583, "learning_rate": 6.47846207217055e-06, "loss": 0.0538, "step": 240625 }, { "epoch": 3.5441304251778325, "grad_norm": 1.2659029960632324, "learning_rate": 6.476825700203565e-06, "loss": 0.0516, "step": 240650 }, { "epoch": 3.5444986082679195, "grad_norm": 1.73847234249115, "learning_rate": 6.475189328236581e-06, "loss": 0.0577, "step": 240675 }, { "epoch": 3.5448667913580065, "grad_norm": 1.5496197938919067, "learning_rate": 6.473552956269596e-06, "loss": 0.055, "step": 240700 }, { "epoch": 3.5452349744480935, "grad_norm": 1.3950140476226807, "learning_rate": 6.471916584302611e-06, "loss": 0.0492, "step": 240725 }, { "epoch": 3.5456031575381806, "grad_norm": 1.6770752668380737, "learning_rate": 6.470280212335627e-06, "loss": 0.0542, "step": 240750 }, { "epoch": 3.5459713406282676, "grad_norm": 1.4783209562301636, "learning_rate": 6.468643840368642e-06, "loss": 0.0499, "step": 240775 }, { "epoch": 3.5463395237183546, "grad_norm": 1.030889630317688, "learning_rate": 6.4670074684016584e-06, "loss": 0.055, "step": 240800 }, { "epoch": 3.5467077068084416, "grad_norm": 1.4858734607696533, "learning_rate": 6.465371096434673e-06, "loss": 0.0559, "step": 240825 }, { "epoch": 3.5470758898985286, "grad_norm": 1.286137342453003, "learning_rate": 6.463734724467688e-06, "loss": 0.0516, "step": 240850 }, { "epoch": 3.5474440729886156, "grad_norm": 1.4824665784835815, "learning_rate": 6.462098352500704e-06, "loss": 0.0542, "step": 240875 }, { "epoch": 3.5478122560787027, "grad_norm": 1.2878632545471191, "learning_rate": 6.46046198053372e-06, "loss": 0.0573, "step": 240900 }, { "epoch": 3.5481804391687897, "grad_norm": 1.1780720949172974, "learning_rate": 6.458825608566736e-06, "loss": 0.0547, "step": 240925 }, { "epoch": 3.5485486222588767, "grad_norm": 0.6847298741340637, "learning_rate": 6.45718923659975e-06, "loss": 0.0529, "step": 240950 }, { "epoch": 3.5489168053489637, "grad_norm": 1.3146413564682007, "learning_rate": 6.4555528646327656e-06, "loss": 0.0492, "step": 240975 }, { "epoch": 3.5492849884390507, "grad_norm": 1.2977025508880615, "learning_rate": 6.453916492665782e-06, "loss": 0.0533, "step": 241000 }, { "epoch": 3.549653171529138, "grad_norm": 1.5004416704177856, "learning_rate": 6.452280120698797e-06, "loss": 0.0489, "step": 241025 }, { "epoch": 3.550021354619225, "grad_norm": 1.2740706205368042, "learning_rate": 6.4506437487318115e-06, "loss": 0.0594, "step": 241050 }, { "epoch": 3.550389537709312, "grad_norm": 1.189389944076538, "learning_rate": 6.449007376764828e-06, "loss": 0.055, "step": 241075 }, { "epoch": 3.550757720799399, "grad_norm": 1.220145344734192, "learning_rate": 6.447371004797843e-06, "loss": 0.0525, "step": 241100 }, { "epoch": 3.5511259038894862, "grad_norm": 1.157399296760559, "learning_rate": 6.445734632830859e-06, "loss": 0.0537, "step": 241125 }, { "epoch": 3.5514940869795733, "grad_norm": 0.7158670425415039, "learning_rate": 6.4440982608638735e-06, "loss": 0.0544, "step": 241150 }, { "epoch": 3.5518622700696603, "grad_norm": 0.9459164142608643, "learning_rate": 6.44246188889689e-06, "loss": 0.0542, "step": 241175 }, { "epoch": 3.5522304531597473, "grad_norm": 1.2373384237289429, "learning_rate": 6.440825516929905e-06, "loss": 0.0583, "step": 241200 }, { "epoch": 3.5525986362498343, "grad_norm": 1.261459231376648, "learning_rate": 6.43918914496292e-06, "loss": 0.051, "step": 241225 }, { "epoch": 3.5529668193399213, "grad_norm": 1.8835111856460571, "learning_rate": 6.437552772995936e-06, "loss": 0.061, "step": 241250 }, { "epoch": 3.5533350024300083, "grad_norm": 0.9285681843757629, "learning_rate": 6.435916401028951e-06, "loss": 0.0508, "step": 241275 }, { "epoch": 3.5537031855200953, "grad_norm": 1.9005039930343628, "learning_rate": 6.434280029061967e-06, "loss": 0.0538, "step": 241300 }, { "epoch": 3.5540713686101824, "grad_norm": 0.9641976952552795, "learning_rate": 6.432643657094982e-06, "loss": 0.0504, "step": 241325 }, { "epoch": 3.5544395517002694, "grad_norm": 1.4524327516555786, "learning_rate": 6.431007285127997e-06, "loss": 0.0564, "step": 241350 }, { "epoch": 3.5548077347903564, "grad_norm": 1.343019723892212, "learning_rate": 6.429370913161013e-06, "loss": 0.052, "step": 241375 }, { "epoch": 3.555175917880444, "grad_norm": 0.9352220892906189, "learning_rate": 6.427734541194028e-06, "loss": 0.0563, "step": 241400 }, { "epoch": 3.555544100970531, "grad_norm": 1.2362533807754517, "learning_rate": 6.4260981692270444e-06, "loss": 0.0523, "step": 241425 }, { "epoch": 3.555912284060618, "grad_norm": 1.329370141029358, "learning_rate": 6.42446179726006e-06, "loss": 0.0556, "step": 241450 }, { "epoch": 3.556280467150705, "grad_norm": 1.212063193321228, "learning_rate": 6.422825425293074e-06, "loss": 0.0592, "step": 241475 }, { "epoch": 3.556648650240792, "grad_norm": 1.6169648170471191, "learning_rate": 6.42118905332609e-06, "loss": 0.0511, "step": 241500 }, { "epoch": 3.557016833330879, "grad_norm": 1.2143300771713257, "learning_rate": 6.419552681359106e-06, "loss": 0.0499, "step": 241525 }, { "epoch": 3.557385016420966, "grad_norm": 1.3234690427780151, "learning_rate": 6.417916309392122e-06, "loss": 0.0579, "step": 241550 }, { "epoch": 3.557753199511053, "grad_norm": 1.5765708684921265, "learning_rate": 6.416279937425136e-06, "loss": 0.0621, "step": 241575 }, { "epoch": 3.55812138260114, "grad_norm": 1.1030410528182983, "learning_rate": 6.4146435654581516e-06, "loss": 0.0521, "step": 241600 }, { "epoch": 3.558489565691227, "grad_norm": 1.5761111974716187, "learning_rate": 6.413072648369847e-06, "loss": 0.0609, "step": 241625 }, { "epoch": 3.558857748781314, "grad_norm": 1.3453253507614136, "learning_rate": 6.4114362764028624e-06, "loss": 0.0545, "step": 241650 }, { "epoch": 3.559225931871401, "grad_norm": 1.520203948020935, "learning_rate": 6.409799904435877e-06, "loss": 0.0537, "step": 241675 }, { "epoch": 3.559594114961488, "grad_norm": 0.9888783693313599, "learning_rate": 6.408163532468893e-06, "loss": 0.0559, "step": 241700 }, { "epoch": 3.559962298051575, "grad_norm": 1.7509855031967163, "learning_rate": 6.406527160501908e-06, "loss": 0.0464, "step": 241725 }, { "epoch": 3.560330481141662, "grad_norm": 1.3090428113937378, "learning_rate": 6.4048907885349245e-06, "loss": 0.0536, "step": 241750 }, { "epoch": 3.560698664231749, "grad_norm": 1.4758968353271484, "learning_rate": 6.403254416567939e-06, "loss": 0.0507, "step": 241775 }, { "epoch": 3.561066847321836, "grad_norm": 1.620352029800415, "learning_rate": 6.401618044600954e-06, "loss": 0.0555, "step": 241800 }, { "epoch": 3.561435030411923, "grad_norm": 1.500105381011963, "learning_rate": 6.39998167263397e-06, "loss": 0.0584, "step": 241825 }, { "epoch": 3.56180321350201, "grad_norm": 1.3465381860733032, "learning_rate": 6.398345300666986e-06, "loss": 0.0458, "step": 241850 }, { "epoch": 3.562171396592097, "grad_norm": 1.4730098247528076, "learning_rate": 6.396708928700002e-06, "loss": 0.0496, "step": 241875 }, { "epoch": 3.562539579682184, "grad_norm": 1.381256103515625, "learning_rate": 6.395072556733016e-06, "loss": 0.0484, "step": 241900 }, { "epoch": 3.562907762772271, "grad_norm": 1.3158462047576904, "learning_rate": 6.393436184766032e-06, "loss": 0.0548, "step": 241925 }, { "epoch": 3.563275945862358, "grad_norm": 0.9338003993034363, "learning_rate": 6.391799812799048e-06, "loss": 0.0539, "step": 241950 }, { "epoch": 3.563644128952445, "grad_norm": 1.442128300666809, "learning_rate": 6.390163440832063e-06, "loss": 0.0545, "step": 241975 }, { "epoch": 3.5640123120425327, "grad_norm": 1.8567023277282715, "learning_rate": 6.388527068865078e-06, "loss": 0.0612, "step": 242000 }, { "epoch": 3.5643804951326197, "grad_norm": 1.1950697898864746, "learning_rate": 6.386890696898094e-06, "loss": 0.0516, "step": 242025 }, { "epoch": 3.5647486782227067, "grad_norm": 0.8833608627319336, "learning_rate": 6.385254324931109e-06, "loss": 0.0533, "step": 242050 }, { "epoch": 3.5651168613127937, "grad_norm": 0.7630489468574524, "learning_rate": 6.383617952964125e-06, "loss": 0.0542, "step": 242075 }, { "epoch": 3.5654850444028807, "grad_norm": 1.2349568605422974, "learning_rate": 6.38198158099714e-06, "loss": 0.0558, "step": 242100 }, { "epoch": 3.5658532274929677, "grad_norm": 1.4984163045883179, "learning_rate": 6.380345209030156e-06, "loss": 0.0527, "step": 242125 }, { "epoch": 3.5662214105830548, "grad_norm": 1.307085633277893, "learning_rate": 6.378708837063171e-06, "loss": 0.0475, "step": 242150 }, { "epoch": 3.5665895936731418, "grad_norm": 1.2279508113861084, "learning_rate": 6.377072465096187e-06, "loss": 0.0536, "step": 242175 }, { "epoch": 3.566957776763229, "grad_norm": 0.9844478368759155, "learning_rate": 6.375436093129202e-06, "loss": 0.0559, "step": 242200 }, { "epoch": 3.567325959853316, "grad_norm": 1.0601154565811157, "learning_rate": 6.373799721162217e-06, "loss": 0.0533, "step": 242225 }, { "epoch": 3.567694142943403, "grad_norm": 0.9835296273231506, "learning_rate": 6.372163349195233e-06, "loss": 0.048, "step": 242250 }, { "epoch": 3.56806232603349, "grad_norm": 1.6406023502349854, "learning_rate": 6.3705269772282484e-06, "loss": 0.0538, "step": 242275 }, { "epoch": 3.568430509123577, "grad_norm": 1.235166072845459, "learning_rate": 6.368890605261263e-06, "loss": 0.0505, "step": 242300 }, { "epoch": 3.568798692213664, "grad_norm": 1.4901325702667236, "learning_rate": 6.367254233294279e-06, "loss": 0.0518, "step": 242325 }, { "epoch": 3.5691668753037513, "grad_norm": 1.6954319477081299, "learning_rate": 6.365617861327294e-06, "loss": 0.0569, "step": 242350 }, { "epoch": 3.5695350583938383, "grad_norm": 1.4798673391342163, "learning_rate": 6.3639814893603105e-06, "loss": 0.0536, "step": 242375 }, { "epoch": 3.5699032414839253, "grad_norm": 1.34373939037323, "learning_rate": 6.362345117393326e-06, "loss": 0.0473, "step": 242400 }, { "epoch": 3.5702714245740124, "grad_norm": 1.538964033126831, "learning_rate": 6.36070874542634e-06, "loss": 0.0544, "step": 242425 }, { "epoch": 3.5706396076640994, "grad_norm": 1.8510009050369263, "learning_rate": 6.359072373459356e-06, "loss": 0.0558, "step": 242450 }, { "epoch": 3.5710077907541864, "grad_norm": 1.198325514793396, "learning_rate": 6.357436001492372e-06, "loss": 0.0568, "step": 242475 }, { "epoch": 3.5713759738442734, "grad_norm": 0.910335123538971, "learning_rate": 6.355799629525388e-06, "loss": 0.0642, "step": 242500 }, { "epoch": 3.5717441569343604, "grad_norm": 1.613919734954834, "learning_rate": 6.354163257558402e-06, "loss": 0.0513, "step": 242525 }, { "epoch": 3.5721123400244474, "grad_norm": 1.2130235433578491, "learning_rate": 6.352526885591418e-06, "loss": 0.0568, "step": 242550 }, { "epoch": 3.5724805231145345, "grad_norm": 1.123764157295227, "learning_rate": 6.350890513624434e-06, "loss": 0.0543, "step": 242575 }, { "epoch": 3.5728487062046215, "grad_norm": 1.3343775272369385, "learning_rate": 6.349254141657449e-06, "loss": 0.0518, "step": 242600 }, { "epoch": 3.5732168892947085, "grad_norm": 1.30911386013031, "learning_rate": 6.3476177696904636e-06, "loss": 0.0515, "step": 242625 }, { "epoch": 3.5735850723847955, "grad_norm": 1.0751773118972778, "learning_rate": 6.34598139772348e-06, "loss": 0.0546, "step": 242650 }, { "epoch": 3.5739532554748825, "grad_norm": 0.7018923163414001, "learning_rate": 6.344345025756495e-06, "loss": 0.0563, "step": 242675 }, { "epoch": 3.5743214385649695, "grad_norm": 1.2422815561294556, "learning_rate": 6.342708653789511e-06, "loss": 0.0601, "step": 242700 }, { "epoch": 3.5746896216550565, "grad_norm": 1.1047272682189941, "learning_rate": 6.341072281822526e-06, "loss": 0.058, "step": 242725 }, { "epoch": 3.5750578047451436, "grad_norm": 1.252564787864685, "learning_rate": 6.339435909855542e-06, "loss": 0.0471, "step": 242750 }, { "epoch": 3.5754259878352306, "grad_norm": 1.2016949653625488, "learning_rate": 6.337799537888557e-06, "loss": 0.0506, "step": 242775 }, { "epoch": 3.5757941709253176, "grad_norm": 1.1374620199203491, "learning_rate": 6.336163165921572e-06, "loss": 0.0541, "step": 242800 }, { "epoch": 3.5761623540154046, "grad_norm": 1.4337965250015259, "learning_rate": 6.3345267939545885e-06, "loss": 0.0584, "step": 242825 }, { "epoch": 3.5765305371054916, "grad_norm": 1.2424113750457764, "learning_rate": 6.332890421987603e-06, "loss": 0.0495, "step": 242850 }, { "epoch": 3.5768987201955786, "grad_norm": 1.4434897899627686, "learning_rate": 6.331254050020619e-06, "loss": 0.0499, "step": 242875 }, { "epoch": 3.5772669032856657, "grad_norm": 1.5763555765151978, "learning_rate": 6.3296176780536344e-06, "loss": 0.0548, "step": 242900 }, { "epoch": 3.5776350863757527, "grad_norm": 1.4369443655014038, "learning_rate": 6.327981306086651e-06, "loss": 0.0551, "step": 242925 }, { "epoch": 3.57800326946584, "grad_norm": 1.4109026193618774, "learning_rate": 6.326344934119665e-06, "loss": 0.0537, "step": 242950 }, { "epoch": 3.578371452555927, "grad_norm": 1.2654056549072266, "learning_rate": 6.32470856215268e-06, "loss": 0.053, "step": 242975 }, { "epoch": 3.578739635646014, "grad_norm": 2.049483060836792, "learning_rate": 6.3230721901856965e-06, "loss": 0.0615, "step": 243000 }, { "epoch": 3.579107818736101, "grad_norm": 1.703001618385315, "learning_rate": 6.321435818218712e-06, "loss": 0.0514, "step": 243025 }, { "epoch": 3.579476001826188, "grad_norm": 1.5279680490493774, "learning_rate": 6.319799446251726e-06, "loss": 0.0509, "step": 243050 }, { "epoch": 3.579844184916275, "grad_norm": 1.4448721408843994, "learning_rate": 6.318163074284742e-06, "loss": 0.0556, "step": 243075 }, { "epoch": 3.5802123680063622, "grad_norm": 1.5780704021453857, "learning_rate": 6.316526702317758e-06, "loss": 0.0513, "step": 243100 }, { "epoch": 3.5805805510964492, "grad_norm": 1.4930514097213745, "learning_rate": 6.314890330350774e-06, "loss": 0.0562, "step": 243125 }, { "epoch": 3.5809487341865363, "grad_norm": 1.1155985593795776, "learning_rate": 6.313253958383788e-06, "loss": 0.0503, "step": 243150 }, { "epoch": 3.5813169172766233, "grad_norm": 1.4989086389541626, "learning_rate": 6.311617586416804e-06, "loss": 0.0572, "step": 243175 }, { "epoch": 3.5816851003667103, "grad_norm": 1.3143268823623657, "learning_rate": 6.30998121444982e-06, "loss": 0.058, "step": 243200 }, { "epoch": 3.5820532834567973, "grad_norm": 1.2218605279922485, "learning_rate": 6.308344842482835e-06, "loss": 0.0512, "step": 243225 }, { "epoch": 3.5824214665468843, "grad_norm": 1.347113847732544, "learning_rate": 6.3067084705158495e-06, "loss": 0.0541, "step": 243250 }, { "epoch": 3.5827896496369713, "grad_norm": 1.5165753364562988, "learning_rate": 6.305072098548866e-06, "loss": 0.0612, "step": 243275 }, { "epoch": 3.583157832727059, "grad_norm": 0.5842728614807129, "learning_rate": 6.303435726581881e-06, "loss": 0.0507, "step": 243300 }, { "epoch": 3.583526015817146, "grad_norm": 1.269214153289795, "learning_rate": 6.301799354614897e-06, "loss": 0.0624, "step": 243325 }, { "epoch": 3.583894198907233, "grad_norm": 1.4616782665252686, "learning_rate": 6.3001629826479125e-06, "loss": 0.0595, "step": 243350 }, { "epoch": 3.58426238199732, "grad_norm": 1.0306979417800903, "learning_rate": 6.298526610680927e-06, "loss": 0.0515, "step": 243375 }, { "epoch": 3.584630565087407, "grad_norm": 1.4185216426849365, "learning_rate": 6.296890238713943e-06, "loss": 0.054, "step": 243400 }, { "epoch": 3.584998748177494, "grad_norm": 1.3975141048431396, "learning_rate": 6.295253866746958e-06, "loss": 0.0582, "step": 243425 }, { "epoch": 3.585366931267581, "grad_norm": 1.7203360795974731, "learning_rate": 6.2936174947799745e-06, "loss": 0.0595, "step": 243450 }, { "epoch": 3.585735114357668, "grad_norm": 1.608855128288269, "learning_rate": 6.291981122812989e-06, "loss": 0.0557, "step": 243475 }, { "epoch": 3.586103297447755, "grad_norm": 1.4659583568572998, "learning_rate": 6.290344750846004e-06, "loss": 0.0546, "step": 243500 }, { "epoch": 3.586471480537842, "grad_norm": 1.4480702877044678, "learning_rate": 6.2887083788790204e-06, "loss": 0.0498, "step": 243525 }, { "epoch": 3.586839663627929, "grad_norm": 1.438968300819397, "learning_rate": 6.287072006912036e-06, "loss": 0.0577, "step": 243550 }, { "epoch": 3.587207846718016, "grad_norm": 0.9120373725891113, "learning_rate": 6.285435634945051e-06, "loss": 0.0511, "step": 243575 }, { "epoch": 3.587576029808103, "grad_norm": 1.5426757335662842, "learning_rate": 6.283799262978066e-06, "loss": 0.0555, "step": 243600 }, { "epoch": 3.58794421289819, "grad_norm": 1.5347541570663452, "learning_rate": 6.2821628910110825e-06, "loss": 0.0557, "step": 243625 }, { "epoch": 3.588312395988277, "grad_norm": 1.6872762441635132, "learning_rate": 6.280526519044098e-06, "loss": 0.0566, "step": 243650 }, { "epoch": 3.588680579078364, "grad_norm": 1.684535264968872, "learning_rate": 6.278890147077112e-06, "loss": 0.0583, "step": 243675 }, { "epoch": 3.589048762168451, "grad_norm": 1.1899725198745728, "learning_rate": 6.277253775110128e-06, "loss": 0.0578, "step": 243700 }, { "epoch": 3.589416945258538, "grad_norm": 1.234108328819275, "learning_rate": 6.275617403143144e-06, "loss": 0.0563, "step": 243725 }, { "epoch": 3.589785128348625, "grad_norm": 0.7505812644958496, "learning_rate": 6.27398103117616e-06, "loss": 0.053, "step": 243750 }, { "epoch": 3.590153311438712, "grad_norm": 1.1787515878677368, "learning_rate": 6.272344659209174e-06, "loss": 0.0551, "step": 243775 }, { "epoch": 3.590521494528799, "grad_norm": 1.2294164896011353, "learning_rate": 6.27070828724219e-06, "loss": 0.0651, "step": 243800 }, { "epoch": 3.590889677618886, "grad_norm": 1.1803584098815918, "learning_rate": 6.269071915275206e-06, "loss": 0.0573, "step": 243825 }, { "epoch": 3.591257860708973, "grad_norm": 0.7361862659454346, "learning_rate": 6.267435543308221e-06, "loss": 0.0509, "step": 243850 }, { "epoch": 3.59162604379906, "grad_norm": 1.3564823865890503, "learning_rate": 6.265799171341237e-06, "loss": 0.0492, "step": 243875 }, { "epoch": 3.5919942268891476, "grad_norm": 1.2647794485092163, "learning_rate": 6.264162799374252e-06, "loss": 0.0555, "step": 243900 }, { "epoch": 3.5923624099792346, "grad_norm": 1.1084637641906738, "learning_rate": 6.262526427407267e-06, "loss": 0.0526, "step": 243925 }, { "epoch": 3.5927305930693216, "grad_norm": 0.8604913949966431, "learning_rate": 6.260890055440283e-06, "loss": 0.0536, "step": 243950 }, { "epoch": 3.5930987761594086, "grad_norm": 1.1622568368911743, "learning_rate": 6.2592536834732985e-06, "loss": 0.0588, "step": 243975 }, { "epoch": 3.5934669592494957, "grad_norm": 1.5048962831497192, "learning_rate": 6.257617311506313e-06, "loss": 0.0527, "step": 244000 }, { "epoch": 3.5938351423395827, "grad_norm": 1.2005444765090942, "learning_rate": 6.255980939539329e-06, "loss": 0.0582, "step": 244025 }, { "epoch": 3.5942033254296697, "grad_norm": 1.516257643699646, "learning_rate": 6.254344567572344e-06, "loss": 0.055, "step": 244050 }, { "epoch": 3.5945715085197567, "grad_norm": 1.3147369623184204, "learning_rate": 6.2527081956053605e-06, "loss": 0.0648, "step": 244075 }, { "epoch": 3.5949396916098437, "grad_norm": 1.078008770942688, "learning_rate": 6.251071823638375e-06, "loss": 0.0515, "step": 244100 }, { "epoch": 3.5953078746999307, "grad_norm": 1.3717511892318726, "learning_rate": 6.24943545167139e-06, "loss": 0.0538, "step": 244125 }, { "epoch": 3.5956760577900178, "grad_norm": 0.8229482173919678, "learning_rate": 6.2477990797044064e-06, "loss": 0.052, "step": 244150 }, { "epoch": 3.5960442408801048, "grad_norm": 0.9627171158790588, "learning_rate": 6.246162707737422e-06, "loss": 0.0505, "step": 244175 }, { "epoch": 3.596412423970192, "grad_norm": 1.3391168117523193, "learning_rate": 6.244526335770437e-06, "loss": 0.0507, "step": 244200 }, { "epoch": 3.596780607060279, "grad_norm": 1.6255720853805542, "learning_rate": 6.242889963803452e-06, "loss": 0.0504, "step": 244225 }, { "epoch": 3.597148790150366, "grad_norm": 1.5499217510223389, "learning_rate": 6.241253591836468e-06, "loss": 0.0518, "step": 244250 }, { "epoch": 3.5975169732404533, "grad_norm": 1.371516466140747, "learning_rate": 6.239617219869484e-06, "loss": 0.0507, "step": 244275 }, { "epoch": 3.5978851563305403, "grad_norm": 1.1114004850387573, "learning_rate": 6.237980847902499e-06, "loss": 0.0507, "step": 244300 }, { "epoch": 3.5982533394206273, "grad_norm": 1.0851997137069702, "learning_rate": 6.236344475935514e-06, "loss": 0.0547, "step": 244325 }, { "epoch": 3.5986215225107143, "grad_norm": 1.604949951171875, "learning_rate": 6.23470810396853e-06, "loss": 0.0476, "step": 244350 }, { "epoch": 3.5989897056008013, "grad_norm": 0.6757989525794983, "learning_rate": 6.233071732001546e-06, "loss": 0.0556, "step": 244375 }, { "epoch": 3.5993578886908884, "grad_norm": 1.1998815536499023, "learning_rate": 6.231435360034561e-06, "loss": 0.062, "step": 244400 }, { "epoch": 3.5997260717809754, "grad_norm": 1.7735071182250977, "learning_rate": 6.229798988067576e-06, "loss": 0.0549, "step": 244425 }, { "epoch": 3.6000942548710624, "grad_norm": 1.3361785411834717, "learning_rate": 6.228162616100592e-06, "loss": 0.0549, "step": 244450 }, { "epoch": 3.6004624379611494, "grad_norm": 1.4690197706222534, "learning_rate": 6.226526244133607e-06, "loss": 0.0552, "step": 244475 }, { "epoch": 3.6008306210512364, "grad_norm": 2.2308056354522705, "learning_rate": 6.224889872166623e-06, "loss": 0.0618, "step": 244500 }, { "epoch": 3.6011988041413234, "grad_norm": 1.2314001321792603, "learning_rate": 6.223253500199638e-06, "loss": 0.0474, "step": 244525 }, { "epoch": 3.6015669872314104, "grad_norm": 1.5631963014602661, "learning_rate": 6.221617128232653e-06, "loss": 0.051, "step": 244550 }, { "epoch": 3.6019351703214975, "grad_norm": 1.363520622253418, "learning_rate": 6.219980756265669e-06, "loss": 0.0515, "step": 244575 }, { "epoch": 3.6023033534115845, "grad_norm": 1.4217227697372437, "learning_rate": 6.2183443842986845e-06, "loss": 0.0512, "step": 244600 }, { "epoch": 3.6026715365016715, "grad_norm": 1.4224117994308472, "learning_rate": 6.216708012331699e-06, "loss": 0.0554, "step": 244625 }, { "epoch": 3.6030397195917585, "grad_norm": 1.8205037117004395, "learning_rate": 6.215071640364715e-06, "loss": 0.0527, "step": 244650 }, { "epoch": 3.6034079026818455, "grad_norm": 1.1793156862258911, "learning_rate": 6.21343526839773e-06, "loss": 0.0509, "step": 244675 }, { "epoch": 3.6037760857719325, "grad_norm": 1.0544114112854004, "learning_rate": 6.2117988964307465e-06, "loss": 0.0465, "step": 244700 }, { "epoch": 3.6041442688620196, "grad_norm": 0.9881449937820435, "learning_rate": 6.21022797934244e-06, "loss": 0.0504, "step": 244725 }, { "epoch": 3.6045124519521066, "grad_norm": 0.9702735543251038, "learning_rate": 6.208591607375456e-06, "loss": 0.0525, "step": 244750 }, { "epoch": 3.6048806350421936, "grad_norm": 1.267856478691101, "learning_rate": 6.206955235408472e-06, "loss": 0.053, "step": 244775 }, { "epoch": 3.6052488181322806, "grad_norm": 1.4718348979949951, "learning_rate": 6.205318863441487e-06, "loss": 0.0584, "step": 244800 }, { "epoch": 3.6056170012223676, "grad_norm": 1.0116479396820068, "learning_rate": 6.203682491474503e-06, "loss": 0.0499, "step": 244825 }, { "epoch": 3.6059851843124546, "grad_norm": 1.7999377250671387, "learning_rate": 6.202046119507518e-06, "loss": 0.0543, "step": 244850 }, { "epoch": 3.606353367402542, "grad_norm": 1.6597347259521484, "learning_rate": 6.200409747540533e-06, "loss": 0.0597, "step": 244875 }, { "epoch": 3.606721550492629, "grad_norm": 0.996076762676239, "learning_rate": 6.198773375573549e-06, "loss": 0.06, "step": 244900 }, { "epoch": 3.607089733582716, "grad_norm": 0.8949547410011292, "learning_rate": 6.1971370036065645e-06, "loss": 0.048, "step": 244925 }, { "epoch": 3.607457916672803, "grad_norm": 1.2150899171829224, "learning_rate": 6.195500631639579e-06, "loss": 0.0494, "step": 244950 }, { "epoch": 3.60782609976289, "grad_norm": 1.2176042795181274, "learning_rate": 6.193864259672595e-06, "loss": 0.052, "step": 244975 }, { "epoch": 3.608194282852977, "grad_norm": 1.6427092552185059, "learning_rate": 6.1922278877056104e-06, "loss": 0.0637, "step": 245000 }, { "epoch": 3.608562465943064, "grad_norm": 1.6079386472702026, "learning_rate": 6.190591515738627e-06, "loss": 0.0515, "step": 245025 }, { "epoch": 3.608930649033151, "grad_norm": 1.4702075719833374, "learning_rate": 6.188955143771641e-06, "loss": 0.0504, "step": 245050 }, { "epoch": 3.609298832123238, "grad_norm": 1.0616623163223267, "learning_rate": 6.187318771804657e-06, "loss": 0.0564, "step": 245075 }, { "epoch": 3.6096670152133252, "grad_norm": 1.1923943758010864, "learning_rate": 6.1856823998376725e-06, "loss": 0.0555, "step": 245100 }, { "epoch": 3.6100351983034122, "grad_norm": 1.1287178993225098, "learning_rate": 6.184046027870688e-06, "loss": 0.0524, "step": 245125 }, { "epoch": 3.6104033813934993, "grad_norm": 1.0990760326385498, "learning_rate": 6.182409655903703e-06, "loss": 0.0599, "step": 245150 }, { "epoch": 3.6107715644835863, "grad_norm": 1.4486708641052246, "learning_rate": 6.1807732839367184e-06, "loss": 0.0547, "step": 245175 }, { "epoch": 3.6111397475736733, "grad_norm": 1.107300877571106, "learning_rate": 6.1791369119697346e-06, "loss": 0.053, "step": 245200 }, { "epoch": 3.6115079306637607, "grad_norm": 1.3102949857711792, "learning_rate": 6.17750054000275e-06, "loss": 0.0583, "step": 245225 }, { "epoch": 3.6118761137538478, "grad_norm": 1.5146225690841675, "learning_rate": 6.175864168035765e-06, "loss": 0.0561, "step": 245250 }, { "epoch": 3.6122442968439348, "grad_norm": 1.256121039390564, "learning_rate": 6.1742277960687805e-06, "loss": 0.0608, "step": 245275 }, { "epoch": 3.612612479934022, "grad_norm": 1.4065275192260742, "learning_rate": 6.172591424101796e-06, "loss": 0.0639, "step": 245300 }, { "epoch": 3.612980663024109, "grad_norm": 1.9923678636550903, "learning_rate": 6.170955052134812e-06, "loss": 0.045, "step": 245325 }, { "epoch": 3.613348846114196, "grad_norm": 1.399849534034729, "learning_rate": 6.169318680167827e-06, "loss": 0.0507, "step": 245350 }, { "epoch": 3.613717029204283, "grad_norm": 1.5250312089920044, "learning_rate": 6.167682308200842e-06, "loss": 0.0517, "step": 245375 }, { "epoch": 3.61408521229437, "grad_norm": 1.3271722793579102, "learning_rate": 6.166045936233858e-06, "loss": 0.0508, "step": 245400 }, { "epoch": 3.614453395384457, "grad_norm": 1.3831508159637451, "learning_rate": 6.164409564266873e-06, "loss": 0.0506, "step": 245425 }, { "epoch": 3.614821578474544, "grad_norm": 1.142835021018982, "learning_rate": 6.162773192299889e-06, "loss": 0.0519, "step": 245450 }, { "epoch": 3.615189761564631, "grad_norm": 1.3392393589019775, "learning_rate": 6.161136820332904e-06, "loss": 0.0499, "step": 245475 }, { "epoch": 3.615557944654718, "grad_norm": 1.1853067874908447, "learning_rate": 6.159500448365919e-06, "loss": 0.0528, "step": 245500 }, { "epoch": 3.615926127744805, "grad_norm": 1.3341588973999023, "learning_rate": 6.157864076398935e-06, "loss": 0.0523, "step": 245525 }, { "epoch": 3.616294310834892, "grad_norm": 1.495862603187561, "learning_rate": 6.1562277044319505e-06, "loss": 0.0533, "step": 245550 }, { "epoch": 3.616662493924979, "grad_norm": 1.5366014242172241, "learning_rate": 6.154591332464965e-06, "loss": 0.0498, "step": 245575 }, { "epoch": 3.617030677015066, "grad_norm": 1.1389579772949219, "learning_rate": 6.152954960497981e-06, "loss": 0.0542, "step": 245600 }, { "epoch": 3.617398860105153, "grad_norm": 0.932179868221283, "learning_rate": 6.1513185885309964e-06, "loss": 0.0535, "step": 245625 }, { "epoch": 3.61776704319524, "grad_norm": 1.5072063207626343, "learning_rate": 6.149682216564013e-06, "loss": 0.0537, "step": 245650 }, { "epoch": 3.618135226285327, "grad_norm": 1.2622642517089844, "learning_rate": 6.148045844597027e-06, "loss": 0.049, "step": 245675 }, { "epoch": 3.618503409375414, "grad_norm": 1.2886232137680054, "learning_rate": 6.146409472630042e-06, "loss": 0.06, "step": 245700 }, { "epoch": 3.618871592465501, "grad_norm": 2.2847864627838135, "learning_rate": 6.1447731006630585e-06, "loss": 0.0578, "step": 245725 }, { "epoch": 3.619239775555588, "grad_norm": 0.6749784350395203, "learning_rate": 6.143136728696074e-06, "loss": 0.0515, "step": 245750 }, { "epoch": 3.619607958645675, "grad_norm": 1.4920670986175537, "learning_rate": 6.14150035672909e-06, "loss": 0.0611, "step": 245775 }, { "epoch": 3.619976141735762, "grad_norm": 1.5810109376907349, "learning_rate": 6.139863984762104e-06, "loss": 0.0558, "step": 245800 }, { "epoch": 3.6203443248258496, "grad_norm": 1.413640022277832, "learning_rate": 6.13822761279512e-06, "loss": 0.0578, "step": 245825 }, { "epoch": 3.6207125079159366, "grad_norm": 0.9894798398017883, "learning_rate": 6.136591240828136e-06, "loss": 0.0493, "step": 245850 }, { "epoch": 3.6210806910060236, "grad_norm": 1.2460707426071167, "learning_rate": 6.134954868861151e-06, "loss": 0.0585, "step": 245875 }, { "epoch": 3.6214488740961106, "grad_norm": 1.248345136642456, "learning_rate": 6.1333184968941665e-06, "loss": 0.0483, "step": 245900 }, { "epoch": 3.6218170571861976, "grad_norm": 1.5993595123291016, "learning_rate": 6.131682124927182e-06, "loss": 0.0524, "step": 245925 }, { "epoch": 3.6221852402762846, "grad_norm": 1.0702323913574219, "learning_rate": 6.130045752960198e-06, "loss": 0.0457, "step": 245950 }, { "epoch": 3.6225534233663716, "grad_norm": 1.7296591997146606, "learning_rate": 6.128409380993213e-06, "loss": 0.0596, "step": 245975 }, { "epoch": 3.6229216064564587, "grad_norm": 1.5586003065109253, "learning_rate": 6.126773009026228e-06, "loss": 0.05, "step": 246000 }, { "epoch": 3.6232897895465457, "grad_norm": 1.4900046586990356, "learning_rate": 6.125136637059244e-06, "loss": 0.0494, "step": 246025 }, { "epoch": 3.6236579726366327, "grad_norm": 1.5979666709899902, "learning_rate": 6.123500265092259e-06, "loss": 0.0533, "step": 246050 }, { "epoch": 3.6240261557267197, "grad_norm": 1.6462072134017944, "learning_rate": 6.121863893125275e-06, "loss": 0.0519, "step": 246075 }, { "epoch": 3.6243943388168067, "grad_norm": 1.8660732507705688, "learning_rate": 6.12022752115829e-06, "loss": 0.0535, "step": 246100 }, { "epoch": 3.6247625219068937, "grad_norm": 1.1489410400390625, "learning_rate": 6.118591149191305e-06, "loss": 0.0509, "step": 246125 }, { "epoch": 3.6251307049969808, "grad_norm": 1.7616811990737915, "learning_rate": 6.116954777224321e-06, "loss": 0.0587, "step": 246150 }, { "epoch": 3.6254988880870678, "grad_norm": 1.6229524612426758, "learning_rate": 6.1153184052573365e-06, "loss": 0.0493, "step": 246175 }, { "epoch": 3.6258670711771552, "grad_norm": 1.388395071029663, "learning_rate": 6.113682033290351e-06, "loss": 0.0532, "step": 246200 }, { "epoch": 3.6262352542672422, "grad_norm": 1.6239497661590576, "learning_rate": 6.112045661323367e-06, "loss": 0.0581, "step": 246225 }, { "epoch": 3.6266034373573293, "grad_norm": 0.9362977147102356, "learning_rate": 6.1104092893563824e-06, "loss": 0.0495, "step": 246250 }, { "epoch": 3.6269716204474163, "grad_norm": 1.5556639432907104, "learning_rate": 6.108772917389399e-06, "loss": 0.0554, "step": 246275 }, { "epoch": 3.6273398035375033, "grad_norm": 1.2726385593414307, "learning_rate": 6.107136545422414e-06, "loss": 0.0502, "step": 246300 }, { "epoch": 3.6277079866275903, "grad_norm": 1.9834768772125244, "learning_rate": 6.105500173455428e-06, "loss": 0.0531, "step": 246325 }, { "epoch": 3.6280761697176773, "grad_norm": 1.2573577165603638, "learning_rate": 6.1038638014884445e-06, "loss": 0.0545, "step": 246350 }, { "epoch": 3.6284443528077643, "grad_norm": 1.6854448318481445, "learning_rate": 6.10222742952146e-06, "loss": 0.057, "step": 246375 }, { "epoch": 3.6288125358978514, "grad_norm": 1.40595281124115, "learning_rate": 6.100591057554476e-06, "loss": 0.0517, "step": 246400 }, { "epoch": 3.6291807189879384, "grad_norm": 1.1282232999801636, "learning_rate": 6.09895468558749e-06, "loss": 0.0577, "step": 246425 }, { "epoch": 3.6295489020780254, "grad_norm": 0.7684329152107239, "learning_rate": 6.097318313620506e-06, "loss": 0.0479, "step": 246450 }, { "epoch": 3.6299170851681124, "grad_norm": 0.6485980153083801, "learning_rate": 6.095681941653522e-06, "loss": 0.0499, "step": 246475 }, { "epoch": 3.6302852682581994, "grad_norm": 1.4307173490524292, "learning_rate": 6.094045569686537e-06, "loss": 0.0562, "step": 246500 }, { "epoch": 3.6306534513482864, "grad_norm": 1.3257659673690796, "learning_rate": 6.0924091977195525e-06, "loss": 0.0591, "step": 246525 }, { "epoch": 3.6310216344383734, "grad_norm": 1.5286110639572144, "learning_rate": 6.090772825752568e-06, "loss": 0.0469, "step": 246550 }, { "epoch": 3.6313898175284605, "grad_norm": 0.8919461369514465, "learning_rate": 6.089136453785583e-06, "loss": 0.055, "step": 246575 }, { "epoch": 3.6317580006185475, "grad_norm": 1.9157183170318604, "learning_rate": 6.087500081818599e-06, "loss": 0.0557, "step": 246600 }, { "epoch": 3.6321261837086345, "grad_norm": 1.1851670742034912, "learning_rate": 6.085863709851614e-06, "loss": 0.0606, "step": 246625 }, { "epoch": 3.6324943667987215, "grad_norm": 1.8410229682922363, "learning_rate": 6.08422733788463e-06, "loss": 0.0544, "step": 246650 }, { "epoch": 3.6328625498888085, "grad_norm": 1.2160470485687256, "learning_rate": 6.082590965917645e-06, "loss": 0.0518, "step": 246675 }, { "epoch": 3.6332307329788955, "grad_norm": 1.572349190711975, "learning_rate": 6.080954593950661e-06, "loss": 0.0565, "step": 246700 }, { "epoch": 3.6335989160689826, "grad_norm": 1.3269578218460083, "learning_rate": 6.079318221983677e-06, "loss": 0.0541, "step": 246725 }, { "epoch": 3.6339670991590696, "grad_norm": 0.7197899222373962, "learning_rate": 6.077681850016691e-06, "loss": 0.0494, "step": 246750 }, { "epoch": 3.6343352822491566, "grad_norm": 1.1615010499954224, "learning_rate": 6.076045478049707e-06, "loss": 0.0591, "step": 246775 }, { "epoch": 3.634703465339244, "grad_norm": 0.9506219625473022, "learning_rate": 6.0744091060827225e-06, "loss": 0.0603, "step": 246800 }, { "epoch": 3.635071648429331, "grad_norm": 1.126420021057129, "learning_rate": 6.072772734115739e-06, "loss": 0.0579, "step": 246825 }, { "epoch": 3.635439831519418, "grad_norm": 1.2277495861053467, "learning_rate": 6.071136362148753e-06, "loss": 0.0623, "step": 246850 }, { "epoch": 3.635808014609505, "grad_norm": 1.1536575555801392, "learning_rate": 6.0694999901817684e-06, "loss": 0.0529, "step": 246875 }, { "epoch": 3.636176197699592, "grad_norm": 1.2662118673324585, "learning_rate": 6.067863618214785e-06, "loss": 0.0538, "step": 246900 }, { "epoch": 3.636544380789679, "grad_norm": 1.493308186531067, "learning_rate": 6.0662272462478e-06, "loss": 0.047, "step": 246925 }, { "epoch": 3.636912563879766, "grad_norm": 2.351929187774658, "learning_rate": 6.064590874280814e-06, "loss": 0.057, "step": 246950 }, { "epoch": 3.637280746969853, "grad_norm": 1.4156646728515625, "learning_rate": 6.0629545023138305e-06, "loss": 0.0573, "step": 246975 }, { "epoch": 3.63764893005994, "grad_norm": 1.434942364692688, "learning_rate": 6.061318130346846e-06, "loss": 0.0522, "step": 247000 }, { "epoch": 3.638017113150027, "grad_norm": 1.510393738746643, "learning_rate": 6.059681758379862e-06, "loss": 0.0662, "step": 247025 }, { "epoch": 3.638385296240114, "grad_norm": 1.399326205253601, "learning_rate": 6.058045386412876e-06, "loss": 0.0522, "step": 247050 }, { "epoch": 3.638753479330201, "grad_norm": 1.0100810527801514, "learning_rate": 6.056409014445892e-06, "loss": 0.0481, "step": 247075 }, { "epoch": 3.6391216624202882, "grad_norm": 1.1176669597625732, "learning_rate": 6.054772642478908e-06, "loss": 0.054, "step": 247100 }, { "epoch": 3.6394898455103752, "grad_norm": 1.309601902961731, "learning_rate": 6.053136270511923e-06, "loss": 0.0521, "step": 247125 }, { "epoch": 3.6398580286004627, "grad_norm": 1.0684806108474731, "learning_rate": 6.051499898544938e-06, "loss": 0.0553, "step": 247150 }, { "epoch": 3.6402262116905497, "grad_norm": 1.735277771949768, "learning_rate": 6.049863526577954e-06, "loss": 0.0543, "step": 247175 }, { "epoch": 3.6405943947806367, "grad_norm": 1.1124500036239624, "learning_rate": 6.048227154610969e-06, "loss": 0.058, "step": 247200 }, { "epoch": 3.6409625778707237, "grad_norm": 1.3430871963500977, "learning_rate": 6.046590782643985e-06, "loss": 0.0536, "step": 247225 }, { "epoch": 3.6413307609608108, "grad_norm": 2.3065345287323, "learning_rate": 6.0449544106770006e-06, "loss": 0.0618, "step": 247250 }, { "epoch": 3.641698944050898, "grad_norm": 0.9468328952789307, "learning_rate": 6.043318038710016e-06, "loss": 0.0491, "step": 247275 }, { "epoch": 3.642067127140985, "grad_norm": 1.0123200416564941, "learning_rate": 6.041681666743031e-06, "loss": 0.0552, "step": 247300 }, { "epoch": 3.642435310231072, "grad_norm": 1.094185471534729, "learning_rate": 6.0400452947760465e-06, "loss": 0.0512, "step": 247325 }, { "epoch": 3.642803493321159, "grad_norm": 1.2349109649658203, "learning_rate": 6.038408922809063e-06, "loss": 0.0472, "step": 247350 }, { "epoch": 3.643171676411246, "grad_norm": 1.3068591356277466, "learning_rate": 6.036772550842077e-06, "loss": 0.0525, "step": 247375 }, { "epoch": 3.643539859501333, "grad_norm": 1.3664003610610962, "learning_rate": 6.035136178875093e-06, "loss": 0.0537, "step": 247400 }, { "epoch": 3.64390804259142, "grad_norm": 1.492756962776184, "learning_rate": 6.0334998069081085e-06, "loss": 0.0526, "step": 247425 }, { "epoch": 3.644276225681507, "grad_norm": 1.2986410856246948, "learning_rate": 6.031863434941124e-06, "loss": 0.0499, "step": 247450 }, { "epoch": 3.644644408771594, "grad_norm": 1.2511528730392456, "learning_rate": 6.030227062974139e-06, "loss": 0.0577, "step": 247475 }, { "epoch": 3.645012591861681, "grad_norm": 1.599492073059082, "learning_rate": 6.0285906910071544e-06, "loss": 0.0557, "step": 247500 }, { "epoch": 3.645380774951768, "grad_norm": 1.4056516885757446, "learning_rate": 6.02701977391885e-06, "loss": 0.061, "step": 247525 }, { "epoch": 3.645748958041855, "grad_norm": 1.0515146255493164, "learning_rate": 6.025383401951865e-06, "loss": 0.0507, "step": 247550 }, { "epoch": 3.646117141131942, "grad_norm": 1.6306889057159424, "learning_rate": 6.02374702998488e-06, "loss": 0.0574, "step": 247575 }, { "epoch": 3.646485324222029, "grad_norm": 1.2514461278915405, "learning_rate": 6.022110658017896e-06, "loss": 0.054, "step": 247600 }, { "epoch": 3.646853507312116, "grad_norm": 0.9605504870414734, "learning_rate": 6.020474286050911e-06, "loss": 0.0476, "step": 247625 }, { "epoch": 3.647221690402203, "grad_norm": 1.020743489265442, "learning_rate": 6.018837914083927e-06, "loss": 0.0555, "step": 247650 }, { "epoch": 3.64758987349229, "grad_norm": 1.1369078159332275, "learning_rate": 6.017201542116942e-06, "loss": 0.0569, "step": 247675 }, { "epoch": 3.647958056582377, "grad_norm": 1.3633928298950195, "learning_rate": 6.015565170149957e-06, "loss": 0.0516, "step": 247700 }, { "epoch": 3.648326239672464, "grad_norm": 1.1503260135650635, "learning_rate": 6.013928798182973e-06, "loss": 0.0499, "step": 247725 }, { "epoch": 3.6486944227625515, "grad_norm": 0.8007208108901978, "learning_rate": 6.012292426215989e-06, "loss": 0.048, "step": 247750 }, { "epoch": 3.6490626058526385, "grad_norm": 1.6318391561508179, "learning_rate": 6.010656054249005e-06, "loss": 0.0495, "step": 247775 }, { "epoch": 3.6494307889427255, "grad_norm": 1.3860453367233276, "learning_rate": 6.009019682282019e-06, "loss": 0.0501, "step": 247800 }, { "epoch": 3.6497989720328126, "grad_norm": 1.5322635173797607, "learning_rate": 6.0073833103150345e-06, "loss": 0.0552, "step": 247825 }, { "epoch": 3.6501671551228996, "grad_norm": 1.2735567092895508, "learning_rate": 6.005746938348051e-06, "loss": 0.0492, "step": 247850 }, { "epoch": 3.6505353382129866, "grad_norm": 1.451747179031372, "learning_rate": 6.004110566381066e-06, "loss": 0.0505, "step": 247875 }, { "epoch": 3.6509035213030736, "grad_norm": 0.9211105108261108, "learning_rate": 6.0024741944140804e-06, "loss": 0.0555, "step": 247900 }, { "epoch": 3.6512717043931606, "grad_norm": 1.5851130485534668, "learning_rate": 6.000837822447097e-06, "loss": 0.0574, "step": 247925 }, { "epoch": 3.6516398874832476, "grad_norm": 1.8523852825164795, "learning_rate": 5.999201450480112e-06, "loss": 0.0568, "step": 247950 }, { "epoch": 3.6520080705733347, "grad_norm": 0.8993156552314758, "learning_rate": 5.997565078513128e-06, "loss": 0.0514, "step": 247975 }, { "epoch": 3.6523762536634217, "grad_norm": 1.0645922422409058, "learning_rate": 5.9959287065461425e-06, "loss": 0.052, "step": 248000 }, { "epoch": 3.6527444367535087, "grad_norm": 1.5108942985534668, "learning_rate": 5.994292334579158e-06, "loss": 0.0585, "step": 248025 }, { "epoch": 3.6531126198435957, "grad_norm": 1.2397645711898804, "learning_rate": 5.992655962612174e-06, "loss": 0.0609, "step": 248050 }, { "epoch": 3.6534808029336827, "grad_norm": 2.0706255435943604, "learning_rate": 5.991019590645189e-06, "loss": 0.0505, "step": 248075 }, { "epoch": 3.65384898602377, "grad_norm": 1.3555341958999634, "learning_rate": 5.9893832186782046e-06, "loss": 0.0529, "step": 248100 }, { "epoch": 3.654217169113857, "grad_norm": 1.3032127618789673, "learning_rate": 5.98774684671122e-06, "loss": 0.0532, "step": 248125 }, { "epoch": 3.654585352203944, "grad_norm": 1.8432704210281372, "learning_rate": 5.986110474744235e-06, "loss": 0.0518, "step": 248150 }, { "epoch": 3.654953535294031, "grad_norm": 1.8248027563095093, "learning_rate": 5.984474102777251e-06, "loss": 0.0521, "step": 248175 }, { "epoch": 3.6553217183841182, "grad_norm": 1.4147013425827026, "learning_rate": 5.982837730810267e-06, "loss": 0.0546, "step": 248200 }, { "epoch": 3.6556899014742053, "grad_norm": 1.702972650527954, "learning_rate": 5.981201358843282e-06, "loss": 0.0616, "step": 248225 }, { "epoch": 3.6560580845642923, "grad_norm": 1.338383674621582, "learning_rate": 5.979564986876297e-06, "loss": 0.0544, "step": 248250 }, { "epoch": 3.6564262676543793, "grad_norm": 1.0849289894104004, "learning_rate": 5.977928614909313e-06, "loss": 0.0489, "step": 248275 }, { "epoch": 3.6567944507444663, "grad_norm": 1.5459071397781372, "learning_rate": 5.976292242942329e-06, "loss": 0.0556, "step": 248300 }, { "epoch": 3.6571626338345533, "grad_norm": 0.8210369348526001, "learning_rate": 5.974655870975343e-06, "loss": 0.044, "step": 248325 }, { "epoch": 3.6575308169246403, "grad_norm": 1.5008289813995361, "learning_rate": 5.973019499008359e-06, "loss": 0.0513, "step": 248350 }, { "epoch": 3.6578990000147273, "grad_norm": 1.6575672626495361, "learning_rate": 5.971383127041375e-06, "loss": 0.0537, "step": 248375 }, { "epoch": 3.6582671831048144, "grad_norm": 1.339687705039978, "learning_rate": 5.969746755074391e-06, "loss": 0.0507, "step": 248400 }, { "epoch": 3.6586353661949014, "grad_norm": 2.0025038719177246, "learning_rate": 5.968110383107405e-06, "loss": 0.0536, "step": 248425 }, { "epoch": 3.6590035492849884, "grad_norm": 1.4699890613555908, "learning_rate": 5.9664740111404205e-06, "loss": 0.0566, "step": 248450 }, { "epoch": 3.6593717323750754, "grad_norm": 1.4215010404586792, "learning_rate": 5.964837639173437e-06, "loss": 0.0518, "step": 248475 }, { "epoch": 3.6597399154651624, "grad_norm": 1.5836677551269531, "learning_rate": 5.963201267206452e-06, "loss": 0.0515, "step": 248500 }, { "epoch": 3.6601080985552494, "grad_norm": 1.1467339992523193, "learning_rate": 5.9615648952394664e-06, "loss": 0.0558, "step": 248525 }, { "epoch": 3.6604762816453364, "grad_norm": 0.9520508050918579, "learning_rate": 5.959928523272483e-06, "loss": 0.0579, "step": 248550 }, { "epoch": 3.6608444647354235, "grad_norm": 1.471479058265686, "learning_rate": 5.958292151305498e-06, "loss": 0.0529, "step": 248575 }, { "epoch": 3.6612126478255105, "grad_norm": 1.4006043672561646, "learning_rate": 5.956655779338514e-06, "loss": 0.0476, "step": 248600 }, { "epoch": 3.6615808309155975, "grad_norm": 1.6726654767990112, "learning_rate": 5.9550194073715285e-06, "loss": 0.0577, "step": 248625 }, { "epoch": 3.6619490140056845, "grad_norm": 1.358842372894287, "learning_rate": 5.953383035404544e-06, "loss": 0.0548, "step": 248650 }, { "epoch": 3.6623171970957715, "grad_norm": 1.4992647171020508, "learning_rate": 5.95174666343756e-06, "loss": 0.0512, "step": 248675 }, { "epoch": 3.662685380185859, "grad_norm": 1.303995966911316, "learning_rate": 5.950110291470575e-06, "loss": 0.0519, "step": 248700 }, { "epoch": 3.663053563275946, "grad_norm": 1.8302656412124634, "learning_rate": 5.948473919503591e-06, "loss": 0.0555, "step": 248725 }, { "epoch": 3.663421746366033, "grad_norm": 1.6934587955474854, "learning_rate": 5.946837547536606e-06, "loss": 0.0552, "step": 248750 }, { "epoch": 3.66378992945612, "grad_norm": 0.9948172569274902, "learning_rate": 5.945201175569621e-06, "loss": 0.0513, "step": 248775 }, { "epoch": 3.664158112546207, "grad_norm": 1.5662486553192139, "learning_rate": 5.943564803602637e-06, "loss": 0.0522, "step": 248800 }, { "epoch": 3.664526295636294, "grad_norm": 0.9182396531105042, "learning_rate": 5.941928431635653e-06, "loss": 0.0523, "step": 248825 }, { "epoch": 3.664894478726381, "grad_norm": 1.5740448236465454, "learning_rate": 5.940292059668668e-06, "loss": 0.0566, "step": 248850 }, { "epoch": 3.665262661816468, "grad_norm": 1.4985628128051758, "learning_rate": 5.938655687701683e-06, "loss": 0.0611, "step": 248875 }, { "epoch": 3.665630844906555, "grad_norm": 0.7719475030899048, "learning_rate": 5.9370193157346985e-06, "loss": 0.0585, "step": 248900 }, { "epoch": 3.665999027996642, "grad_norm": 1.188104510307312, "learning_rate": 5.935382943767715e-06, "loss": 0.0587, "step": 248925 }, { "epoch": 3.666367211086729, "grad_norm": 1.3426990509033203, "learning_rate": 5.933746571800729e-06, "loss": 0.0576, "step": 248950 }, { "epoch": 3.666735394176816, "grad_norm": 1.6435117721557617, "learning_rate": 5.932110199833745e-06, "loss": 0.0541, "step": 248975 }, { "epoch": 3.667103577266903, "grad_norm": 1.4483036994934082, "learning_rate": 5.930473827866761e-06, "loss": 0.0494, "step": 249000 }, { "epoch": 3.66747176035699, "grad_norm": 1.3431450128555298, "learning_rate": 5.928837455899777e-06, "loss": 0.0562, "step": 249025 }, { "epoch": 3.667839943447077, "grad_norm": 1.640661597251892, "learning_rate": 5.927201083932791e-06, "loss": 0.0557, "step": 249050 }, { "epoch": 3.6682081265371647, "grad_norm": 1.4969080686569214, "learning_rate": 5.9255647119658065e-06, "loss": 0.0553, "step": 249075 }, { "epoch": 3.6685763096272517, "grad_norm": 1.5314726829528809, "learning_rate": 5.923928339998823e-06, "loss": 0.053, "step": 249100 }, { "epoch": 3.6689444927173387, "grad_norm": 1.4686625003814697, "learning_rate": 5.922291968031838e-06, "loss": 0.057, "step": 249125 }, { "epoch": 3.6693126758074257, "grad_norm": 1.0762619972229004, "learning_rate": 5.9206555960648524e-06, "loss": 0.0504, "step": 249150 }, { "epoch": 3.6696808588975127, "grad_norm": 1.2639774084091187, "learning_rate": 5.919019224097869e-06, "loss": 0.0556, "step": 249175 }, { "epoch": 3.6700490419875997, "grad_norm": 1.439120888710022, "learning_rate": 5.917382852130884e-06, "loss": 0.0555, "step": 249200 }, { "epoch": 3.6704172250776868, "grad_norm": 1.29892897605896, "learning_rate": 5.9157464801639e-06, "loss": 0.0537, "step": 249225 }, { "epoch": 3.6707854081677738, "grad_norm": 1.665852427482605, "learning_rate": 5.914110108196915e-06, "loss": 0.0597, "step": 249250 }, { "epoch": 3.671153591257861, "grad_norm": 1.8004980087280273, "learning_rate": 5.91247373622993e-06, "loss": 0.0568, "step": 249275 }, { "epoch": 3.671521774347948, "grad_norm": 1.6189017295837402, "learning_rate": 5.910837364262946e-06, "loss": 0.0556, "step": 249300 }, { "epoch": 3.671889957438035, "grad_norm": 1.3257699012756348, "learning_rate": 5.909200992295961e-06, "loss": 0.0511, "step": 249325 }, { "epoch": 3.672258140528122, "grad_norm": 1.8579190969467163, "learning_rate": 5.907564620328977e-06, "loss": 0.0547, "step": 249350 }, { "epoch": 3.672626323618209, "grad_norm": 1.4295527935028076, "learning_rate": 5.905928248361992e-06, "loss": 0.0476, "step": 249375 }, { "epoch": 3.672994506708296, "grad_norm": 0.9922029376029968, "learning_rate": 5.904291876395007e-06, "loss": 0.052, "step": 249400 }, { "epoch": 3.673362689798383, "grad_norm": 1.6572712659835815, "learning_rate": 5.902655504428023e-06, "loss": 0.0593, "step": 249425 }, { "epoch": 3.67373087288847, "grad_norm": 1.2601135969161987, "learning_rate": 5.901019132461039e-06, "loss": 0.0494, "step": 249450 }, { "epoch": 3.674099055978557, "grad_norm": 0.8301722407341003, "learning_rate": 5.899382760494053e-06, "loss": 0.0488, "step": 249475 }, { "epoch": 3.674467239068644, "grad_norm": 1.1777797937393188, "learning_rate": 5.897746388527069e-06, "loss": 0.0546, "step": 249500 }, { "epoch": 3.674835422158731, "grad_norm": 1.8586320877075195, "learning_rate": 5.8961100165600845e-06, "loss": 0.0567, "step": 249525 }, { "epoch": 3.675203605248818, "grad_norm": 1.4060636758804321, "learning_rate": 5.894473644593101e-06, "loss": 0.055, "step": 249550 }, { "epoch": 3.675571788338905, "grad_norm": 1.2396047115325928, "learning_rate": 5.892837272626115e-06, "loss": 0.0553, "step": 249575 }, { "epoch": 3.675939971428992, "grad_norm": 1.1717571020126343, "learning_rate": 5.8912009006591304e-06, "loss": 0.0481, "step": 249600 }, { "epoch": 3.676308154519079, "grad_norm": 1.4184919595718384, "learning_rate": 5.889564528692147e-06, "loss": 0.0537, "step": 249625 }, { "epoch": 3.676676337609166, "grad_norm": 0.9783734083175659, "learning_rate": 5.887928156725162e-06, "loss": 0.0481, "step": 249650 }, { "epoch": 3.6770445206992535, "grad_norm": 0.8995241522789001, "learning_rate": 5.886291784758178e-06, "loss": 0.0527, "step": 249675 }, { "epoch": 3.6774127037893405, "grad_norm": 0.9122073650360107, "learning_rate": 5.8846554127911925e-06, "loss": 0.0528, "step": 249700 }, { "epoch": 3.6777808868794275, "grad_norm": 1.0846768617630005, "learning_rate": 5.883019040824209e-06, "loss": 0.0472, "step": 249725 }, { "epoch": 3.6781490699695145, "grad_norm": 1.857362985610962, "learning_rate": 5.881382668857224e-06, "loss": 0.0554, "step": 249750 }, { "epoch": 3.6785172530596015, "grad_norm": 1.790833592414856, "learning_rate": 5.879746296890239e-06, "loss": 0.0556, "step": 249775 }, { "epoch": 3.6788854361496885, "grad_norm": 1.3882207870483398, "learning_rate": 5.878109924923255e-06, "loss": 0.0505, "step": 249800 }, { "epoch": 3.6792536192397756, "grad_norm": 1.0891906023025513, "learning_rate": 5.87647355295627e-06, "loss": 0.0473, "step": 249825 }, { "epoch": 3.6796218023298626, "grad_norm": 1.1623611450195312, "learning_rate": 5.874837180989286e-06, "loss": 0.0608, "step": 249850 }, { "epoch": 3.6799899854199496, "grad_norm": 0.8595478534698486, "learning_rate": 5.873200809022301e-06, "loss": 0.0541, "step": 249875 }, { "epoch": 3.6803581685100366, "grad_norm": 1.7485387325286865, "learning_rate": 5.871564437055316e-06, "loss": 0.0546, "step": 249900 }, { "epoch": 3.6807263516001236, "grad_norm": 0.9385265707969666, "learning_rate": 5.869928065088332e-06, "loss": 0.0498, "step": 249925 }, { "epoch": 3.6810945346902106, "grad_norm": 1.4399908781051636, "learning_rate": 5.868291693121347e-06, "loss": 0.0463, "step": 249950 }, { "epoch": 3.6814627177802977, "grad_norm": 1.282204031944275, "learning_rate": 5.866655321154363e-06, "loss": 0.0519, "step": 249975 }, { "epoch": 3.6818309008703847, "grad_norm": 1.3285976648330688, "learning_rate": 5.865018949187378e-06, "loss": 0.0486, "step": 250000 }, { "epoch": 3.682199083960472, "grad_norm": 1.1242409944534302, "learning_rate": 5.863382577220393e-06, "loss": 0.0625, "step": 250025 }, { "epoch": 3.682567267050559, "grad_norm": 1.1674872636795044, "learning_rate": 5.861746205253409e-06, "loss": 0.0594, "step": 250050 }, { "epoch": 3.682935450140646, "grad_norm": 1.760764241218567, "learning_rate": 5.860109833286425e-06, "loss": 0.0592, "step": 250075 }, { "epoch": 3.683303633230733, "grad_norm": 1.2220913171768188, "learning_rate": 5.858473461319439e-06, "loss": 0.0539, "step": 250100 }, { "epoch": 3.68367181632082, "grad_norm": 0.8641085028648376, "learning_rate": 5.856837089352455e-06, "loss": 0.0482, "step": 250125 }, { "epoch": 3.684039999410907, "grad_norm": 1.2287153005599976, "learning_rate": 5.8552007173854705e-06, "loss": 0.0487, "step": 250150 }, { "epoch": 3.6844081825009942, "grad_norm": 1.0244650840759277, "learning_rate": 5.853564345418487e-06, "loss": 0.0571, "step": 250175 }, { "epoch": 3.6847763655910812, "grad_norm": 1.1917394399642944, "learning_rate": 5.851927973451502e-06, "loss": 0.0567, "step": 250200 }, { "epoch": 3.6851445486811683, "grad_norm": 1.7676109075546265, "learning_rate": 5.8502916014845164e-06, "loss": 0.0509, "step": 250225 }, { "epoch": 3.6855127317712553, "grad_norm": 1.4125123023986816, "learning_rate": 5.848655229517533e-06, "loss": 0.057, "step": 250250 }, { "epoch": 3.6858809148613423, "grad_norm": 1.3902137279510498, "learning_rate": 5.847018857550548e-06, "loss": 0.0571, "step": 250275 }, { "epoch": 3.6862490979514293, "grad_norm": 1.337487816810608, "learning_rate": 5.845382485583564e-06, "loss": 0.0578, "step": 250300 }, { "epoch": 3.6866172810415163, "grad_norm": 1.1797250509262085, "learning_rate": 5.8437461136165785e-06, "loss": 0.0547, "step": 250325 }, { "epoch": 3.6869854641316033, "grad_norm": 1.3180617094039917, "learning_rate": 5.842109741649594e-06, "loss": 0.0558, "step": 250350 }, { "epoch": 3.6873536472216903, "grad_norm": 1.0192644596099854, "learning_rate": 5.84047336968261e-06, "loss": 0.0516, "step": 250375 }, { "epoch": 3.6877218303117774, "grad_norm": 1.0768439769744873, "learning_rate": 5.838836997715625e-06, "loss": 0.0627, "step": 250400 }, { "epoch": 3.6880900134018644, "grad_norm": 1.3138222694396973, "learning_rate": 5.8372006257486406e-06, "loss": 0.051, "step": 250425 }, { "epoch": 3.6884581964919514, "grad_norm": 1.0106867551803589, "learning_rate": 5.835564253781656e-06, "loss": 0.0534, "step": 250450 }, { "epoch": 3.6888263795820384, "grad_norm": 1.5678738355636597, "learning_rate": 5.833927881814672e-06, "loss": 0.0561, "step": 250475 }, { "epoch": 3.6891945626721254, "grad_norm": 1.904113531112671, "learning_rate": 5.832291509847687e-06, "loss": 0.0569, "step": 250500 }, { "epoch": 3.6895627457622124, "grad_norm": 0.9363990426063538, "learning_rate": 5.830655137880702e-06, "loss": 0.0522, "step": 250525 }, { "epoch": 3.6899309288522995, "grad_norm": 1.0286802053451538, "learning_rate": 5.829018765913718e-06, "loss": 0.0524, "step": 250550 }, { "epoch": 3.6902991119423865, "grad_norm": 0.9147318005561829, "learning_rate": 5.827382393946733e-06, "loss": 0.0498, "step": 250575 }, { "epoch": 3.6906672950324735, "grad_norm": 1.3984861373901367, "learning_rate": 5.825746021979749e-06, "loss": 0.0511, "step": 250600 }, { "epoch": 3.691035478122561, "grad_norm": 2.1696903705596924, "learning_rate": 5.824109650012764e-06, "loss": 0.0578, "step": 250625 }, { "epoch": 3.691403661212648, "grad_norm": 1.66084885597229, "learning_rate": 5.822473278045779e-06, "loss": 0.0522, "step": 250650 }, { "epoch": 3.691771844302735, "grad_norm": 1.3171850442886353, "learning_rate": 5.820836906078795e-06, "loss": 0.0497, "step": 250675 }, { "epoch": 3.692140027392822, "grad_norm": 0.9930122494697571, "learning_rate": 5.819200534111811e-06, "loss": 0.0504, "step": 250700 }, { "epoch": 3.692508210482909, "grad_norm": 1.7931667566299438, "learning_rate": 5.817564162144827e-06, "loss": 0.0501, "step": 250725 }, { "epoch": 3.692876393572996, "grad_norm": 1.1997019052505493, "learning_rate": 5.815927790177841e-06, "loss": 0.0492, "step": 250750 }, { "epoch": 3.693244576663083, "grad_norm": 1.5087249279022217, "learning_rate": 5.8142914182108565e-06, "loss": 0.0576, "step": 250775 }, { "epoch": 3.69361275975317, "grad_norm": 1.2138445377349854, "learning_rate": 5.812655046243873e-06, "loss": 0.0531, "step": 250800 }, { "epoch": 3.693980942843257, "grad_norm": 0.6003586053848267, "learning_rate": 5.811018674276888e-06, "loss": 0.0526, "step": 250825 }, { "epoch": 3.694349125933344, "grad_norm": 1.2139582633972168, "learning_rate": 5.8093823023099024e-06, "loss": 0.0496, "step": 250850 }, { "epoch": 3.694717309023431, "grad_norm": 1.5219069719314575, "learning_rate": 5.807745930342919e-06, "loss": 0.0548, "step": 250875 }, { "epoch": 3.695085492113518, "grad_norm": 1.328459620475769, "learning_rate": 5.806109558375934e-06, "loss": 0.0574, "step": 250900 }, { "epoch": 3.695453675203605, "grad_norm": 1.2728067636489868, "learning_rate": 5.80447318640895e-06, "loss": 0.0572, "step": 250925 }, { "epoch": 3.695821858293692, "grad_norm": 1.318516731262207, "learning_rate": 5.8028368144419645e-06, "loss": 0.0514, "step": 250950 }, { "epoch": 3.6961900413837796, "grad_norm": 1.19840407371521, "learning_rate": 5.80120044247498e-06, "loss": 0.0629, "step": 250975 }, { "epoch": 3.6965582244738666, "grad_norm": 0.6499559283256531, "learning_rate": 5.799564070507996e-06, "loss": 0.0482, "step": 251000 }, { "epoch": 3.6969264075639536, "grad_norm": 1.533064842224121, "learning_rate": 5.797927698541011e-06, "loss": 0.0587, "step": 251025 }, { "epoch": 3.6972945906540406, "grad_norm": 1.0622810125350952, "learning_rate": 5.7962913265740266e-06, "loss": 0.052, "step": 251050 }, { "epoch": 3.6976627737441277, "grad_norm": 1.281508207321167, "learning_rate": 5.794654954607042e-06, "loss": 0.0539, "step": 251075 }, { "epoch": 3.6980309568342147, "grad_norm": 1.468364953994751, "learning_rate": 5.793018582640057e-06, "loss": 0.0548, "step": 251100 }, { "epoch": 3.6983991399243017, "grad_norm": 1.375420331954956, "learning_rate": 5.791382210673073e-06, "loss": 0.0569, "step": 251125 }, { "epoch": 3.6987673230143887, "grad_norm": 1.0663232803344727, "learning_rate": 5.789745838706089e-06, "loss": 0.0545, "step": 251150 }, { "epoch": 3.6991355061044757, "grad_norm": 1.4968196153640747, "learning_rate": 5.788109466739104e-06, "loss": 0.0494, "step": 251175 }, { "epoch": 3.6995036891945627, "grad_norm": 1.2448912858963013, "learning_rate": 5.786473094772119e-06, "loss": 0.0535, "step": 251200 }, { "epoch": 3.6998718722846498, "grad_norm": 1.282294511795044, "learning_rate": 5.7848367228051346e-06, "loss": 0.0531, "step": 251225 }, { "epoch": 3.7002400553747368, "grad_norm": 1.6939245462417603, "learning_rate": 5.783200350838151e-06, "loss": 0.0562, "step": 251250 }, { "epoch": 3.700608238464824, "grad_norm": 1.7599190473556519, "learning_rate": 5.781563978871165e-06, "loss": 0.062, "step": 251275 }, { "epoch": 3.700976421554911, "grad_norm": 1.3975493907928467, "learning_rate": 5.779927606904181e-06, "loss": 0.048, "step": 251300 }, { "epoch": 3.701344604644998, "grad_norm": 1.0987061262130737, "learning_rate": 5.778291234937197e-06, "loss": 0.0583, "step": 251325 }, { "epoch": 3.701712787735085, "grad_norm": 1.5857268571853638, "learning_rate": 5.776654862970213e-06, "loss": 0.055, "step": 251350 }, { "epoch": 3.702080970825172, "grad_norm": 1.0517363548278809, "learning_rate": 5.775018491003227e-06, "loss": 0.0558, "step": 251375 }, { "epoch": 3.702449153915259, "grad_norm": 1.3491718769073486, "learning_rate": 5.7733821190362425e-06, "loss": 0.0519, "step": 251400 }, { "epoch": 3.702817337005346, "grad_norm": 1.9140602350234985, "learning_rate": 5.771745747069259e-06, "loss": 0.0527, "step": 251425 }, { "epoch": 3.703185520095433, "grad_norm": 1.5180044174194336, "learning_rate": 5.770109375102274e-06, "loss": 0.0459, "step": 251450 }, { "epoch": 3.70355370318552, "grad_norm": 1.3003180027008057, "learning_rate": 5.7684730031352884e-06, "loss": 0.0477, "step": 251475 }, { "epoch": 3.703921886275607, "grad_norm": 1.350250005722046, "learning_rate": 5.766836631168305e-06, "loss": 0.0612, "step": 251500 }, { "epoch": 3.704290069365694, "grad_norm": 1.6882920265197754, "learning_rate": 5.765265714079999e-06, "loss": 0.0461, "step": 251525 }, { "epoch": 3.704658252455781, "grad_norm": 1.538088083267212, "learning_rate": 5.7636293421130155e-06, "loss": 0.0487, "step": 251550 }, { "epoch": 3.7050264355458684, "grad_norm": 1.2595710754394531, "learning_rate": 5.76199297014603e-06, "loss": 0.0485, "step": 251575 }, { "epoch": 3.7053946186359554, "grad_norm": 1.2149754762649536, "learning_rate": 5.760356598179045e-06, "loss": 0.055, "step": 251600 }, { "epoch": 3.7057628017260424, "grad_norm": 0.9896544218063354, "learning_rate": 5.758720226212061e-06, "loss": 0.0564, "step": 251625 }, { "epoch": 3.7061309848161295, "grad_norm": 1.3770445585250854, "learning_rate": 5.757083854245077e-06, "loss": 0.052, "step": 251650 }, { "epoch": 3.7064991679062165, "grad_norm": 1.1469231843948364, "learning_rate": 5.755447482278093e-06, "loss": 0.045, "step": 251675 }, { "epoch": 3.7068673509963035, "grad_norm": 1.002680778503418, "learning_rate": 5.753811110311107e-06, "loss": 0.0502, "step": 251700 }, { "epoch": 3.7072355340863905, "grad_norm": 1.060140609741211, "learning_rate": 5.752174738344123e-06, "loss": 0.0603, "step": 251725 }, { "epoch": 3.7076037171764775, "grad_norm": 1.5683294534683228, "learning_rate": 5.750538366377139e-06, "loss": 0.0599, "step": 251750 }, { "epoch": 3.7079719002665645, "grad_norm": 0.8978787064552307, "learning_rate": 5.748901994410154e-06, "loss": 0.0538, "step": 251775 }, { "epoch": 3.7083400833566516, "grad_norm": 1.6562714576721191, "learning_rate": 5.7472656224431685e-06, "loss": 0.0569, "step": 251800 }, { "epoch": 3.7087082664467386, "grad_norm": 1.2582255601882935, "learning_rate": 5.745629250476185e-06, "loss": 0.0552, "step": 251825 }, { "epoch": 3.7090764495368256, "grad_norm": 1.4364217519760132, "learning_rate": 5.7439928785092e-06, "loss": 0.0544, "step": 251850 }, { "epoch": 3.7094446326269126, "grad_norm": 1.3657604455947876, "learning_rate": 5.742356506542216e-06, "loss": 0.0607, "step": 251875 }, { "epoch": 3.7098128157169996, "grad_norm": 1.0781053304672241, "learning_rate": 5.740720134575231e-06, "loss": 0.0524, "step": 251900 }, { "epoch": 3.7101809988070866, "grad_norm": 1.0399731397628784, "learning_rate": 5.739083762608246e-06, "loss": 0.0516, "step": 251925 }, { "epoch": 3.710549181897174, "grad_norm": 0.8589385151863098, "learning_rate": 5.7375128455199415e-06, "loss": 0.0544, "step": 251950 }, { "epoch": 3.710917364987261, "grad_norm": 0.9743944406509399, "learning_rate": 5.735876473552957e-06, "loss": 0.0545, "step": 251975 }, { "epoch": 3.711285548077348, "grad_norm": 1.2415448427200317, "learning_rate": 5.734240101585972e-06, "loss": 0.0526, "step": 252000 }, { "epoch": 3.711653731167435, "grad_norm": 1.7873594760894775, "learning_rate": 5.732603729618987e-06, "loss": 0.052, "step": 252025 }, { "epoch": 3.712021914257522, "grad_norm": 1.1923871040344238, "learning_rate": 5.730967357652003e-06, "loss": 0.057, "step": 252050 }, { "epoch": 3.712390097347609, "grad_norm": 1.191372275352478, "learning_rate": 5.729330985685019e-06, "loss": 0.0481, "step": 252075 }, { "epoch": 3.712758280437696, "grad_norm": 1.3412585258483887, "learning_rate": 5.727694613718034e-06, "loss": 0.0631, "step": 252100 }, { "epoch": 3.713126463527783, "grad_norm": 1.0661895275115967, "learning_rate": 5.7260582417510494e-06, "loss": 0.0543, "step": 252125 }, { "epoch": 3.71349464661787, "grad_norm": 1.5244656801223755, "learning_rate": 5.724421869784065e-06, "loss": 0.0526, "step": 252150 }, { "epoch": 3.7138628297079572, "grad_norm": 1.5156444311141968, "learning_rate": 5.722785497817081e-06, "loss": 0.0557, "step": 252175 }, { "epoch": 3.7142310127980442, "grad_norm": 2.343926429748535, "learning_rate": 5.721149125850096e-06, "loss": 0.0552, "step": 252200 }, { "epoch": 3.7145991958881313, "grad_norm": 1.3030555248260498, "learning_rate": 5.719512753883111e-06, "loss": 0.0514, "step": 252225 }, { "epoch": 3.7149673789782183, "grad_norm": 1.3795397281646729, "learning_rate": 5.717876381916127e-06, "loss": 0.051, "step": 252250 }, { "epoch": 3.7153355620683053, "grad_norm": 1.3526875972747803, "learning_rate": 5.716240009949142e-06, "loss": 0.0551, "step": 252275 }, { "epoch": 3.7157037451583923, "grad_norm": 0.9389641284942627, "learning_rate": 5.714603637982158e-06, "loss": 0.0528, "step": 252300 }, { "epoch": 3.7160719282484793, "grad_norm": 1.8082996606826782, "learning_rate": 5.712967266015173e-06, "loss": 0.0549, "step": 252325 }, { "epoch": 3.7164401113385663, "grad_norm": 2.0994391441345215, "learning_rate": 5.711330894048188e-06, "loss": 0.0546, "step": 252350 }, { "epoch": 3.7168082944286533, "grad_norm": 1.2304348945617676, "learning_rate": 5.709694522081204e-06, "loss": 0.0579, "step": 252375 }, { "epoch": 3.7171764775187404, "grad_norm": 1.218194603919983, "learning_rate": 5.7080581501142195e-06, "loss": 0.051, "step": 252400 }, { "epoch": 3.7175446606088274, "grad_norm": 0.8290098309516907, "learning_rate": 5.706421778147234e-06, "loss": 0.0501, "step": 252425 }, { "epoch": 3.7179128436989144, "grad_norm": 1.408225417137146, "learning_rate": 5.70478540618025e-06, "loss": 0.0575, "step": 252450 }, { "epoch": 3.7182810267890014, "grad_norm": 1.3667598962783813, "learning_rate": 5.703149034213265e-06, "loss": 0.047, "step": 252475 }, { "epoch": 3.7186492098790884, "grad_norm": 1.0042376518249512, "learning_rate": 5.7015126622462815e-06, "loss": 0.0482, "step": 252500 }, { "epoch": 3.7190173929691754, "grad_norm": 1.4553728103637695, "learning_rate": 5.699876290279296e-06, "loss": 0.056, "step": 252525 }, { "epoch": 3.719385576059263, "grad_norm": 1.4743157625198364, "learning_rate": 5.698239918312311e-06, "loss": 0.0497, "step": 252550 }, { "epoch": 3.71975375914935, "grad_norm": 1.0868481397628784, "learning_rate": 5.6966035463453275e-06, "loss": 0.0534, "step": 252575 }, { "epoch": 3.720121942239437, "grad_norm": 0.7166820168495178, "learning_rate": 5.694967174378343e-06, "loss": 0.0467, "step": 252600 }, { "epoch": 3.720490125329524, "grad_norm": 1.4140657186508179, "learning_rate": 5.693330802411359e-06, "loss": 0.0567, "step": 252625 }, { "epoch": 3.720858308419611, "grad_norm": 1.2269924879074097, "learning_rate": 5.691694430444373e-06, "loss": 0.0523, "step": 252650 }, { "epoch": 3.721226491509698, "grad_norm": 2.159184455871582, "learning_rate": 5.690058058477389e-06, "loss": 0.0526, "step": 252675 }, { "epoch": 3.721594674599785, "grad_norm": 1.8485740423202515, "learning_rate": 5.688421686510405e-06, "loss": 0.059, "step": 252700 }, { "epoch": 3.721962857689872, "grad_norm": 2.0930328369140625, "learning_rate": 5.68678531454342e-06, "loss": 0.0494, "step": 252725 }, { "epoch": 3.722331040779959, "grad_norm": 1.4516451358795166, "learning_rate": 5.6851489425764354e-06, "loss": 0.0565, "step": 252750 }, { "epoch": 3.722699223870046, "grad_norm": 1.2605059146881104, "learning_rate": 5.683512570609451e-06, "loss": 0.0529, "step": 252775 }, { "epoch": 3.723067406960133, "grad_norm": 0.9492141604423523, "learning_rate": 5.681876198642466e-06, "loss": 0.0521, "step": 252800 }, { "epoch": 3.72343559005022, "grad_norm": 1.4231098890304565, "learning_rate": 5.680239826675482e-06, "loss": 0.0567, "step": 252825 }, { "epoch": 3.723803773140307, "grad_norm": 1.3626117706298828, "learning_rate": 5.678603454708497e-06, "loss": 0.0498, "step": 252850 }, { "epoch": 3.724171956230394, "grad_norm": 1.5522030591964722, "learning_rate": 5.676967082741513e-06, "loss": 0.0495, "step": 252875 }, { "epoch": 3.7245401393204816, "grad_norm": 0.9307212829589844, "learning_rate": 5.675330710774528e-06, "loss": 0.0603, "step": 252900 }, { "epoch": 3.7249083224105686, "grad_norm": 1.206346869468689, "learning_rate": 5.673694338807543e-06, "loss": 0.0558, "step": 252925 }, { "epoch": 3.7252765055006556, "grad_norm": 1.2777684926986694, "learning_rate": 5.672057966840559e-06, "loss": 0.0517, "step": 252950 }, { "epoch": 3.7256446885907426, "grad_norm": 1.1456856727600098, "learning_rate": 5.670421594873574e-06, "loss": 0.0513, "step": 252975 }, { "epoch": 3.7260128716808296, "grad_norm": 1.420722246170044, "learning_rate": 5.66878522290659e-06, "loss": 0.0494, "step": 253000 }, { "epoch": 3.7263810547709166, "grad_norm": 1.6824190616607666, "learning_rate": 5.6671488509396055e-06, "loss": 0.0534, "step": 253025 }, { "epoch": 3.7267492378610036, "grad_norm": 1.2085250616073608, "learning_rate": 5.66551247897262e-06, "loss": 0.0528, "step": 253050 }, { "epoch": 3.7271174209510907, "grad_norm": 1.712480902671814, "learning_rate": 5.663876107005636e-06, "loss": 0.0572, "step": 253075 }, { "epoch": 3.7274856040411777, "grad_norm": 1.1883583068847656, "learning_rate": 5.662239735038651e-06, "loss": 0.0539, "step": 253100 }, { "epoch": 3.7278537871312647, "grad_norm": 1.7856664657592773, "learning_rate": 5.6606033630716675e-06, "loss": 0.0526, "step": 253125 }, { "epoch": 3.7282219702213517, "grad_norm": 1.5447059869766235, "learning_rate": 5.658966991104683e-06, "loss": 0.0526, "step": 253150 }, { "epoch": 3.7285901533114387, "grad_norm": 1.5127798318862915, "learning_rate": 5.657330619137697e-06, "loss": 0.0597, "step": 253175 }, { "epoch": 3.7289583364015257, "grad_norm": 1.3642851114273071, "learning_rate": 5.6556942471707135e-06, "loss": 0.0452, "step": 253200 }, { "epoch": 3.7293265194916128, "grad_norm": 1.419844150543213, "learning_rate": 5.654057875203729e-06, "loss": 0.0544, "step": 253225 }, { "epoch": 3.7296947025816998, "grad_norm": 0.8943883180618286, "learning_rate": 5.652421503236745e-06, "loss": 0.0484, "step": 253250 }, { "epoch": 3.730062885671787, "grad_norm": 1.8702971935272217, "learning_rate": 5.650785131269759e-06, "loss": 0.0543, "step": 253275 }, { "epoch": 3.730431068761874, "grad_norm": 1.5334993600845337, "learning_rate": 5.649148759302775e-06, "loss": 0.0561, "step": 253300 }, { "epoch": 3.730799251851961, "grad_norm": 1.957545518875122, "learning_rate": 5.647512387335791e-06, "loss": 0.0592, "step": 253325 }, { "epoch": 3.731167434942048, "grad_norm": 1.1380528211593628, "learning_rate": 5.645876015368806e-06, "loss": 0.0587, "step": 253350 }, { "epoch": 3.731535618032135, "grad_norm": 1.2202318906784058, "learning_rate": 5.644239643401821e-06, "loss": 0.0531, "step": 253375 }, { "epoch": 3.731903801122222, "grad_norm": 1.397199034690857, "learning_rate": 5.642603271434837e-06, "loss": 0.0544, "step": 253400 }, { "epoch": 3.732271984212309, "grad_norm": 1.5993881225585938, "learning_rate": 5.640966899467852e-06, "loss": 0.0527, "step": 253425 }, { "epoch": 3.732640167302396, "grad_norm": 1.0414845943450928, "learning_rate": 5.639330527500868e-06, "loss": 0.0472, "step": 253450 }, { "epoch": 3.733008350392483, "grad_norm": 1.3304674625396729, "learning_rate": 5.637694155533883e-06, "loss": 0.0538, "step": 253475 }, { "epoch": 3.7333765334825704, "grad_norm": 0.9491090774536133, "learning_rate": 5.636057783566898e-06, "loss": 0.0534, "step": 253500 }, { "epoch": 3.7337447165726574, "grad_norm": 1.0399004220962524, "learning_rate": 5.634421411599914e-06, "loss": 0.0565, "step": 253525 }, { "epoch": 3.7341128996627444, "grad_norm": 1.2867871522903442, "learning_rate": 5.632785039632929e-06, "loss": 0.0564, "step": 253550 }, { "epoch": 3.7344810827528314, "grad_norm": 1.3402962684631348, "learning_rate": 5.6311486676659456e-06, "loss": 0.0501, "step": 253575 }, { "epoch": 3.7348492658429184, "grad_norm": 0.974835216999054, "learning_rate": 5.62951229569896e-06, "loss": 0.0551, "step": 253600 }, { "epoch": 3.7352174489330054, "grad_norm": 1.0583165884017944, "learning_rate": 5.627875923731976e-06, "loss": 0.0535, "step": 253625 }, { "epoch": 3.7355856320230925, "grad_norm": 2.0062127113342285, "learning_rate": 5.6262395517649915e-06, "loss": 0.0567, "step": 253650 }, { "epoch": 3.7359538151131795, "grad_norm": 1.4993880987167358, "learning_rate": 5.624603179798007e-06, "loss": 0.0569, "step": 253675 }, { "epoch": 3.7363219982032665, "grad_norm": 1.6953601837158203, "learning_rate": 5.622966807831022e-06, "loss": 0.0545, "step": 253700 }, { "epoch": 3.7366901812933535, "grad_norm": 1.5041608810424805, "learning_rate": 5.621330435864037e-06, "loss": 0.0463, "step": 253725 }, { "epoch": 3.7370583643834405, "grad_norm": 1.3922077417373657, "learning_rate": 5.6196940638970535e-06, "loss": 0.0555, "step": 253750 }, { "epoch": 3.7374265474735275, "grad_norm": 1.3642810583114624, "learning_rate": 5.618057691930069e-06, "loss": 0.0523, "step": 253775 }, { "epoch": 3.7377947305636146, "grad_norm": 1.7539656162261963, "learning_rate": 5.616421319963083e-06, "loss": 0.0549, "step": 253800 }, { "epoch": 3.7381629136537016, "grad_norm": 1.55299973487854, "learning_rate": 5.6147849479960995e-06, "loss": 0.0561, "step": 253825 }, { "epoch": 3.738531096743789, "grad_norm": 1.2092570066452026, "learning_rate": 5.613148576029115e-06, "loss": 0.057, "step": 253850 }, { "epoch": 3.738899279833876, "grad_norm": 1.6553634405136108, "learning_rate": 5.611512204062131e-06, "loss": 0.0541, "step": 253875 }, { "epoch": 3.739267462923963, "grad_norm": 1.1082477569580078, "learning_rate": 5.609875832095145e-06, "loss": 0.0537, "step": 253900 }, { "epoch": 3.73963564601405, "grad_norm": 0.8774492144584656, "learning_rate": 5.608239460128161e-06, "loss": 0.0575, "step": 253925 }, { "epoch": 3.740003829104137, "grad_norm": 1.1553659439086914, "learning_rate": 5.606603088161177e-06, "loss": 0.0563, "step": 253950 }, { "epoch": 3.740372012194224, "grad_norm": 1.5415480136871338, "learning_rate": 5.604966716194192e-06, "loss": 0.0562, "step": 253975 }, { "epoch": 3.740740195284311, "grad_norm": 0.9468506574630737, "learning_rate": 5.603330344227207e-06, "loss": 0.0574, "step": 254000 }, { "epoch": 3.741108378374398, "grad_norm": 1.3352893590927124, "learning_rate": 5.601693972260223e-06, "loss": 0.0531, "step": 254025 }, { "epoch": 3.741476561464485, "grad_norm": 1.564494252204895, "learning_rate": 5.600057600293238e-06, "loss": 0.0516, "step": 254050 }, { "epoch": 3.741844744554572, "grad_norm": 1.5080046653747559, "learning_rate": 5.598421228326254e-06, "loss": 0.0501, "step": 254075 }, { "epoch": 3.742212927644659, "grad_norm": 1.3194935321807861, "learning_rate": 5.5967848563592695e-06, "loss": 0.0597, "step": 254100 }, { "epoch": 3.742581110734746, "grad_norm": 1.4618470668792725, "learning_rate": 5.595148484392284e-06, "loss": 0.0552, "step": 254125 }, { "epoch": 3.742949293824833, "grad_norm": 1.3278696537017822, "learning_rate": 5.5935775673039795e-06, "loss": 0.0577, "step": 254150 }, { "epoch": 3.7433174769149202, "grad_norm": 2.116835832595825, "learning_rate": 5.591941195336995e-06, "loss": 0.0624, "step": 254175 }, { "epoch": 3.7436856600050072, "grad_norm": 1.7616889476776123, "learning_rate": 5.590304823370011e-06, "loss": 0.0499, "step": 254200 }, { "epoch": 3.7440538430950943, "grad_norm": 2.1772472858428955, "learning_rate": 5.5886684514030254e-06, "loss": 0.0627, "step": 254225 }, { "epoch": 3.7444220261851813, "grad_norm": 1.0219078063964844, "learning_rate": 5.587032079436041e-06, "loss": 0.0526, "step": 254250 }, { "epoch": 3.7447902092752683, "grad_norm": 1.9629639387130737, "learning_rate": 5.585395707469057e-06, "loss": 0.0552, "step": 254275 }, { "epoch": 3.7451583923653553, "grad_norm": 1.9039746522903442, "learning_rate": 5.583759335502072e-06, "loss": 0.0523, "step": 254300 }, { "epoch": 3.7455265754554423, "grad_norm": 1.5746433734893799, "learning_rate": 5.5821229635350875e-06, "loss": 0.0534, "step": 254325 }, { "epoch": 3.7458947585455293, "grad_norm": 1.8572666645050049, "learning_rate": 5.580486591568103e-06, "loss": 0.0589, "step": 254350 }, { "epoch": 3.7462629416356164, "grad_norm": 0.8775358200073242, "learning_rate": 5.578850219601118e-06, "loss": 0.0491, "step": 254375 }, { "epoch": 3.7466311247257034, "grad_norm": 1.6323455572128296, "learning_rate": 5.577213847634134e-06, "loss": 0.0582, "step": 254400 }, { "epoch": 3.7469993078157904, "grad_norm": 1.3335814476013184, "learning_rate": 5.575577475667149e-06, "loss": 0.0557, "step": 254425 }, { "epoch": 3.747367490905878, "grad_norm": 0.894173264503479, "learning_rate": 5.573941103700165e-06, "loss": 0.0588, "step": 254450 }, { "epoch": 3.747735673995965, "grad_norm": 1.5173102617263794, "learning_rate": 5.57230473173318e-06, "loss": 0.0527, "step": 254475 }, { "epoch": 3.748103857086052, "grad_norm": 1.433074951171875, "learning_rate": 5.5706683597661955e-06, "loss": 0.0522, "step": 254500 }, { "epoch": 3.748472040176139, "grad_norm": 1.2709681987762451, "learning_rate": 5.569031987799212e-06, "loss": 0.0518, "step": 254525 }, { "epoch": 3.748840223266226, "grad_norm": 1.0563149452209473, "learning_rate": 5.567395615832226e-06, "loss": 0.0534, "step": 254550 }, { "epoch": 3.749208406356313, "grad_norm": 1.5839401483535767, "learning_rate": 5.565759243865242e-06, "loss": 0.0558, "step": 254575 }, { "epoch": 3.7495765894464, "grad_norm": 1.1206369400024414, "learning_rate": 5.5641228718982576e-06, "loss": 0.0594, "step": 254600 }, { "epoch": 3.749944772536487, "grad_norm": 1.8041942119598389, "learning_rate": 5.562486499931274e-06, "loss": 0.0549, "step": 254625 }, { "epoch": 3.750312955626574, "grad_norm": 1.2459949254989624, "learning_rate": 5.560850127964288e-06, "loss": 0.0529, "step": 254650 }, { "epoch": 3.750681138716661, "grad_norm": 1.857454776763916, "learning_rate": 5.5592137559973035e-06, "loss": 0.0643, "step": 254675 }, { "epoch": 3.751049321806748, "grad_norm": 0.8499749898910522, "learning_rate": 5.55757738403032e-06, "loss": 0.0506, "step": 254700 }, { "epoch": 3.751417504896835, "grad_norm": 1.7271147966384888, "learning_rate": 5.555941012063335e-06, "loss": 0.0575, "step": 254725 }, { "epoch": 3.751785687986922, "grad_norm": 1.2960542440414429, "learning_rate": 5.554304640096349e-06, "loss": 0.0592, "step": 254750 }, { "epoch": 3.752153871077009, "grad_norm": 1.0242388248443604, "learning_rate": 5.5526682681293655e-06, "loss": 0.0557, "step": 254775 }, { "epoch": 3.752522054167096, "grad_norm": 1.5435205698013306, "learning_rate": 5.551031896162381e-06, "loss": 0.0631, "step": 254800 }, { "epoch": 3.7528902372571835, "grad_norm": 1.2656018733978271, "learning_rate": 5.549395524195397e-06, "loss": 0.0463, "step": 254825 }, { "epoch": 3.7532584203472705, "grad_norm": 1.3564785718917847, "learning_rate": 5.5477591522284114e-06, "loss": 0.0563, "step": 254850 }, { "epoch": 3.7536266034373575, "grad_norm": 1.6872167587280273, "learning_rate": 5.546122780261427e-06, "loss": 0.0564, "step": 254875 }, { "epoch": 3.7539947865274446, "grad_norm": 1.6043498516082764, "learning_rate": 5.544486408294443e-06, "loss": 0.0574, "step": 254900 }, { "epoch": 3.7543629696175316, "grad_norm": 1.449638843536377, "learning_rate": 5.542850036327458e-06, "loss": 0.0598, "step": 254925 }, { "epoch": 3.7547311527076186, "grad_norm": 1.5360407829284668, "learning_rate": 5.541213664360473e-06, "loss": 0.0526, "step": 254950 }, { "epoch": 3.7550993357977056, "grad_norm": 1.4203702211380005, "learning_rate": 5.539577292393489e-06, "loss": 0.0487, "step": 254975 }, { "epoch": 3.7554675188877926, "grad_norm": 1.0612748861312866, "learning_rate": 5.537940920426504e-06, "loss": 0.0511, "step": 255000 }, { "epoch": 3.7558357019778796, "grad_norm": 1.6567435264587402, "learning_rate": 5.53630454845952e-06, "loss": 0.0613, "step": 255025 }, { "epoch": 3.7562038850679667, "grad_norm": 0.856922447681427, "learning_rate": 5.5346681764925356e-06, "loss": 0.0532, "step": 255050 }, { "epoch": 3.7565720681580537, "grad_norm": 1.659920573234558, "learning_rate": 5.53303180452555e-06, "loss": 0.0536, "step": 255075 }, { "epoch": 3.7569402512481407, "grad_norm": 1.4628435373306274, "learning_rate": 5.531395432558566e-06, "loss": 0.0571, "step": 255100 }, { "epoch": 3.7573084343382277, "grad_norm": 1.719956398010254, "learning_rate": 5.5297590605915815e-06, "loss": 0.0608, "step": 255125 }, { "epoch": 3.7576766174283147, "grad_norm": 1.5746419429779053, "learning_rate": 5.528122688624598e-06, "loss": 0.0561, "step": 255150 }, { "epoch": 3.7580448005184017, "grad_norm": 1.9329231977462769, "learning_rate": 5.526486316657612e-06, "loss": 0.054, "step": 255175 }, { "epoch": 3.7584129836084887, "grad_norm": 1.1097121238708496, "learning_rate": 5.524849944690628e-06, "loss": 0.0516, "step": 255200 }, { "epoch": 3.7587811666985758, "grad_norm": 1.089319109916687, "learning_rate": 5.5232135727236436e-06, "loss": 0.0494, "step": 255225 }, { "epoch": 3.7591493497886628, "grad_norm": 1.215819239616394, "learning_rate": 5.521577200756659e-06, "loss": 0.0556, "step": 255250 }, { "epoch": 3.75951753287875, "grad_norm": 1.770950436592102, "learning_rate": 5.519940828789674e-06, "loss": 0.0563, "step": 255275 }, { "epoch": 3.759885715968837, "grad_norm": 1.3925715684890747, "learning_rate": 5.5183044568226895e-06, "loss": 0.0575, "step": 255300 }, { "epoch": 3.760253899058924, "grad_norm": 1.2943941354751587, "learning_rate": 5.516668084855706e-06, "loss": 0.0629, "step": 255325 }, { "epoch": 3.760622082149011, "grad_norm": 1.6082313060760498, "learning_rate": 5.515031712888721e-06, "loss": 0.0563, "step": 255350 }, { "epoch": 3.760990265239098, "grad_norm": 1.6059378385543823, "learning_rate": 5.513395340921735e-06, "loss": 0.0478, "step": 255375 }, { "epoch": 3.761358448329185, "grad_norm": 1.0294955968856812, "learning_rate": 5.5117589689547515e-06, "loss": 0.0518, "step": 255400 }, { "epoch": 3.7617266314192723, "grad_norm": 1.270851492881775, "learning_rate": 5.510122596987767e-06, "loss": 0.0547, "step": 255425 }, { "epoch": 3.7620948145093593, "grad_norm": 1.1567546129226685, "learning_rate": 5.508486225020783e-06, "loss": 0.0548, "step": 255450 }, { "epoch": 3.7624629975994464, "grad_norm": 0.7632060647010803, "learning_rate": 5.5068498530537974e-06, "loss": 0.0521, "step": 255475 }, { "epoch": 3.7628311806895334, "grad_norm": 1.4948337078094482, "learning_rate": 5.505213481086813e-06, "loss": 0.0553, "step": 255500 }, { "epoch": 3.7631993637796204, "grad_norm": 1.7801198959350586, "learning_rate": 5.503577109119829e-06, "loss": 0.0581, "step": 255525 }, { "epoch": 3.7635675468697074, "grad_norm": 1.4474761486053467, "learning_rate": 5.501940737152844e-06, "loss": 0.0568, "step": 255550 }, { "epoch": 3.7639357299597944, "grad_norm": 1.2973910570144653, "learning_rate": 5.50030436518586e-06, "loss": 0.0514, "step": 255575 }, { "epoch": 3.7643039130498814, "grad_norm": 1.047560214996338, "learning_rate": 5.498667993218875e-06, "loss": 0.0508, "step": 255600 }, { "epoch": 3.7646720961399684, "grad_norm": 1.7053852081298828, "learning_rate": 5.49703162125189e-06, "loss": 0.0549, "step": 255625 }, { "epoch": 3.7650402792300555, "grad_norm": 1.4982037544250488, "learning_rate": 5.495395249284906e-06, "loss": 0.054, "step": 255650 }, { "epoch": 3.7654084623201425, "grad_norm": 1.4650391340255737, "learning_rate": 5.4937588773179216e-06, "loss": 0.0546, "step": 255675 }, { "epoch": 3.7657766454102295, "grad_norm": 0.7948215007781982, "learning_rate": 5.492122505350936e-06, "loss": 0.061, "step": 255700 }, { "epoch": 3.7661448285003165, "grad_norm": 1.7028142213821411, "learning_rate": 5.490486133383952e-06, "loss": 0.0559, "step": 255725 }, { "epoch": 3.7665130115904035, "grad_norm": 1.1448397636413574, "learning_rate": 5.4888497614169675e-06, "loss": 0.0504, "step": 255750 }, { "epoch": 3.766881194680491, "grad_norm": 1.411699891090393, "learning_rate": 5.487213389449984e-06, "loss": 0.0575, "step": 255775 }, { "epoch": 3.767249377770578, "grad_norm": 1.5449858903884888, "learning_rate": 5.485577017482998e-06, "loss": 0.052, "step": 255800 }, { "epoch": 3.767617560860665, "grad_norm": 1.1064664125442505, "learning_rate": 5.483940645516013e-06, "loss": 0.0474, "step": 255825 }, { "epoch": 3.767985743950752, "grad_norm": 1.473809838294983, "learning_rate": 5.4823042735490295e-06, "loss": 0.0528, "step": 255850 }, { "epoch": 3.768353927040839, "grad_norm": 1.6745480298995972, "learning_rate": 5.480667901582045e-06, "loss": 0.0589, "step": 255875 }, { "epoch": 3.768722110130926, "grad_norm": 1.6598762273788452, "learning_rate": 5.47903152961506e-06, "loss": 0.0554, "step": 255900 }, { "epoch": 3.769090293221013, "grad_norm": 1.3508578538894653, "learning_rate": 5.4773951576480755e-06, "loss": 0.0585, "step": 255925 }, { "epoch": 3.7694584763111, "grad_norm": 1.256160855293274, "learning_rate": 5.475758785681092e-06, "loss": 0.0509, "step": 255950 }, { "epoch": 3.769826659401187, "grad_norm": 1.4047116041183472, "learning_rate": 5.474122413714107e-06, "loss": 0.0552, "step": 255975 }, { "epoch": 3.770194842491274, "grad_norm": 0.8932319283485413, "learning_rate": 5.472486041747122e-06, "loss": 0.0521, "step": 256000 }, { "epoch": 3.770563025581361, "grad_norm": 1.0128567218780518, "learning_rate": 5.4708496697801375e-06, "loss": 0.053, "step": 256025 }, { "epoch": 3.770931208671448, "grad_norm": 1.258533000946045, "learning_rate": 5.469213297813153e-06, "loss": 0.0471, "step": 256050 }, { "epoch": 3.771299391761535, "grad_norm": 1.0562033653259277, "learning_rate": 5.467576925846169e-06, "loss": 0.052, "step": 256075 }, { "epoch": 3.771667574851622, "grad_norm": 1.2327269315719604, "learning_rate": 5.465940553879184e-06, "loss": 0.0549, "step": 256100 }, { "epoch": 3.772035757941709, "grad_norm": 1.4154163599014282, "learning_rate": 5.464304181912199e-06, "loss": 0.0561, "step": 256125 }, { "epoch": 3.772403941031796, "grad_norm": 1.3224729299545288, "learning_rate": 5.462667809945215e-06, "loss": 0.0526, "step": 256150 }, { "epoch": 3.7727721241218832, "grad_norm": 1.5392817258834839, "learning_rate": 5.46103143797823e-06, "loss": 0.0576, "step": 256175 }, { "epoch": 3.7731403072119702, "grad_norm": 1.4855889081954956, "learning_rate": 5.459395066011246e-06, "loss": 0.057, "step": 256200 }, { "epoch": 3.7735084903020573, "grad_norm": 1.3495925664901733, "learning_rate": 5.457758694044261e-06, "loss": 0.0538, "step": 256225 }, { "epoch": 3.7738766733921443, "grad_norm": 1.6467121839523315, "learning_rate": 5.456122322077276e-06, "loss": 0.0534, "step": 256250 }, { "epoch": 3.7742448564822313, "grad_norm": 1.9972038269042969, "learning_rate": 5.454485950110292e-06, "loss": 0.0595, "step": 256275 }, { "epoch": 3.7746130395723183, "grad_norm": 1.1029690504074097, "learning_rate": 5.4528495781433076e-06, "loss": 0.0505, "step": 256300 }, { "epoch": 3.7749812226624053, "grad_norm": 1.0059043169021606, "learning_rate": 5.451213206176322e-06, "loss": 0.0509, "step": 256325 }, { "epoch": 3.7753494057524923, "grad_norm": 0.9450321197509766, "learning_rate": 5.449576834209338e-06, "loss": 0.0504, "step": 256350 }, { "epoch": 3.77571758884258, "grad_norm": 0.9107796549797058, "learning_rate": 5.4479404622423535e-06, "loss": 0.0505, "step": 256375 }, { "epoch": 3.776085771932667, "grad_norm": 1.2592682838439941, "learning_rate": 5.44630409027537e-06, "loss": 0.0511, "step": 256400 }, { "epoch": 3.776453955022754, "grad_norm": 1.16099214553833, "learning_rate": 5.444667718308384e-06, "loss": 0.0488, "step": 256425 }, { "epoch": 3.776822138112841, "grad_norm": 0.9327324628829956, "learning_rate": 5.443031346341399e-06, "loss": 0.049, "step": 256450 }, { "epoch": 3.777190321202928, "grad_norm": 0.9738727807998657, "learning_rate": 5.4413949743744155e-06, "loss": 0.0494, "step": 256475 }, { "epoch": 3.777558504293015, "grad_norm": 1.8228766918182373, "learning_rate": 5.439758602407431e-06, "loss": 0.0548, "step": 256500 }, { "epoch": 3.777926687383102, "grad_norm": 1.2142353057861328, "learning_rate": 5.438122230440447e-06, "loss": 0.0512, "step": 256525 }, { "epoch": 3.778294870473189, "grad_norm": 0.9079768657684326, "learning_rate": 5.4364858584734615e-06, "loss": 0.0514, "step": 256550 }, { "epoch": 3.778663053563276, "grad_norm": 0.9658983945846558, "learning_rate": 5.434849486506477e-06, "loss": 0.0577, "step": 256575 }, { "epoch": 3.779031236653363, "grad_norm": 0.7127007246017456, "learning_rate": 5.433213114539493e-06, "loss": 0.0494, "step": 256600 }, { "epoch": 3.77939941974345, "grad_norm": 1.2438246011734009, "learning_rate": 5.431576742572508e-06, "loss": 0.0602, "step": 256625 }, { "epoch": 3.779767602833537, "grad_norm": 1.516676664352417, "learning_rate": 5.4299403706055235e-06, "loss": 0.057, "step": 256650 }, { "epoch": 3.780135785923624, "grad_norm": 1.3705201148986816, "learning_rate": 5.428303998638539e-06, "loss": 0.0524, "step": 256675 }, { "epoch": 3.780503969013711, "grad_norm": 0.9629042148590088, "learning_rate": 5.426667626671554e-06, "loss": 0.0547, "step": 256700 }, { "epoch": 3.780872152103798, "grad_norm": 1.5228092670440674, "learning_rate": 5.42503125470457e-06, "loss": 0.0534, "step": 256725 }, { "epoch": 3.7812403351938855, "grad_norm": 0.9180556535720825, "learning_rate": 5.423460337616264e-06, "loss": 0.0454, "step": 256750 }, { "epoch": 3.7816085182839725, "grad_norm": 1.1951689720153809, "learning_rate": 5.42182396564928e-06, "loss": 0.054, "step": 256775 }, { "epoch": 3.7819767013740595, "grad_norm": 1.231819748878479, "learning_rate": 5.420187593682296e-06, "loss": 0.0459, "step": 256800 }, { "epoch": 3.7823448844641465, "grad_norm": 1.658528447151184, "learning_rate": 5.418551221715311e-06, "loss": 0.0583, "step": 256825 }, { "epoch": 3.7827130675542335, "grad_norm": 1.0634377002716064, "learning_rate": 5.416914849748326e-06, "loss": 0.0495, "step": 256850 }, { "epoch": 3.7830812506443205, "grad_norm": 1.3042906522750854, "learning_rate": 5.4152784777813415e-06, "loss": 0.0478, "step": 256875 }, { "epoch": 3.7834494337344076, "grad_norm": 1.640803575515747, "learning_rate": 5.413642105814358e-06, "loss": 0.0506, "step": 256900 }, { "epoch": 3.7838176168244946, "grad_norm": 1.1699762344360352, "learning_rate": 5.412005733847373e-06, "loss": 0.0562, "step": 256925 }, { "epoch": 3.7841857999145816, "grad_norm": 1.746739387512207, "learning_rate": 5.410369361880389e-06, "loss": 0.0497, "step": 256950 }, { "epoch": 3.7845539830046686, "grad_norm": 1.1424062252044678, "learning_rate": 5.408732989913404e-06, "loss": 0.0565, "step": 256975 }, { "epoch": 3.7849221660947556, "grad_norm": 1.3724812269210815, "learning_rate": 5.407096617946419e-06, "loss": 0.0548, "step": 257000 }, { "epoch": 3.7852903491848426, "grad_norm": 1.2119040489196777, "learning_rate": 5.405460245979435e-06, "loss": 0.05, "step": 257025 }, { "epoch": 3.7856585322749297, "grad_norm": 1.6405060291290283, "learning_rate": 5.40382387401245e-06, "loss": 0.0516, "step": 257050 }, { "epoch": 3.7860267153650167, "grad_norm": 1.511218547821045, "learning_rate": 5.402187502045465e-06, "loss": 0.0593, "step": 257075 }, { "epoch": 3.7863948984551037, "grad_norm": 1.2260838747024536, "learning_rate": 5.400551130078481e-06, "loss": 0.0492, "step": 257100 }, { "epoch": 3.7867630815451907, "grad_norm": 0.8578277826309204, "learning_rate": 5.398914758111496e-06, "loss": 0.0527, "step": 257125 }, { "epoch": 3.7871312646352777, "grad_norm": 1.2542473077774048, "learning_rate": 5.3972783861445124e-06, "loss": 0.0541, "step": 257150 }, { "epoch": 3.7874994477253647, "grad_norm": 1.1171574592590332, "learning_rate": 5.395642014177527e-06, "loss": 0.0549, "step": 257175 }, { "epoch": 3.7878676308154517, "grad_norm": 1.5778684616088867, "learning_rate": 5.394005642210542e-06, "loss": 0.0609, "step": 257200 }, { "epoch": 3.7882358139055388, "grad_norm": 1.9366440773010254, "learning_rate": 5.392369270243558e-06, "loss": 0.052, "step": 257225 }, { "epoch": 3.7886039969956258, "grad_norm": 1.5419654846191406, "learning_rate": 5.390732898276574e-06, "loss": 0.0547, "step": 257250 }, { "epoch": 3.788972180085713, "grad_norm": 1.533220648765564, "learning_rate": 5.389096526309588e-06, "loss": 0.0539, "step": 257275 }, { "epoch": 3.7893403631758, "grad_norm": 1.63277268409729, "learning_rate": 5.387460154342604e-06, "loss": 0.0561, "step": 257300 }, { "epoch": 3.789708546265887, "grad_norm": 1.1307544708251953, "learning_rate": 5.3858237823756196e-06, "loss": 0.0546, "step": 257325 }, { "epoch": 3.7900767293559743, "grad_norm": 1.6596407890319824, "learning_rate": 5.384187410408636e-06, "loss": 0.0522, "step": 257350 }, { "epoch": 3.7904449124460613, "grad_norm": 1.5582321882247925, "learning_rate": 5.38255103844165e-06, "loss": 0.0566, "step": 257375 }, { "epoch": 3.7908130955361483, "grad_norm": 1.3519046306610107, "learning_rate": 5.3809146664746655e-06, "loss": 0.0588, "step": 257400 }, { "epoch": 3.7911812786262353, "grad_norm": 1.0576746463775635, "learning_rate": 5.379278294507682e-06, "loss": 0.0539, "step": 257425 }, { "epoch": 3.7915494617163223, "grad_norm": 0.9776924848556519, "learning_rate": 5.377641922540697e-06, "loss": 0.0547, "step": 257450 }, { "epoch": 3.7919176448064094, "grad_norm": 1.6325174570083618, "learning_rate": 5.376005550573713e-06, "loss": 0.0503, "step": 257475 }, { "epoch": 3.7922858278964964, "grad_norm": 1.3017586469650269, "learning_rate": 5.3743691786067275e-06, "loss": 0.0612, "step": 257500 }, { "epoch": 3.7926540109865834, "grad_norm": 1.3604521751403809, "learning_rate": 5.372732806639744e-06, "loss": 0.0495, "step": 257525 }, { "epoch": 3.7930221940766704, "grad_norm": 0.9956885576248169, "learning_rate": 5.371096434672759e-06, "loss": 0.0534, "step": 257550 }, { "epoch": 3.7933903771667574, "grad_norm": 1.5488227605819702, "learning_rate": 5.369460062705774e-06, "loss": 0.0565, "step": 257575 }, { "epoch": 3.7937585602568444, "grad_norm": 1.2222161293029785, "learning_rate": 5.36782369073879e-06, "loss": 0.0543, "step": 257600 }, { "epoch": 3.7941267433469315, "grad_norm": 1.633481740951538, "learning_rate": 5.366187318771805e-06, "loss": 0.0516, "step": 257625 }, { "epoch": 3.7944949264370185, "grad_norm": 1.9253498315811157, "learning_rate": 5.364550946804821e-06, "loss": 0.055, "step": 257650 }, { "epoch": 3.7948631095271055, "grad_norm": 1.1269255876541138, "learning_rate": 5.362914574837836e-06, "loss": 0.0598, "step": 257675 }, { "epoch": 3.795231292617193, "grad_norm": 1.2639343738555908, "learning_rate": 5.361278202870851e-06, "loss": 0.0523, "step": 257700 }, { "epoch": 3.79559947570728, "grad_norm": 0.8839380145072937, "learning_rate": 5.359641830903867e-06, "loss": 0.0578, "step": 257725 }, { "epoch": 3.795967658797367, "grad_norm": 1.2928338050842285, "learning_rate": 5.358005458936882e-06, "loss": 0.0519, "step": 257750 }, { "epoch": 3.796335841887454, "grad_norm": 0.8707910776138306, "learning_rate": 5.3563690869698984e-06, "loss": 0.0553, "step": 257775 }, { "epoch": 3.796704024977541, "grad_norm": 1.1992079019546509, "learning_rate": 5.354732715002913e-06, "loss": 0.054, "step": 257800 }, { "epoch": 3.797072208067628, "grad_norm": 2.014716148376465, "learning_rate": 5.353096343035928e-06, "loss": 0.0556, "step": 257825 }, { "epoch": 3.797440391157715, "grad_norm": 1.2445027828216553, "learning_rate": 5.351459971068944e-06, "loss": 0.0549, "step": 257850 }, { "epoch": 3.797808574247802, "grad_norm": 1.348606824874878, "learning_rate": 5.34982359910196e-06, "loss": 0.0552, "step": 257875 }, { "epoch": 3.798176757337889, "grad_norm": 1.4702867269515991, "learning_rate": 5.348187227134974e-06, "loss": 0.0526, "step": 257900 }, { "epoch": 3.798544940427976, "grad_norm": 2.106762409210205, "learning_rate": 5.34655085516799e-06, "loss": 0.057, "step": 257925 }, { "epoch": 3.798913123518063, "grad_norm": 1.7450799942016602, "learning_rate": 5.3449144832010056e-06, "loss": 0.0539, "step": 257950 }, { "epoch": 3.79928130660815, "grad_norm": 1.56754469871521, "learning_rate": 5.343278111234022e-06, "loss": 0.0484, "step": 257975 }, { "epoch": 3.799649489698237, "grad_norm": 1.1302671432495117, "learning_rate": 5.341641739267037e-06, "loss": 0.0552, "step": 258000 }, { "epoch": 3.800017672788324, "grad_norm": 1.7833666801452637, "learning_rate": 5.3400053673000515e-06, "loss": 0.0575, "step": 258025 }, { "epoch": 3.800385855878411, "grad_norm": 1.1021578311920166, "learning_rate": 5.338368995333068e-06, "loss": 0.059, "step": 258050 }, { "epoch": 3.800754038968498, "grad_norm": 1.235292673110962, "learning_rate": 5.336732623366083e-06, "loss": 0.0503, "step": 258075 }, { "epoch": 3.801122222058585, "grad_norm": 1.4408515691757202, "learning_rate": 5.335096251399099e-06, "loss": 0.0605, "step": 258100 }, { "epoch": 3.801490405148672, "grad_norm": 1.5457375049591064, "learning_rate": 5.3334598794321135e-06, "loss": 0.0512, "step": 258125 }, { "epoch": 3.801858588238759, "grad_norm": 1.3230416774749756, "learning_rate": 5.331823507465129e-06, "loss": 0.0516, "step": 258150 }, { "epoch": 3.8022267713288462, "grad_norm": 0.7777018547058105, "learning_rate": 5.330187135498145e-06, "loss": 0.0459, "step": 258175 }, { "epoch": 3.8025949544189332, "grad_norm": 1.3024753332138062, "learning_rate": 5.32855076353116e-06, "loss": 0.0559, "step": 258200 }, { "epoch": 3.8029631375090203, "grad_norm": 1.4986072778701782, "learning_rate": 5.326914391564176e-06, "loss": 0.058, "step": 258225 }, { "epoch": 3.8033313205991073, "grad_norm": 1.0987248420715332, "learning_rate": 5.325278019597191e-06, "loss": 0.052, "step": 258250 }, { "epoch": 3.8036995036891943, "grad_norm": 1.2068313360214233, "learning_rate": 5.323641647630207e-06, "loss": 0.0524, "step": 258275 }, { "epoch": 3.8040676867792818, "grad_norm": 1.3169461488723755, "learning_rate": 5.322005275663222e-06, "loss": 0.0561, "step": 258300 }, { "epoch": 3.8044358698693688, "grad_norm": 0.6551405191421509, "learning_rate": 5.320368903696237e-06, "loss": 0.0533, "step": 258325 }, { "epoch": 3.804804052959456, "grad_norm": 1.0101947784423828, "learning_rate": 5.318732531729253e-06, "loss": 0.0532, "step": 258350 }, { "epoch": 3.805172236049543, "grad_norm": 0.8900593519210815, "learning_rate": 5.317096159762268e-06, "loss": 0.0472, "step": 258375 }, { "epoch": 3.80554041913963, "grad_norm": 1.1752017736434937, "learning_rate": 5.315459787795284e-06, "loss": 0.0559, "step": 258400 }, { "epoch": 3.805908602229717, "grad_norm": 1.813315987586975, "learning_rate": 5.3138234158283e-06, "loss": 0.0616, "step": 258425 }, { "epoch": 3.806276785319804, "grad_norm": 1.255013108253479, "learning_rate": 5.312187043861314e-06, "loss": 0.0555, "step": 258450 }, { "epoch": 3.806644968409891, "grad_norm": 1.2944269180297852, "learning_rate": 5.31055067189433e-06, "loss": 0.0603, "step": 258475 }, { "epoch": 3.807013151499978, "grad_norm": 1.3191167116165161, "learning_rate": 5.308914299927346e-06, "loss": 0.0504, "step": 258500 }, { "epoch": 3.807381334590065, "grad_norm": 1.7232218980789185, "learning_rate": 5.307277927960362e-06, "loss": 0.0567, "step": 258525 }, { "epoch": 3.807749517680152, "grad_norm": 1.3582383394241333, "learning_rate": 5.305641555993376e-06, "loss": 0.0576, "step": 258550 }, { "epoch": 3.808117700770239, "grad_norm": 1.328294038772583, "learning_rate": 5.3040051840263916e-06, "loss": 0.0601, "step": 258575 }, { "epoch": 3.808485883860326, "grad_norm": 1.5954055786132812, "learning_rate": 5.302368812059408e-06, "loss": 0.0512, "step": 258600 }, { "epoch": 3.808854066950413, "grad_norm": 1.3190314769744873, "learning_rate": 5.300732440092423e-06, "loss": 0.0492, "step": 258625 }, { "epoch": 3.8092222500405004, "grad_norm": 1.1978700160980225, "learning_rate": 5.2990960681254375e-06, "loss": 0.0588, "step": 258650 }, { "epoch": 3.8095904331305874, "grad_norm": 1.455289363861084, "learning_rate": 5.297459696158454e-06, "loss": 0.0479, "step": 258675 }, { "epoch": 3.8099586162206744, "grad_norm": 1.4614241123199463, "learning_rate": 5.295823324191469e-06, "loss": 0.0562, "step": 258700 }, { "epoch": 3.8103267993107615, "grad_norm": 1.2461235523223877, "learning_rate": 5.294186952224485e-06, "loss": 0.0497, "step": 258725 }, { "epoch": 3.8106949824008485, "grad_norm": 1.2991175651550293, "learning_rate": 5.2925505802574995e-06, "loss": 0.0614, "step": 258750 }, { "epoch": 3.8110631654909355, "grad_norm": 1.3581284284591675, "learning_rate": 5.290914208290515e-06, "loss": 0.0553, "step": 258775 }, { "epoch": 3.8114313485810225, "grad_norm": 1.3191865682601929, "learning_rate": 5.289277836323531e-06, "loss": 0.0487, "step": 258800 }, { "epoch": 3.8117995316711095, "grad_norm": 1.309084415435791, "learning_rate": 5.287641464356546e-06, "loss": 0.0535, "step": 258825 }, { "epoch": 3.8121677147611965, "grad_norm": 1.2470643520355225, "learning_rate": 5.286005092389561e-06, "loss": 0.0565, "step": 258850 }, { "epoch": 3.8125358978512836, "grad_norm": 1.290117859840393, "learning_rate": 5.284368720422577e-06, "loss": 0.0492, "step": 258875 }, { "epoch": 3.8129040809413706, "grad_norm": 1.2575085163116455, "learning_rate": 5.282732348455592e-06, "loss": 0.0467, "step": 258900 }, { "epoch": 3.8132722640314576, "grad_norm": 1.0561531782150269, "learning_rate": 5.281095976488608e-06, "loss": 0.0465, "step": 258925 }, { "epoch": 3.8136404471215446, "grad_norm": 1.3675233125686646, "learning_rate": 5.279459604521624e-06, "loss": 0.0576, "step": 258950 }, { "epoch": 3.8140086302116316, "grad_norm": 1.7130180597305298, "learning_rate": 5.277823232554639e-06, "loss": 0.0512, "step": 258975 }, { "epoch": 3.8143768133017186, "grad_norm": 1.4887468814849854, "learning_rate": 5.276186860587654e-06, "loss": 0.0585, "step": 259000 }, { "epoch": 3.8147449963918056, "grad_norm": 1.6468786001205444, "learning_rate": 5.2745504886206696e-06, "loss": 0.0577, "step": 259025 }, { "epoch": 3.8151131794818927, "grad_norm": 1.610073447227478, "learning_rate": 5.272914116653686e-06, "loss": 0.0531, "step": 259050 }, { "epoch": 3.8154813625719797, "grad_norm": 1.2798658609390259, "learning_rate": 5.2712777446867e-06, "loss": 0.0537, "step": 259075 }, { "epoch": 3.8158495456620667, "grad_norm": 1.083975911140442, "learning_rate": 5.269706827598396e-06, "loss": 0.0554, "step": 259100 }, { "epoch": 3.8162177287521537, "grad_norm": 1.5547819137573242, "learning_rate": 5.268070455631411e-06, "loss": 0.0537, "step": 259125 }, { "epoch": 3.8165859118422407, "grad_norm": 1.2542880773544312, "learning_rate": 5.266434083664426e-06, "loss": 0.0545, "step": 259150 }, { "epoch": 3.8169540949323277, "grad_norm": 1.0080958604812622, "learning_rate": 5.264797711697442e-06, "loss": 0.0548, "step": 259175 }, { "epoch": 3.8173222780224147, "grad_norm": 1.3396576642990112, "learning_rate": 5.263161339730457e-06, "loss": 0.0487, "step": 259200 }, { "epoch": 3.8176904611125018, "grad_norm": 1.5910086631774902, "learning_rate": 5.261524967763473e-06, "loss": 0.0472, "step": 259225 }, { "epoch": 3.8180586442025892, "grad_norm": 1.5560002326965332, "learning_rate": 5.2598885957964884e-06, "loss": 0.0481, "step": 259250 }, { "epoch": 3.8184268272926762, "grad_norm": 1.1967967748641968, "learning_rate": 5.258252223829503e-06, "loss": 0.0675, "step": 259275 }, { "epoch": 3.8187950103827633, "grad_norm": 1.1806341409683228, "learning_rate": 5.256615851862519e-06, "loss": 0.0596, "step": 259300 }, { "epoch": 3.8191631934728503, "grad_norm": 1.0007059574127197, "learning_rate": 5.254979479895534e-06, "loss": 0.0547, "step": 259325 }, { "epoch": 3.8195313765629373, "grad_norm": 1.5026322603225708, "learning_rate": 5.2533431079285505e-06, "loss": 0.053, "step": 259350 }, { "epoch": 3.8198995596530243, "grad_norm": 2.126260280609131, "learning_rate": 5.251706735961565e-06, "loss": 0.0604, "step": 259375 }, { "epoch": 3.8202677427431113, "grad_norm": 1.415503740310669, "learning_rate": 5.25007036399458e-06, "loss": 0.0551, "step": 259400 }, { "epoch": 3.8206359258331983, "grad_norm": 1.8415940999984741, "learning_rate": 5.248433992027596e-06, "loss": 0.0535, "step": 259425 }, { "epoch": 3.8210041089232853, "grad_norm": 1.6008800268173218, "learning_rate": 5.246797620060612e-06, "loss": 0.0547, "step": 259450 }, { "epoch": 3.8213722920133724, "grad_norm": 1.4544116258621216, "learning_rate": 5.245161248093628e-06, "loss": 0.0492, "step": 259475 }, { "epoch": 3.8217404751034594, "grad_norm": 0.896043062210083, "learning_rate": 5.243524876126642e-06, "loss": 0.0548, "step": 259500 }, { "epoch": 3.8221086581935464, "grad_norm": 1.6785802841186523, "learning_rate": 5.241888504159658e-06, "loss": 0.0565, "step": 259525 }, { "epoch": 3.8224768412836334, "grad_norm": 0.8544201254844666, "learning_rate": 5.240252132192674e-06, "loss": 0.0536, "step": 259550 }, { "epoch": 3.8228450243737204, "grad_norm": 1.707797884941101, "learning_rate": 5.238615760225689e-06, "loss": 0.0525, "step": 259575 }, { "epoch": 3.8232132074638074, "grad_norm": 1.360618233680725, "learning_rate": 5.2369793882587035e-06, "loss": 0.0506, "step": 259600 }, { "epoch": 3.823581390553895, "grad_norm": 1.5496617555618286, "learning_rate": 5.23534301629172e-06, "loss": 0.0517, "step": 259625 }, { "epoch": 3.823949573643982, "grad_norm": 1.1844253540039062, "learning_rate": 5.233706644324735e-06, "loss": 0.0532, "step": 259650 }, { "epoch": 3.824317756734069, "grad_norm": 1.2134591341018677, "learning_rate": 5.232070272357751e-06, "loss": 0.0538, "step": 259675 }, { "epoch": 3.824685939824156, "grad_norm": 1.4161320924758911, "learning_rate": 5.230433900390766e-06, "loss": 0.0616, "step": 259700 }, { "epoch": 3.825054122914243, "grad_norm": 1.357408881187439, "learning_rate": 5.228797528423781e-06, "loss": 0.055, "step": 259725 }, { "epoch": 3.82542230600433, "grad_norm": 1.6354411840438843, "learning_rate": 5.227161156456797e-06, "loss": 0.0621, "step": 259750 }, { "epoch": 3.825790489094417, "grad_norm": 1.4002183675765991, "learning_rate": 5.225524784489812e-06, "loss": 0.0521, "step": 259775 }, { "epoch": 3.826158672184504, "grad_norm": 1.368042230606079, "learning_rate": 5.223888412522828e-06, "loss": 0.0456, "step": 259800 }, { "epoch": 3.826526855274591, "grad_norm": 1.519740104675293, "learning_rate": 5.222252040555843e-06, "loss": 0.0555, "step": 259825 }, { "epoch": 3.826895038364678, "grad_norm": 1.3023755550384521, "learning_rate": 5.220615668588859e-06, "loss": 0.0566, "step": 259850 }, { "epoch": 3.827263221454765, "grad_norm": 1.1263644695281982, "learning_rate": 5.2189792966218744e-06, "loss": 0.0544, "step": 259875 }, { "epoch": 3.827631404544852, "grad_norm": 1.0691365003585815, "learning_rate": 5.21734292465489e-06, "loss": 0.0468, "step": 259900 }, { "epoch": 3.827999587634939, "grad_norm": 1.4326441287994385, "learning_rate": 5.215706552687905e-06, "loss": 0.0489, "step": 259925 }, { "epoch": 3.828367770725026, "grad_norm": 1.323684573173523, "learning_rate": 5.21407018072092e-06, "loss": 0.0589, "step": 259950 }, { "epoch": 3.828735953815113, "grad_norm": 1.7488555908203125, "learning_rate": 5.2124338087539365e-06, "loss": 0.0552, "step": 259975 }, { "epoch": 3.8291041369052, "grad_norm": 1.8526763916015625, "learning_rate": 5.210797436786952e-06, "loss": 0.0521, "step": 260000 }, { "epoch": 3.829472319995287, "grad_norm": 1.5918937921524048, "learning_rate": 5.209161064819966e-06, "loss": 0.0545, "step": 260025 }, { "epoch": 3.829840503085374, "grad_norm": 1.4427952766418457, "learning_rate": 5.207524692852982e-06, "loss": 0.05, "step": 260050 }, { "epoch": 3.830208686175461, "grad_norm": 1.1258643865585327, "learning_rate": 5.205888320885998e-06, "loss": 0.0542, "step": 260075 }, { "epoch": 3.830576869265548, "grad_norm": 1.606545329093933, "learning_rate": 5.204251948919014e-06, "loss": 0.0619, "step": 260100 }, { "epoch": 3.830945052355635, "grad_norm": 1.4542641639709473, "learning_rate": 5.202615576952028e-06, "loss": 0.0513, "step": 260125 }, { "epoch": 3.831313235445722, "grad_norm": 1.8232176303863525, "learning_rate": 5.200979204985044e-06, "loss": 0.0529, "step": 260150 }, { "epoch": 3.8316814185358092, "grad_norm": 1.6592656373977661, "learning_rate": 5.19934283301806e-06, "loss": 0.0504, "step": 260175 }, { "epoch": 3.8320496016258963, "grad_norm": 1.9837111234664917, "learning_rate": 5.197706461051075e-06, "loss": 0.0633, "step": 260200 }, { "epoch": 3.8324177847159837, "grad_norm": 1.306334376335144, "learning_rate": 5.1960700890840895e-06, "loss": 0.0496, "step": 260225 }, { "epoch": 3.8327859678060707, "grad_norm": 1.4030455350875854, "learning_rate": 5.194433717117106e-06, "loss": 0.0469, "step": 260250 }, { "epoch": 3.8331541508961577, "grad_norm": 1.6958765983581543, "learning_rate": 5.192797345150121e-06, "loss": 0.0579, "step": 260275 }, { "epoch": 3.8335223339862448, "grad_norm": 1.386186957359314, "learning_rate": 5.191160973183137e-06, "loss": 0.0532, "step": 260300 }, { "epoch": 3.8338905170763318, "grad_norm": 1.8941980600357056, "learning_rate": 5.189524601216152e-06, "loss": 0.0579, "step": 260325 }, { "epoch": 3.834258700166419, "grad_norm": 1.5502009391784668, "learning_rate": 5.187888229249167e-06, "loss": 0.0561, "step": 260350 }, { "epoch": 3.834626883256506, "grad_norm": 1.8309978246688843, "learning_rate": 5.186251857282183e-06, "loss": 0.0554, "step": 260375 }, { "epoch": 3.834995066346593, "grad_norm": 1.4434939622879028, "learning_rate": 5.184615485315198e-06, "loss": 0.0545, "step": 260400 }, { "epoch": 3.83536324943668, "grad_norm": 0.6179027557373047, "learning_rate": 5.1829791133482145e-06, "loss": 0.0498, "step": 260425 }, { "epoch": 3.835731432526767, "grad_norm": 1.7048661708831787, "learning_rate": 5.181342741381229e-06, "loss": 0.0601, "step": 260450 }, { "epoch": 3.836099615616854, "grad_norm": 1.4048082828521729, "learning_rate": 5.179706369414244e-06, "loss": 0.0586, "step": 260475 }, { "epoch": 3.836467798706941, "grad_norm": 1.4202415943145752, "learning_rate": 5.1780699974472604e-06, "loss": 0.0634, "step": 260500 }, { "epoch": 3.836835981797028, "grad_norm": 1.163485050201416, "learning_rate": 5.176433625480276e-06, "loss": 0.0556, "step": 260525 }, { "epoch": 3.837204164887115, "grad_norm": 1.1400727033615112, "learning_rate": 5.174797253513291e-06, "loss": 0.0541, "step": 260550 }, { "epoch": 3.8375723479772024, "grad_norm": 1.8711127042770386, "learning_rate": 5.173160881546306e-06, "loss": 0.0542, "step": 260575 }, { "epoch": 3.8379405310672894, "grad_norm": 1.0933290719985962, "learning_rate": 5.171524509579322e-06, "loss": 0.0541, "step": 260600 }, { "epoch": 3.8383087141573764, "grad_norm": 1.738000512123108, "learning_rate": 5.169888137612338e-06, "loss": 0.0534, "step": 260625 }, { "epoch": 3.8386768972474634, "grad_norm": 1.521758794784546, "learning_rate": 5.168251765645352e-06, "loss": 0.0523, "step": 260650 }, { "epoch": 3.8390450803375504, "grad_norm": 1.4863930940628052, "learning_rate": 5.166615393678368e-06, "loss": 0.0525, "step": 260675 }, { "epoch": 3.8394132634276374, "grad_norm": 1.4816769361495972, "learning_rate": 5.164979021711384e-06, "loss": 0.0519, "step": 260700 }, { "epoch": 3.8397814465177245, "grad_norm": 1.8269590139389038, "learning_rate": 5.1633426497444e-06, "loss": 0.0539, "step": 260725 }, { "epoch": 3.8401496296078115, "grad_norm": 1.7181164026260376, "learning_rate": 5.161706277777414e-06, "loss": 0.0513, "step": 260750 }, { "epoch": 3.8405178126978985, "grad_norm": 1.3441529273986816, "learning_rate": 5.16006990581043e-06, "loss": 0.0532, "step": 260775 }, { "epoch": 3.8408859957879855, "grad_norm": 1.526574730873108, "learning_rate": 5.158433533843446e-06, "loss": 0.053, "step": 260800 }, { "epoch": 3.8412541788780725, "grad_norm": 0.9288049936294556, "learning_rate": 5.156797161876461e-06, "loss": 0.0479, "step": 260825 }, { "epoch": 3.8416223619681595, "grad_norm": 0.9120572209358215, "learning_rate": 5.1551607899094755e-06, "loss": 0.0524, "step": 260850 }, { "epoch": 3.8419905450582466, "grad_norm": 1.4088650941848755, "learning_rate": 5.153524417942492e-06, "loss": 0.0588, "step": 260875 }, { "epoch": 3.8423587281483336, "grad_norm": 1.0027408599853516, "learning_rate": 5.151888045975507e-06, "loss": 0.0625, "step": 260900 }, { "epoch": 3.8427269112384206, "grad_norm": 1.4070595502853394, "learning_rate": 5.150251674008523e-06, "loss": 0.0522, "step": 260925 }, { "epoch": 3.8430950943285076, "grad_norm": 0.9803788661956787, "learning_rate": 5.1486153020415384e-06, "loss": 0.0584, "step": 260950 }, { "epoch": 3.8434632774185946, "grad_norm": 1.607533574104309, "learning_rate": 5.146978930074553e-06, "loss": 0.0547, "step": 260975 }, { "epoch": 3.8438314605086816, "grad_norm": 1.5357824563980103, "learning_rate": 5.145342558107569e-06, "loss": 0.0584, "step": 261000 }, { "epoch": 3.8441996435987686, "grad_norm": 1.1385902166366577, "learning_rate": 5.143706186140584e-06, "loss": 0.0518, "step": 261025 }, { "epoch": 3.8445678266888557, "grad_norm": 2.078275680541992, "learning_rate": 5.1420698141736005e-06, "loss": 0.0593, "step": 261050 }, { "epoch": 3.8449360097789427, "grad_norm": 1.5946872234344482, "learning_rate": 5.140433442206615e-06, "loss": 0.0513, "step": 261075 }, { "epoch": 3.8453041928690297, "grad_norm": 1.8258835077285767, "learning_rate": 5.13879707023963e-06, "loss": 0.0475, "step": 261100 }, { "epoch": 3.8456723759591167, "grad_norm": 0.797872006893158, "learning_rate": 5.1371606982726464e-06, "loss": 0.0485, "step": 261125 }, { "epoch": 3.8460405590492037, "grad_norm": 1.5468593835830688, "learning_rate": 5.135524326305662e-06, "loss": 0.0551, "step": 261150 }, { "epoch": 3.846408742139291, "grad_norm": 0.8762492537498474, "learning_rate": 5.133887954338676e-06, "loss": 0.0548, "step": 261175 }, { "epoch": 3.846776925229378, "grad_norm": 1.2396718263626099, "learning_rate": 5.132251582371692e-06, "loss": 0.0543, "step": 261200 }, { "epoch": 3.847145108319465, "grad_norm": 1.6593016386032104, "learning_rate": 5.130615210404708e-06, "loss": 0.0505, "step": 261225 }, { "epoch": 3.8475132914095522, "grad_norm": 1.3239442110061646, "learning_rate": 5.128978838437724e-06, "loss": 0.0545, "step": 261250 }, { "epoch": 3.8478814744996392, "grad_norm": 1.420785903930664, "learning_rate": 5.127342466470738e-06, "loss": 0.0545, "step": 261275 }, { "epoch": 3.8482496575897263, "grad_norm": 1.5579971075057983, "learning_rate": 5.125706094503754e-06, "loss": 0.047, "step": 261300 }, { "epoch": 3.8486178406798133, "grad_norm": 1.2454949617385864, "learning_rate": 5.12406972253677e-06, "loss": 0.0555, "step": 261325 }, { "epoch": 3.8489860237699003, "grad_norm": 1.3548647165298462, "learning_rate": 5.122433350569785e-06, "loss": 0.0551, "step": 261350 }, { "epoch": 3.8493542068599873, "grad_norm": 1.1681543588638306, "learning_rate": 5.120796978602801e-06, "loss": 0.0557, "step": 261375 }, { "epoch": 3.8497223899500743, "grad_norm": 1.1274433135986328, "learning_rate": 5.119160606635816e-06, "loss": 0.0525, "step": 261400 }, { "epoch": 3.8500905730401613, "grad_norm": 1.9322034120559692, "learning_rate": 5.117524234668832e-06, "loss": 0.0577, "step": 261425 }, { "epoch": 3.8504587561302484, "grad_norm": 1.1695538759231567, "learning_rate": 5.115887862701847e-06, "loss": 0.0544, "step": 261450 }, { "epoch": 3.8508269392203354, "grad_norm": 1.565733790397644, "learning_rate": 5.114251490734863e-06, "loss": 0.0507, "step": 261475 }, { "epoch": 3.8511951223104224, "grad_norm": 1.5190887451171875, "learning_rate": 5.112615118767878e-06, "loss": 0.0532, "step": 261500 }, { "epoch": 3.85156330540051, "grad_norm": 1.5840818881988525, "learning_rate": 5.110978746800893e-06, "loss": 0.0562, "step": 261525 }, { "epoch": 3.851931488490597, "grad_norm": 2.0657613277435303, "learning_rate": 5.109342374833909e-06, "loss": 0.0574, "step": 261550 }, { "epoch": 3.852299671580684, "grad_norm": 1.2047373056411743, "learning_rate": 5.1077060028669244e-06, "loss": 0.0537, "step": 261575 }, { "epoch": 3.852667854670771, "grad_norm": 1.5429242849349976, "learning_rate": 5.106069630899939e-06, "loss": 0.0486, "step": 261600 }, { "epoch": 3.853036037760858, "grad_norm": 1.3733842372894287, "learning_rate": 5.104433258932955e-06, "loss": 0.0576, "step": 261625 }, { "epoch": 3.853404220850945, "grad_norm": 0.98934406042099, "learning_rate": 5.10279688696597e-06, "loss": 0.0474, "step": 261650 }, { "epoch": 3.853772403941032, "grad_norm": 1.2279473543167114, "learning_rate": 5.1011605149989865e-06, "loss": 0.0564, "step": 261675 }, { "epoch": 3.854140587031119, "grad_norm": 1.2108162641525269, "learning_rate": 5.099524143032001e-06, "loss": 0.0472, "step": 261700 }, { "epoch": 3.854508770121206, "grad_norm": 1.415026307106018, "learning_rate": 5.097887771065016e-06, "loss": 0.0558, "step": 261725 }, { "epoch": 3.854876953211293, "grad_norm": 1.457869291305542, "learning_rate": 5.0962513990980324e-06, "loss": 0.0556, "step": 261750 }, { "epoch": 3.85524513630138, "grad_norm": 1.6702696084976196, "learning_rate": 5.094615027131048e-06, "loss": 0.0513, "step": 261775 }, { "epoch": 3.855613319391467, "grad_norm": 1.2122410535812378, "learning_rate": 5.092978655164062e-06, "loss": 0.0528, "step": 261800 }, { "epoch": 3.855981502481554, "grad_norm": 1.269836664199829, "learning_rate": 5.091342283197078e-06, "loss": 0.0487, "step": 261825 }, { "epoch": 3.856349685571641, "grad_norm": 1.2878093719482422, "learning_rate": 5.089705911230094e-06, "loss": 0.046, "step": 261850 }, { "epoch": 3.856717868661728, "grad_norm": 1.4793931245803833, "learning_rate": 5.08806953926311e-06, "loss": 0.0509, "step": 261875 }, { "epoch": 3.857086051751815, "grad_norm": 1.5541657209396362, "learning_rate": 5.086433167296125e-06, "loss": 0.057, "step": 261900 }, { "epoch": 3.857454234841902, "grad_norm": 1.2834160327911377, "learning_rate": 5.0847967953291396e-06, "loss": 0.0547, "step": 261925 }, { "epoch": 3.857822417931989, "grad_norm": 1.3856425285339355, "learning_rate": 5.083160423362156e-06, "loss": 0.0525, "step": 261950 }, { "epoch": 3.858190601022076, "grad_norm": 1.2109434604644775, "learning_rate": 5.081524051395171e-06, "loss": 0.0509, "step": 261975 }, { "epoch": 3.858558784112163, "grad_norm": 1.5357753038406372, "learning_rate": 5.079887679428187e-06, "loss": 0.0554, "step": 262000 }, { "epoch": 3.85892696720225, "grad_norm": 1.0928494930267334, "learning_rate": 5.078251307461202e-06, "loss": 0.0502, "step": 262025 }, { "epoch": 3.859295150292337, "grad_norm": 1.6509815454483032, "learning_rate": 5.076680390372896e-06, "loss": 0.0555, "step": 262050 }, { "epoch": 3.859663333382424, "grad_norm": 1.2722944021224976, "learning_rate": 5.0750440184059125e-06, "loss": 0.0559, "step": 262075 }, { "epoch": 3.860031516472511, "grad_norm": 1.725926399230957, "learning_rate": 5.073407646438928e-06, "loss": 0.0572, "step": 262100 }, { "epoch": 3.8603996995625987, "grad_norm": 1.0288758277893066, "learning_rate": 5.071771274471943e-06, "loss": 0.0526, "step": 262125 }, { "epoch": 3.8607678826526857, "grad_norm": 1.2656705379486084, "learning_rate": 5.070134902504958e-06, "loss": 0.0496, "step": 262150 }, { "epoch": 3.8611360657427727, "grad_norm": 1.0348396301269531, "learning_rate": 5.068498530537974e-06, "loss": 0.0495, "step": 262175 }, { "epoch": 3.8615042488328597, "grad_norm": 1.073555827140808, "learning_rate": 5.06686215857099e-06, "loss": 0.0506, "step": 262200 }, { "epoch": 3.8618724319229467, "grad_norm": 1.2936421632766724, "learning_rate": 5.065225786604004e-06, "loss": 0.0522, "step": 262225 }, { "epoch": 3.8622406150130337, "grad_norm": 1.5264995098114014, "learning_rate": 5.0635894146370205e-06, "loss": 0.0502, "step": 262250 }, { "epoch": 3.8626087981031207, "grad_norm": 1.0455107688903809, "learning_rate": 5.061953042670036e-06, "loss": 0.0585, "step": 262275 }, { "epoch": 3.8629769811932078, "grad_norm": 1.1604970693588257, "learning_rate": 5.060316670703052e-06, "loss": 0.0518, "step": 262300 }, { "epoch": 3.8633451642832948, "grad_norm": 1.308096170425415, "learning_rate": 5.058680298736067e-06, "loss": 0.0488, "step": 262325 }, { "epoch": 3.863713347373382, "grad_norm": 0.7083810567855835, "learning_rate": 5.057043926769082e-06, "loss": 0.0527, "step": 262350 }, { "epoch": 3.864081530463469, "grad_norm": 1.0349743366241455, "learning_rate": 5.055407554802098e-06, "loss": 0.0579, "step": 262375 }, { "epoch": 3.864449713553556, "grad_norm": 1.4834082126617432, "learning_rate": 5.053771182835113e-06, "loss": 0.0609, "step": 262400 }, { "epoch": 3.864817896643643, "grad_norm": 1.3020261526107788, "learning_rate": 5.052134810868129e-06, "loss": 0.0538, "step": 262425 }, { "epoch": 3.86518607973373, "grad_norm": 1.718116283416748, "learning_rate": 5.050498438901144e-06, "loss": 0.0554, "step": 262450 }, { "epoch": 3.865554262823817, "grad_norm": 1.4437057971954346, "learning_rate": 5.048862066934159e-06, "loss": 0.0605, "step": 262475 }, { "epoch": 3.8659224459139043, "grad_norm": 1.3646372556686401, "learning_rate": 5.047225694967175e-06, "loss": 0.0439, "step": 262500 }, { "epoch": 3.8662906290039913, "grad_norm": 1.2433618307113647, "learning_rate": 5.0455893230001905e-06, "loss": 0.051, "step": 262525 }, { "epoch": 3.8666588120940784, "grad_norm": 1.3432698249816895, "learning_rate": 5.043952951033205e-06, "loss": 0.0494, "step": 262550 }, { "epoch": 3.8670269951841654, "grad_norm": 1.52839994430542, "learning_rate": 5.042316579066221e-06, "loss": 0.057, "step": 262575 }, { "epoch": 3.8673951782742524, "grad_norm": 0.7989602088928223, "learning_rate": 5.0406802070992364e-06, "loss": 0.0539, "step": 262600 }, { "epoch": 3.8677633613643394, "grad_norm": 1.0238368511199951, "learning_rate": 5.039043835132253e-06, "loss": 0.0535, "step": 262625 }, { "epoch": 3.8681315444544264, "grad_norm": 1.4565939903259277, "learning_rate": 5.037407463165267e-06, "loss": 0.0566, "step": 262650 }, { "epoch": 3.8684997275445134, "grad_norm": 1.0134023427963257, "learning_rate": 5.035771091198282e-06, "loss": 0.0469, "step": 262675 }, { "epoch": 3.8688679106346004, "grad_norm": 0.8739012479782104, "learning_rate": 5.0341347192312985e-06, "loss": 0.0476, "step": 262700 }, { "epoch": 3.8692360937246875, "grad_norm": 1.611704707145691, "learning_rate": 5.032498347264314e-06, "loss": 0.0558, "step": 262725 }, { "epoch": 3.8696042768147745, "grad_norm": 1.6190578937530518, "learning_rate": 5.030861975297328e-06, "loss": 0.0551, "step": 262750 }, { "epoch": 3.8699724599048615, "grad_norm": 1.473643183708191, "learning_rate": 5.029225603330344e-06, "loss": 0.0559, "step": 262775 }, { "epoch": 3.8703406429949485, "grad_norm": 1.2018488645553589, "learning_rate": 5.02758923136336e-06, "loss": 0.052, "step": 262800 }, { "epoch": 3.8707088260850355, "grad_norm": 0.9915162324905396, "learning_rate": 5.025952859396376e-06, "loss": 0.051, "step": 262825 }, { "epoch": 3.8710770091751225, "grad_norm": 1.7892978191375732, "learning_rate": 5.024316487429391e-06, "loss": 0.0601, "step": 262850 }, { "epoch": 3.8714451922652096, "grad_norm": 1.0372949838638306, "learning_rate": 5.0226801154624065e-06, "loss": 0.0492, "step": 262875 }, { "epoch": 3.8718133753552966, "grad_norm": 1.0803349018096924, "learning_rate": 5.021043743495422e-06, "loss": 0.05, "step": 262900 }, { "epoch": 3.8721815584453836, "grad_norm": 1.1820727586746216, "learning_rate": 5.019407371528437e-06, "loss": 0.0535, "step": 262925 }, { "epoch": 3.8725497415354706, "grad_norm": 1.3209728002548218, "learning_rate": 5.017770999561453e-06, "loss": 0.0577, "step": 262950 }, { "epoch": 3.8729179246255576, "grad_norm": 1.413087010383606, "learning_rate": 5.016134627594468e-06, "loss": 0.0524, "step": 262975 }, { "epoch": 3.8732861077156446, "grad_norm": 1.6288573741912842, "learning_rate": 5.014498255627484e-06, "loss": 0.054, "step": 263000 }, { "epoch": 3.8736542908057316, "grad_norm": 1.2065969705581665, "learning_rate": 5.012861883660499e-06, "loss": 0.0538, "step": 263025 }, { "epoch": 3.8740224738958187, "grad_norm": 1.3036879301071167, "learning_rate": 5.011225511693515e-06, "loss": 0.0498, "step": 263050 }, { "epoch": 3.8743906569859057, "grad_norm": 0.9634690284729004, "learning_rate": 5.00958913972653e-06, "loss": 0.054, "step": 263075 }, { "epoch": 3.874758840075993, "grad_norm": 1.1450031995773315, "learning_rate": 5.007952767759545e-06, "loss": 0.0643, "step": 263100 }, { "epoch": 3.87512702316608, "grad_norm": 1.444369912147522, "learning_rate": 5.006316395792561e-06, "loss": 0.0589, "step": 263125 }, { "epoch": 3.875495206256167, "grad_norm": 1.1926497220993042, "learning_rate": 5.0046800238255765e-06, "loss": 0.0553, "step": 263150 }, { "epoch": 3.875863389346254, "grad_norm": 1.1588252782821655, "learning_rate": 5.003043651858591e-06, "loss": 0.0512, "step": 263175 }, { "epoch": 3.876231572436341, "grad_norm": 1.4155786037445068, "learning_rate": 5.001407279891607e-06, "loss": 0.0596, "step": 263200 }, { "epoch": 3.876599755526428, "grad_norm": 1.4117778539657593, "learning_rate": 4.9997709079246224e-06, "loss": 0.0534, "step": 263225 }, { "epoch": 3.8769679386165152, "grad_norm": 1.7850371599197388, "learning_rate": 4.998134535957638e-06, "loss": 0.0533, "step": 263250 }, { "epoch": 3.8773361217066022, "grad_norm": 1.54677152633667, "learning_rate": 4.996498163990653e-06, "loss": 0.0532, "step": 263275 }, { "epoch": 3.8777043047966893, "grad_norm": 1.1184360980987549, "learning_rate": 4.994861792023669e-06, "loss": 0.0563, "step": 263300 }, { "epoch": 3.8780724878867763, "grad_norm": 1.3255177736282349, "learning_rate": 4.9932254200566845e-06, "loss": 0.0518, "step": 263325 }, { "epoch": 3.8784406709768633, "grad_norm": 1.4861990213394165, "learning_rate": 4.9915890480897e-06, "loss": 0.0508, "step": 263350 }, { "epoch": 3.8788088540669503, "grad_norm": 1.1583616733551025, "learning_rate": 4.989952676122715e-06, "loss": 0.0527, "step": 263375 }, { "epoch": 3.8791770371570373, "grad_norm": 1.5475693941116333, "learning_rate": 4.988316304155731e-06, "loss": 0.0523, "step": 263400 }, { "epoch": 3.8795452202471243, "grad_norm": 1.5123289823532104, "learning_rate": 4.986679932188746e-06, "loss": 0.0545, "step": 263425 }, { "epoch": 3.879913403337212, "grad_norm": 1.5087281465530396, "learning_rate": 4.985043560221762e-06, "loss": 0.0503, "step": 263450 }, { "epoch": 3.880281586427299, "grad_norm": 1.328429937362671, "learning_rate": 4.983407188254777e-06, "loss": 0.0508, "step": 263475 }, { "epoch": 3.880649769517386, "grad_norm": 1.2900867462158203, "learning_rate": 4.9817708162877925e-06, "loss": 0.0566, "step": 263500 }, { "epoch": 3.881017952607473, "grad_norm": 1.924965500831604, "learning_rate": 4.980134444320808e-06, "loss": 0.0527, "step": 263525 }, { "epoch": 3.88138613569756, "grad_norm": 1.3238420486450195, "learning_rate": 4.978498072353823e-06, "loss": 0.0523, "step": 263550 }, { "epoch": 3.881754318787647, "grad_norm": 1.123356819152832, "learning_rate": 4.976861700386838e-06, "loss": 0.0523, "step": 263575 }, { "epoch": 3.882122501877734, "grad_norm": 1.0134892463684082, "learning_rate": 4.9752253284198545e-06, "loss": 0.0435, "step": 263600 }, { "epoch": 3.882490684967821, "grad_norm": 1.3513299226760864, "learning_rate": 4.97358895645287e-06, "loss": 0.0528, "step": 263625 }, { "epoch": 3.882858868057908, "grad_norm": 1.4515777826309204, "learning_rate": 4.971952584485885e-06, "loss": 0.0521, "step": 263650 }, { "epoch": 3.883227051147995, "grad_norm": 1.0249426364898682, "learning_rate": 4.9703162125189005e-06, "loss": 0.0549, "step": 263675 }, { "epoch": 3.883595234238082, "grad_norm": 1.5847457647323608, "learning_rate": 4.968679840551916e-06, "loss": 0.0542, "step": 263700 }, { "epoch": 3.883963417328169, "grad_norm": 1.0688776969909668, "learning_rate": 4.967043468584931e-06, "loss": 0.0513, "step": 263725 }, { "epoch": 3.884331600418256, "grad_norm": 1.574480414390564, "learning_rate": 4.965407096617947e-06, "loss": 0.0675, "step": 263750 }, { "epoch": 3.884699783508343, "grad_norm": 1.8858754634857178, "learning_rate": 4.963770724650962e-06, "loss": 0.0537, "step": 263775 }, { "epoch": 3.88506796659843, "grad_norm": 1.6930290460586548, "learning_rate": 4.962134352683978e-06, "loss": 0.0499, "step": 263800 }, { "epoch": 3.885436149688517, "grad_norm": 1.8441869020462036, "learning_rate": 4.960497980716993e-06, "loss": 0.0534, "step": 263825 }, { "epoch": 3.885804332778604, "grad_norm": 1.0883420705795288, "learning_rate": 4.9588616087500084e-06, "loss": 0.0518, "step": 263850 }, { "epoch": 3.886172515868691, "grad_norm": 0.9594785571098328, "learning_rate": 4.957225236783025e-06, "loss": 0.0551, "step": 263875 }, { "epoch": 3.886540698958778, "grad_norm": 1.289876937866211, "learning_rate": 4.955588864816039e-06, "loss": 0.0551, "step": 263900 }, { "epoch": 3.886908882048865, "grad_norm": 0.6620969772338867, "learning_rate": 4.953952492849055e-06, "loss": 0.0458, "step": 263925 }, { "epoch": 3.887277065138952, "grad_norm": 1.5180296897888184, "learning_rate": 4.9523161208820705e-06, "loss": 0.0603, "step": 263950 }, { "epoch": 3.887645248229039, "grad_norm": 1.2539643049240112, "learning_rate": 4.950679748915086e-06, "loss": 0.049, "step": 263975 }, { "epoch": 3.888013431319126, "grad_norm": 1.4199472665786743, "learning_rate": 4.949043376948101e-06, "loss": 0.0582, "step": 264000 }, { "epoch": 3.888381614409213, "grad_norm": 1.2492717504501343, "learning_rate": 4.947407004981116e-06, "loss": 0.0476, "step": 264025 }, { "epoch": 3.8887497974993006, "grad_norm": 1.4540746212005615, "learning_rate": 4.945770633014132e-06, "loss": 0.0511, "step": 264050 }, { "epoch": 3.8891179805893876, "grad_norm": 1.660137414932251, "learning_rate": 4.944134261047148e-06, "loss": 0.0598, "step": 264075 }, { "epoch": 3.8894861636794746, "grad_norm": 1.6908013820648193, "learning_rate": 4.942497889080163e-06, "loss": 0.0497, "step": 264100 }, { "epoch": 3.8898543467695617, "grad_norm": 1.3535715341567993, "learning_rate": 4.9408615171131785e-06, "loss": 0.0515, "step": 264125 }, { "epoch": 3.8902225298596487, "grad_norm": 1.08555006980896, "learning_rate": 4.939225145146194e-06, "loss": 0.0538, "step": 264150 }, { "epoch": 3.8905907129497357, "grad_norm": 1.4909448623657227, "learning_rate": 4.937588773179209e-06, "loss": 0.0535, "step": 264175 }, { "epoch": 3.8909588960398227, "grad_norm": 1.6206080913543701, "learning_rate": 4.935952401212224e-06, "loss": 0.0493, "step": 264200 }, { "epoch": 3.8913270791299097, "grad_norm": 1.244004726409912, "learning_rate": 4.9343160292452405e-06, "loss": 0.057, "step": 264225 }, { "epoch": 3.8916952622199967, "grad_norm": 1.2292659282684326, "learning_rate": 4.932679657278255e-06, "loss": 0.0506, "step": 264250 }, { "epoch": 3.8920634453100837, "grad_norm": 0.9641752243041992, "learning_rate": 4.931043285311271e-06, "loss": 0.055, "step": 264275 }, { "epoch": 3.8924316284001708, "grad_norm": 1.9379572868347168, "learning_rate": 4.9294069133442865e-06, "loss": 0.0576, "step": 264300 }, { "epoch": 3.8927998114902578, "grad_norm": 1.2098896503448486, "learning_rate": 4.927770541377302e-06, "loss": 0.0567, "step": 264325 }, { "epoch": 3.893167994580345, "grad_norm": 1.4082597494125366, "learning_rate": 4.926134169410318e-06, "loss": 0.0521, "step": 264350 }, { "epoch": 3.893536177670432, "grad_norm": 1.9065814018249512, "learning_rate": 4.924497797443332e-06, "loss": 0.063, "step": 264375 }, { "epoch": 3.8939043607605193, "grad_norm": 1.0186904668807983, "learning_rate": 4.9228614254763485e-06, "loss": 0.0491, "step": 264400 }, { "epoch": 3.8942725438506063, "grad_norm": 1.4114601612091064, "learning_rate": 4.921225053509364e-06, "loss": 0.0519, "step": 264425 }, { "epoch": 3.8946407269406933, "grad_norm": 1.9180808067321777, "learning_rate": 4.919588681542379e-06, "loss": 0.0509, "step": 264450 }, { "epoch": 3.8950089100307803, "grad_norm": 1.1067109107971191, "learning_rate": 4.9179523095753944e-06, "loss": 0.0531, "step": 264475 }, { "epoch": 3.8953770931208673, "grad_norm": 1.7426888942718506, "learning_rate": 4.916315937608411e-06, "loss": 0.0531, "step": 264500 }, { "epoch": 3.8957452762109543, "grad_norm": 1.4335108995437622, "learning_rate": 4.914679565641425e-06, "loss": 0.0556, "step": 264525 }, { "epoch": 3.8961134593010414, "grad_norm": 1.5560462474822998, "learning_rate": 4.913043193674441e-06, "loss": 0.0539, "step": 264550 }, { "epoch": 3.8964816423911284, "grad_norm": 1.5154043436050415, "learning_rate": 4.9114068217074565e-06, "loss": 0.0518, "step": 264575 }, { "epoch": 3.8968498254812154, "grad_norm": 1.4359437227249146, "learning_rate": 4.909770449740472e-06, "loss": 0.0522, "step": 264600 }, { "epoch": 3.8972180085713024, "grad_norm": 1.1610114574432373, "learning_rate": 4.908134077773487e-06, "loss": 0.0522, "step": 264625 }, { "epoch": 3.8975861916613894, "grad_norm": 1.2974045276641846, "learning_rate": 4.906497705806502e-06, "loss": 0.055, "step": 264650 }, { "epoch": 3.8979543747514764, "grad_norm": 0.9947437644004822, "learning_rate": 4.904861333839518e-06, "loss": 0.0524, "step": 264675 }, { "epoch": 3.8983225578415635, "grad_norm": 1.2135608196258545, "learning_rate": 4.903224961872534e-06, "loss": 0.0524, "step": 264700 }, { "epoch": 3.8986907409316505, "grad_norm": 1.270745873451233, "learning_rate": 4.901588589905549e-06, "loss": 0.052, "step": 264725 }, { "epoch": 3.8990589240217375, "grad_norm": 1.5922800302505493, "learning_rate": 4.8999522179385645e-06, "loss": 0.0521, "step": 264750 }, { "epoch": 3.8994271071118245, "grad_norm": 1.6764752864837646, "learning_rate": 4.89831584597158e-06, "loss": 0.0508, "step": 264775 }, { "epoch": 3.8997952902019115, "grad_norm": 1.5361818075180054, "learning_rate": 4.896679474004595e-06, "loss": 0.0517, "step": 264800 }, { "epoch": 3.9001634732919985, "grad_norm": 1.6716116666793823, "learning_rate": 4.895043102037611e-06, "loss": 0.0558, "step": 264825 }, { "epoch": 3.9005316563820855, "grad_norm": 0.8519721031188965, "learning_rate": 4.8934067300706265e-06, "loss": 0.0507, "step": 264850 }, { "epoch": 3.9008998394721726, "grad_norm": 1.0493004322052002, "learning_rate": 4.891770358103642e-06, "loss": 0.0485, "step": 264875 }, { "epoch": 3.9012680225622596, "grad_norm": 1.2470858097076416, "learning_rate": 4.890133986136657e-06, "loss": 0.0549, "step": 264900 }, { "epoch": 3.9016362056523466, "grad_norm": 1.2583844661712646, "learning_rate": 4.8884976141696725e-06, "loss": 0.0578, "step": 264925 }, { "epoch": 3.9020043887424336, "grad_norm": 1.489508032798767, "learning_rate": 4.886861242202688e-06, "loss": 0.0513, "step": 264950 }, { "epoch": 3.9023725718325206, "grad_norm": 1.6750906705856323, "learning_rate": 4.885224870235704e-06, "loss": 0.0547, "step": 264975 }, { "epoch": 3.902740754922608, "grad_norm": 0.6309359669685364, "learning_rate": 4.883588498268718e-06, "loss": 0.0565, "step": 265000 }, { "epoch": 3.903108938012695, "grad_norm": 0.8855849504470825, "learning_rate": 4.8819521263017345e-06, "loss": 0.0504, "step": 265025 }, { "epoch": 3.903477121102782, "grad_norm": 1.6380351781845093, "learning_rate": 4.88031575433475e-06, "loss": 0.0582, "step": 265050 }, { "epoch": 3.903845304192869, "grad_norm": 1.115140676498413, "learning_rate": 4.878679382367765e-06, "loss": 0.0549, "step": 265075 }, { "epoch": 3.904213487282956, "grad_norm": 0.9697654843330383, "learning_rate": 4.8770430104007804e-06, "loss": 0.0525, "step": 265100 }, { "epoch": 3.904581670373043, "grad_norm": 1.5834414958953857, "learning_rate": 4.875406638433796e-06, "loss": 0.06, "step": 265125 }, { "epoch": 3.90494985346313, "grad_norm": 1.3147239685058594, "learning_rate": 4.873770266466811e-06, "loss": 0.0602, "step": 265150 }, { "epoch": 3.905318036553217, "grad_norm": 1.2951992750167847, "learning_rate": 4.872133894499827e-06, "loss": 0.0535, "step": 265175 }, { "epoch": 3.905686219643304, "grad_norm": 1.5094690322875977, "learning_rate": 4.8704975225328425e-06, "loss": 0.0498, "step": 265200 }, { "epoch": 3.906054402733391, "grad_norm": 1.495611548423767, "learning_rate": 4.868861150565858e-06, "loss": 0.0577, "step": 265225 }, { "epoch": 3.9064225858234782, "grad_norm": 1.5796841382980347, "learning_rate": 4.867224778598873e-06, "loss": 0.0531, "step": 265250 }, { "epoch": 3.9067907689135652, "grad_norm": 1.49833083152771, "learning_rate": 4.865588406631888e-06, "loss": 0.0563, "step": 265275 }, { "epoch": 3.9071589520036523, "grad_norm": 1.3084923028945923, "learning_rate": 4.8639520346649046e-06, "loss": 0.0498, "step": 265300 }, { "epoch": 3.9075271350937393, "grad_norm": 1.0368095636367798, "learning_rate": 4.86231566269792e-06, "loss": 0.0488, "step": 265325 }, { "epoch": 3.9078953181838263, "grad_norm": 1.35903799533844, "learning_rate": 4.860679290730935e-06, "loss": 0.0602, "step": 265350 }, { "epoch": 3.9082635012739138, "grad_norm": 1.058145523071289, "learning_rate": 4.8590429187639505e-06, "loss": 0.0462, "step": 265375 }, { "epoch": 3.9086316843640008, "grad_norm": 1.3978030681610107, "learning_rate": 4.857406546796966e-06, "loss": 0.0501, "step": 265400 }, { "epoch": 3.908999867454088, "grad_norm": 1.2879433631896973, "learning_rate": 4.855770174829981e-06, "loss": 0.0524, "step": 265425 }, { "epoch": 3.909368050544175, "grad_norm": 1.6850285530090332, "learning_rate": 4.854133802862997e-06, "loss": 0.0505, "step": 265450 }, { "epoch": 3.909736233634262, "grad_norm": 1.6365737915039062, "learning_rate": 4.852497430896012e-06, "loss": 0.0553, "step": 265475 }, { "epoch": 3.910104416724349, "grad_norm": 1.3211332559585571, "learning_rate": 4.850861058929028e-06, "loss": 0.061, "step": 265500 }, { "epoch": 3.910472599814436, "grad_norm": 1.489636778831482, "learning_rate": 4.849224686962043e-06, "loss": 0.055, "step": 265525 }, { "epoch": 3.910840782904523, "grad_norm": 1.7707267999649048, "learning_rate": 4.8475883149950585e-06, "loss": 0.0501, "step": 265550 }, { "epoch": 3.91120896599461, "grad_norm": 1.2592132091522217, "learning_rate": 4.845951943028074e-06, "loss": 0.0469, "step": 265575 }, { "epoch": 3.911577149084697, "grad_norm": 1.3604294061660767, "learning_rate": 4.84431557106109e-06, "loss": 0.0499, "step": 265600 }, { "epoch": 3.911945332174784, "grad_norm": 1.6320096254348755, "learning_rate": 4.842679199094104e-06, "loss": 0.0579, "step": 265625 }, { "epoch": 3.912313515264871, "grad_norm": 1.7693284749984741, "learning_rate": 4.8410428271271205e-06, "loss": 0.0587, "step": 265650 }, { "epoch": 3.912681698354958, "grad_norm": 1.6865720748901367, "learning_rate": 4.839406455160136e-06, "loss": 0.0528, "step": 265675 }, { "epoch": 3.913049881445045, "grad_norm": 1.4077274799346924, "learning_rate": 4.837770083193151e-06, "loss": 0.0535, "step": 265700 }, { "epoch": 3.913418064535132, "grad_norm": 1.789424180984497, "learning_rate": 4.8361337112261664e-06, "loss": 0.0508, "step": 265725 }, { "epoch": 3.913786247625219, "grad_norm": 1.5183805227279663, "learning_rate": 4.834562794137861e-06, "loss": 0.0581, "step": 265750 }, { "epoch": 3.914154430715306, "grad_norm": 1.3152365684509277, "learning_rate": 4.832926422170877e-06, "loss": 0.0572, "step": 265775 }, { "epoch": 3.914522613805393, "grad_norm": 1.2793784141540527, "learning_rate": 4.831290050203893e-06, "loss": 0.0532, "step": 265800 }, { "epoch": 3.91489079689548, "grad_norm": 0.9389621615409851, "learning_rate": 4.829653678236908e-06, "loss": 0.0491, "step": 265825 }, { "epoch": 3.915258979985567, "grad_norm": 1.8882869482040405, "learning_rate": 4.828017306269923e-06, "loss": 0.056, "step": 265850 }, { "epoch": 3.915627163075654, "grad_norm": 1.0818278789520264, "learning_rate": 4.8263809343029385e-06, "loss": 0.0528, "step": 265875 }, { "epoch": 3.915995346165741, "grad_norm": 1.7939482927322388, "learning_rate": 4.824744562335954e-06, "loss": 0.0538, "step": 265900 }, { "epoch": 3.916363529255828, "grad_norm": 0.9074087142944336, "learning_rate": 4.82310819036897e-06, "loss": 0.0457, "step": 265925 }, { "epoch": 3.916731712345915, "grad_norm": 1.7172048091888428, "learning_rate": 4.8214718184019844e-06, "loss": 0.0566, "step": 265950 }, { "epoch": 3.9170998954360026, "grad_norm": 0.9454116225242615, "learning_rate": 4.819835446435001e-06, "loss": 0.0521, "step": 265975 }, { "epoch": 3.9174680785260896, "grad_norm": 1.4153268337249756, "learning_rate": 4.818199074468016e-06, "loss": 0.0572, "step": 266000 }, { "epoch": 3.9178362616161766, "grad_norm": 1.7052981853485107, "learning_rate": 4.816562702501031e-06, "loss": 0.0532, "step": 266025 }, { "epoch": 3.9182044447062636, "grad_norm": 1.160554051399231, "learning_rate": 4.8149263305340465e-06, "loss": 0.0527, "step": 266050 }, { "epoch": 3.9185726277963506, "grad_norm": 1.0327438116073608, "learning_rate": 4.813289958567063e-06, "loss": 0.0535, "step": 266075 }, { "epoch": 3.9189408108864376, "grad_norm": 1.128814935684204, "learning_rate": 4.811653586600077e-06, "loss": 0.0545, "step": 266100 }, { "epoch": 3.9193089939765247, "grad_norm": 1.0633081197738647, "learning_rate": 4.810017214633093e-06, "loss": 0.054, "step": 266125 }, { "epoch": 3.9196771770666117, "grad_norm": 1.7413182258605957, "learning_rate": 4.8083808426661086e-06, "loss": 0.0497, "step": 266150 }, { "epoch": 3.9200453601566987, "grad_norm": 1.3477416038513184, "learning_rate": 4.806744470699124e-06, "loss": 0.0495, "step": 266175 }, { "epoch": 3.9204135432467857, "grad_norm": 1.4275422096252441, "learning_rate": 4.805108098732139e-06, "loss": 0.0514, "step": 266200 }, { "epoch": 3.9207817263368727, "grad_norm": 1.2836915254592896, "learning_rate": 4.8034717267651545e-06, "loss": 0.0534, "step": 266225 }, { "epoch": 3.9211499094269597, "grad_norm": 1.096981167793274, "learning_rate": 4.801835354798171e-06, "loss": 0.0511, "step": 266250 }, { "epoch": 3.9215180925170467, "grad_norm": 1.4388827085494995, "learning_rate": 4.800198982831186e-06, "loss": 0.0544, "step": 266275 }, { "epoch": 3.9218862756071338, "grad_norm": 1.7638380527496338, "learning_rate": 4.798562610864201e-06, "loss": 0.0554, "step": 266300 }, { "epoch": 3.9222544586972212, "grad_norm": 1.970281958580017, "learning_rate": 4.7969262388972165e-06, "loss": 0.0504, "step": 266325 }, { "epoch": 3.9226226417873082, "grad_norm": 1.3509507179260254, "learning_rate": 4.795289866930232e-06, "loss": 0.0558, "step": 266350 }, { "epoch": 3.9229908248773953, "grad_norm": 1.155869722366333, "learning_rate": 4.793653494963247e-06, "loss": 0.056, "step": 266375 }, { "epoch": 3.9233590079674823, "grad_norm": 1.2204054594039917, "learning_rate": 4.792017122996263e-06, "loss": 0.0491, "step": 266400 }, { "epoch": 3.9237271910575693, "grad_norm": 1.7223869562149048, "learning_rate": 4.790380751029279e-06, "loss": 0.0561, "step": 266425 }, { "epoch": 3.9240953741476563, "grad_norm": 1.4869788885116577, "learning_rate": 4.788744379062294e-06, "loss": 0.0595, "step": 266450 }, { "epoch": 3.9244635572377433, "grad_norm": 1.2139626741409302, "learning_rate": 4.787108007095309e-06, "loss": 0.0548, "step": 266475 }, { "epoch": 3.9248317403278303, "grad_norm": 1.4303646087646484, "learning_rate": 4.7854716351283245e-06, "loss": 0.0503, "step": 266500 }, { "epoch": 3.9251999234179173, "grad_norm": 1.713517665863037, "learning_rate": 4.78383526316134e-06, "loss": 0.0549, "step": 266525 }, { "epoch": 3.9255681065080044, "grad_norm": 2.059519052505493, "learning_rate": 4.782198891194356e-06, "loss": 0.0585, "step": 266550 }, { "epoch": 3.9259362895980914, "grad_norm": 1.407524585723877, "learning_rate": 4.7805625192273704e-06, "loss": 0.0612, "step": 266575 }, { "epoch": 3.9263044726881784, "grad_norm": 1.0501809120178223, "learning_rate": 4.778926147260387e-06, "loss": 0.0493, "step": 266600 }, { "epoch": 3.9266726557782654, "grad_norm": 1.40870201587677, "learning_rate": 4.777289775293402e-06, "loss": 0.0537, "step": 266625 }, { "epoch": 3.9270408388683524, "grad_norm": 1.3485110998153687, "learning_rate": 4.775653403326417e-06, "loss": 0.0518, "step": 266650 }, { "epoch": 3.9274090219584394, "grad_norm": 1.403550624847412, "learning_rate": 4.7740170313594325e-06, "loss": 0.0598, "step": 266675 }, { "epoch": 3.9277772050485265, "grad_norm": 1.2981377840042114, "learning_rate": 4.772380659392448e-06, "loss": 0.0493, "step": 266700 }, { "epoch": 3.9281453881386135, "grad_norm": 1.3137564659118652, "learning_rate": 4.770744287425464e-06, "loss": 0.0533, "step": 266725 }, { "epoch": 3.9285135712287005, "grad_norm": 0.9053286910057068, "learning_rate": 4.769107915458479e-06, "loss": 0.0497, "step": 266750 }, { "epoch": 3.9288817543187875, "grad_norm": 1.2881439924240112, "learning_rate": 4.7674715434914946e-06, "loss": 0.0564, "step": 266775 }, { "epoch": 3.9292499374088745, "grad_norm": 1.4748735427856445, "learning_rate": 4.76583517152451e-06, "loss": 0.0599, "step": 266800 }, { "epoch": 3.9296181204989615, "grad_norm": 1.5210356712341309, "learning_rate": 4.764198799557526e-06, "loss": 0.0589, "step": 266825 }, { "epoch": 3.9299863035890485, "grad_norm": 1.4726735353469849, "learning_rate": 4.7625624275905405e-06, "loss": 0.0527, "step": 266850 }, { "epoch": 3.9303544866791356, "grad_norm": 1.6104947328567505, "learning_rate": 4.760926055623557e-06, "loss": 0.0535, "step": 266875 }, { "epoch": 3.9307226697692226, "grad_norm": 1.428181767463684, "learning_rate": 4.759289683656572e-06, "loss": 0.0501, "step": 266900 }, { "epoch": 3.93109085285931, "grad_norm": 1.2220829725265503, "learning_rate": 4.757653311689587e-06, "loss": 0.0562, "step": 266925 }, { "epoch": 3.931459035949397, "grad_norm": 1.426183819770813, "learning_rate": 4.7560169397226025e-06, "loss": 0.0576, "step": 266950 }, { "epoch": 3.931827219039484, "grad_norm": 1.261764645576477, "learning_rate": 4.754380567755618e-06, "loss": 0.0523, "step": 266975 }, { "epoch": 3.932195402129571, "grad_norm": 1.4767704010009766, "learning_rate": 4.752744195788633e-06, "loss": 0.0509, "step": 267000 }, { "epoch": 3.932563585219658, "grad_norm": 1.2506531476974487, "learning_rate": 4.751107823821649e-06, "loss": 0.0472, "step": 267025 }, { "epoch": 3.932931768309745, "grad_norm": 1.3366501331329346, "learning_rate": 4.749471451854664e-06, "loss": 0.053, "step": 267050 }, { "epoch": 3.933299951399832, "grad_norm": 1.314798355102539, "learning_rate": 4.74783507988768e-06, "loss": 0.0533, "step": 267075 }, { "epoch": 3.933668134489919, "grad_norm": 1.0986489057540894, "learning_rate": 4.746198707920695e-06, "loss": 0.0539, "step": 267100 }, { "epoch": 3.934036317580006, "grad_norm": 1.0679348707199097, "learning_rate": 4.7445623359537105e-06, "loss": 0.0498, "step": 267125 }, { "epoch": 3.934404500670093, "grad_norm": 1.3667182922363281, "learning_rate": 4.742925963986726e-06, "loss": 0.0575, "step": 267150 }, { "epoch": 3.93477268376018, "grad_norm": 0.7756701707839966, "learning_rate": 4.741289592019742e-06, "loss": 0.0502, "step": 267175 }, { "epoch": 3.935140866850267, "grad_norm": 1.2835673093795776, "learning_rate": 4.7396532200527564e-06, "loss": 0.0503, "step": 267200 }, { "epoch": 3.935509049940354, "grad_norm": 1.3682050704956055, "learning_rate": 4.738016848085773e-06, "loss": 0.0517, "step": 267225 }, { "epoch": 3.9358772330304412, "grad_norm": 1.4204814434051514, "learning_rate": 4.736380476118788e-06, "loss": 0.0551, "step": 267250 }, { "epoch": 3.9362454161205283, "grad_norm": 1.366399884223938, "learning_rate": 4.734744104151803e-06, "loss": 0.0561, "step": 267275 }, { "epoch": 3.9366135992106157, "grad_norm": 1.3777496814727783, "learning_rate": 4.733107732184819e-06, "loss": 0.059, "step": 267300 }, { "epoch": 3.9369817823007027, "grad_norm": 0.5394716858863831, "learning_rate": 4.731471360217834e-06, "loss": 0.0454, "step": 267325 }, { "epoch": 3.9373499653907897, "grad_norm": 1.2963823080062866, "learning_rate": 4.72983498825085e-06, "loss": 0.0542, "step": 267350 }, { "epoch": 3.9377181484808768, "grad_norm": 1.249991536140442, "learning_rate": 4.728198616283865e-06, "loss": 0.0518, "step": 267375 }, { "epoch": 3.9380863315709638, "grad_norm": 1.4893606901168823, "learning_rate": 4.7265622443168806e-06, "loss": 0.0628, "step": 267400 }, { "epoch": 3.938454514661051, "grad_norm": 1.330801248550415, "learning_rate": 4.724925872349896e-06, "loss": 0.0523, "step": 267425 }, { "epoch": 3.938822697751138, "grad_norm": 2.1741268634796143, "learning_rate": 4.723289500382911e-06, "loss": 0.0617, "step": 267450 }, { "epoch": 3.939190880841225, "grad_norm": 0.8185716867446899, "learning_rate": 4.7216531284159265e-06, "loss": 0.0502, "step": 267475 }, { "epoch": 3.939559063931312, "grad_norm": 1.2055408954620361, "learning_rate": 4.720016756448943e-06, "loss": 0.0575, "step": 267500 }, { "epoch": 3.939927247021399, "grad_norm": 1.2894607782363892, "learning_rate": 4.718380384481958e-06, "loss": 0.0557, "step": 267525 }, { "epoch": 3.940295430111486, "grad_norm": 1.1453173160552979, "learning_rate": 4.716744012514973e-06, "loss": 0.0499, "step": 267550 }, { "epoch": 3.940663613201573, "grad_norm": 1.7617239952087402, "learning_rate": 4.7151076405479885e-06, "loss": 0.0576, "step": 267575 }, { "epoch": 3.94103179629166, "grad_norm": 0.9825162887573242, "learning_rate": 4.713471268581004e-06, "loss": 0.0532, "step": 267600 }, { "epoch": 3.941399979381747, "grad_norm": 1.490334153175354, "learning_rate": 4.711834896614019e-06, "loss": 0.0552, "step": 267625 }, { "epoch": 3.941768162471834, "grad_norm": 1.5881626605987549, "learning_rate": 4.710198524647035e-06, "loss": 0.0549, "step": 267650 }, { "epoch": 3.942136345561921, "grad_norm": 0.9396759867668152, "learning_rate": 4.70856215268005e-06, "loss": 0.0602, "step": 267675 }, { "epoch": 3.942504528652008, "grad_norm": 1.4920693635940552, "learning_rate": 4.706925780713066e-06, "loss": 0.0531, "step": 267700 }, { "epoch": 3.942872711742095, "grad_norm": 1.5342576503753662, "learning_rate": 4.705289408746081e-06, "loss": 0.0526, "step": 267725 }, { "epoch": 3.943240894832182, "grad_norm": 1.1870781183242798, "learning_rate": 4.7036530367790965e-06, "loss": 0.0532, "step": 267750 }, { "epoch": 3.943609077922269, "grad_norm": 1.4740474224090576, "learning_rate": 4.702016664812113e-06, "loss": 0.0501, "step": 267775 }, { "epoch": 3.943977261012356, "grad_norm": 1.4664939641952515, "learning_rate": 4.700380292845127e-06, "loss": 0.0484, "step": 267800 }, { "epoch": 3.944345444102443, "grad_norm": 1.5314321517944336, "learning_rate": 4.698743920878143e-06, "loss": 0.0599, "step": 267825 }, { "epoch": 3.94471362719253, "grad_norm": 1.668631672859192, "learning_rate": 4.697107548911159e-06, "loss": 0.0535, "step": 267850 }, { "epoch": 3.945081810282617, "grad_norm": 1.4098851680755615, "learning_rate": 4.695471176944174e-06, "loss": 0.0612, "step": 267875 }, { "epoch": 3.9454499933727045, "grad_norm": 1.2308943271636963, "learning_rate": 4.693834804977189e-06, "loss": 0.0485, "step": 267900 }, { "epoch": 3.9458181764627915, "grad_norm": 1.3214237689971924, "learning_rate": 4.692198433010205e-06, "loss": 0.0514, "step": 267925 }, { "epoch": 3.9461863595528786, "grad_norm": 0.8661231398582458, "learning_rate": 4.69056206104322e-06, "loss": 0.048, "step": 267950 }, { "epoch": 3.9465545426429656, "grad_norm": 1.5837327241897583, "learning_rate": 4.688925689076236e-06, "loss": 0.0534, "step": 267975 }, { "epoch": 3.9469227257330526, "grad_norm": 1.7167258262634277, "learning_rate": 4.687289317109251e-06, "loss": 0.0558, "step": 268000 }, { "epoch": 3.9472909088231396, "grad_norm": 2.0640058517456055, "learning_rate": 4.6856529451422666e-06, "loss": 0.0567, "step": 268025 }, { "epoch": 3.9476590919132266, "grad_norm": 0.9550992846488953, "learning_rate": 4.684016573175282e-06, "loss": 0.0557, "step": 268050 }, { "epoch": 3.9480272750033136, "grad_norm": 1.574422001838684, "learning_rate": 4.682380201208297e-06, "loss": 0.0576, "step": 268075 }, { "epoch": 3.9483954580934006, "grad_norm": 1.7121474742889404, "learning_rate": 4.6807438292413125e-06, "loss": 0.0572, "step": 268100 }, { "epoch": 3.9487636411834877, "grad_norm": 1.6270625591278076, "learning_rate": 4.679107457274329e-06, "loss": 0.0515, "step": 268125 }, { "epoch": 3.9491318242735747, "grad_norm": 1.489768147468567, "learning_rate": 4.677471085307343e-06, "loss": 0.0488, "step": 268150 }, { "epoch": 3.9495000073636617, "grad_norm": 1.512560486793518, "learning_rate": 4.675834713340359e-06, "loss": 0.048, "step": 268175 }, { "epoch": 3.9498681904537487, "grad_norm": 0.9951837062835693, "learning_rate": 4.6741983413733745e-06, "loss": 0.0512, "step": 268200 }, { "epoch": 3.9502363735438357, "grad_norm": 1.2083882093429565, "learning_rate": 4.67256196940639e-06, "loss": 0.0599, "step": 268225 }, { "epoch": 3.950604556633923, "grad_norm": 1.7658652067184448, "learning_rate": 4.670925597439406e-06, "loss": 0.0566, "step": 268250 }, { "epoch": 3.95097273972401, "grad_norm": 1.414762258529663, "learning_rate": 4.669289225472421e-06, "loss": 0.046, "step": 268275 }, { "epoch": 3.951340922814097, "grad_norm": 1.7980754375457764, "learning_rate": 4.667652853505437e-06, "loss": 0.053, "step": 268300 }, { "epoch": 3.9517091059041842, "grad_norm": 1.034510850906372, "learning_rate": 4.666016481538452e-06, "loss": 0.0503, "step": 268325 }, { "epoch": 3.9520772889942712, "grad_norm": 1.8255705833435059, "learning_rate": 4.664380109571467e-06, "loss": 0.0476, "step": 268350 }, { "epoch": 3.9524454720843583, "grad_norm": 2.084826946258545, "learning_rate": 4.6627437376044825e-06, "loss": 0.0561, "step": 268375 }, { "epoch": 3.9528136551744453, "grad_norm": 1.2996611595153809, "learning_rate": 4.661107365637499e-06, "loss": 0.0519, "step": 268400 }, { "epoch": 3.9531818382645323, "grad_norm": 1.4135969877243042, "learning_rate": 4.659470993670513e-06, "loss": 0.0502, "step": 268425 }, { "epoch": 3.9535500213546193, "grad_norm": Infinity, "learning_rate": 4.657900076582209e-06, "loss": 0.0513, "step": 268450 }, { "epoch": 3.9539182044447063, "grad_norm": 1.0560312271118164, "learning_rate": 4.656263704615224e-06, "loss": 0.0484, "step": 268475 }, { "epoch": 3.9542863875347933, "grad_norm": 1.3681700229644775, "learning_rate": 4.654627332648239e-06, "loss": 0.0521, "step": 268500 }, { "epoch": 3.9546545706248803, "grad_norm": 1.2107115983963013, "learning_rate": 4.652990960681255e-06, "loss": 0.053, "step": 268525 }, { "epoch": 3.9550227537149674, "grad_norm": 1.4557937383651733, "learning_rate": 4.65135458871427e-06, "loss": 0.0573, "step": 268550 }, { "epoch": 3.9553909368050544, "grad_norm": 0.8258102536201477, "learning_rate": 4.649718216747285e-06, "loss": 0.0542, "step": 268575 }, { "epoch": 3.9557591198951414, "grad_norm": 1.3532434701919556, "learning_rate": 4.648081844780301e-06, "loss": 0.0568, "step": 268600 }, { "epoch": 3.9561273029852284, "grad_norm": 1.3047311305999756, "learning_rate": 4.646445472813317e-06, "loss": 0.0566, "step": 268625 }, { "epoch": 3.9564954860753154, "grad_norm": 1.172406792640686, "learning_rate": 4.644809100846332e-06, "loss": 0.0603, "step": 268650 }, { "epoch": 3.9568636691654024, "grad_norm": 1.6092957258224487, "learning_rate": 4.643172728879347e-06, "loss": 0.0595, "step": 268675 }, { "epoch": 3.9572318522554895, "grad_norm": 0.9011870622634888, "learning_rate": 4.641536356912363e-06, "loss": 0.0537, "step": 268700 }, { "epoch": 3.9576000353455765, "grad_norm": 1.2932192087173462, "learning_rate": 4.639899984945379e-06, "loss": 0.05, "step": 268725 }, { "epoch": 3.9579682184356635, "grad_norm": 1.4305644035339355, "learning_rate": 4.638263612978394e-06, "loss": 0.0585, "step": 268750 }, { "epoch": 3.9583364015257505, "grad_norm": 1.1784656047821045, "learning_rate": 4.636627241011409e-06, "loss": 0.0566, "step": 268775 }, { "epoch": 3.9587045846158375, "grad_norm": 1.1213839054107666, "learning_rate": 4.634990869044425e-06, "loss": 0.0515, "step": 268800 }, { "epoch": 3.9590727677059245, "grad_norm": 1.3900045156478882, "learning_rate": 4.63335449707744e-06, "loss": 0.0472, "step": 268825 }, { "epoch": 3.959440950796012, "grad_norm": 1.5762546062469482, "learning_rate": 4.631718125110455e-06, "loss": 0.0637, "step": 268850 }, { "epoch": 3.959809133886099, "grad_norm": 1.1227755546569824, "learning_rate": 4.630081753143471e-06, "loss": 0.0502, "step": 268875 }, { "epoch": 3.960177316976186, "grad_norm": 1.465013861656189, "learning_rate": 4.628445381176486e-06, "loss": 0.0566, "step": 268900 }, { "epoch": 3.960545500066273, "grad_norm": 1.5864336490631104, "learning_rate": 4.626809009209502e-06, "loss": 0.0597, "step": 268925 }, { "epoch": 3.96091368315636, "grad_norm": 1.635076642036438, "learning_rate": 4.625172637242517e-06, "loss": 0.049, "step": 268950 }, { "epoch": 3.961281866246447, "grad_norm": 1.3956019878387451, "learning_rate": 4.623536265275533e-06, "loss": 0.0589, "step": 268975 }, { "epoch": 3.961650049336534, "grad_norm": 1.9567335844039917, "learning_rate": 4.621899893308548e-06, "loss": 0.0588, "step": 269000 }, { "epoch": 3.962018232426621, "grad_norm": 1.4911231994628906, "learning_rate": 4.620263521341563e-06, "loss": 0.0563, "step": 269025 }, { "epoch": 3.962386415516708, "grad_norm": 1.437920093536377, "learning_rate": 4.6186271493745785e-06, "loss": 0.0533, "step": 269050 }, { "epoch": 3.962754598606795, "grad_norm": 1.3426780700683594, "learning_rate": 4.616990777407595e-06, "loss": 0.0586, "step": 269075 }, { "epoch": 3.963122781696882, "grad_norm": 1.3258249759674072, "learning_rate": 4.61535440544061e-06, "loss": 0.0557, "step": 269100 }, { "epoch": 3.963490964786969, "grad_norm": 1.2463361024856567, "learning_rate": 4.613718033473625e-06, "loss": 0.0537, "step": 269125 }, { "epoch": 3.963859147877056, "grad_norm": 1.011029601097107, "learning_rate": 4.6120816615066415e-06, "loss": 0.0528, "step": 269150 }, { "epoch": 3.964227330967143, "grad_norm": 1.9055616855621338, "learning_rate": 4.610445289539656e-06, "loss": 0.0605, "step": 269175 }, { "epoch": 3.9645955140572307, "grad_norm": 1.4764559268951416, "learning_rate": 4.608808917572672e-06, "loss": 0.0507, "step": 269200 }, { "epoch": 3.9649636971473177, "grad_norm": 1.0713492631912231, "learning_rate": 4.607172545605687e-06, "loss": 0.0538, "step": 269225 }, { "epoch": 3.9653318802374047, "grad_norm": 1.5301762819290161, "learning_rate": 4.605536173638703e-06, "loss": 0.0552, "step": 269250 }, { "epoch": 3.9657000633274917, "grad_norm": 1.4716534614562988, "learning_rate": 4.603899801671718e-06, "loss": 0.0492, "step": 269275 }, { "epoch": 3.9660682464175787, "grad_norm": 1.1124612092971802, "learning_rate": 4.602263429704733e-06, "loss": 0.052, "step": 269300 }, { "epoch": 3.9664364295076657, "grad_norm": 1.2334586381912231, "learning_rate": 4.600627057737749e-06, "loss": 0.0635, "step": 269325 }, { "epoch": 3.9668046125977527, "grad_norm": 1.4479761123657227, "learning_rate": 4.598990685770765e-06, "loss": 0.0533, "step": 269350 }, { "epoch": 3.9671727956878398, "grad_norm": 1.0653736591339111, "learning_rate": 4.597354313803779e-06, "loss": 0.0498, "step": 269375 }, { "epoch": 3.9675409787779268, "grad_norm": 1.6402326822280884, "learning_rate": 4.595717941836795e-06, "loss": 0.0543, "step": 269400 }, { "epoch": 3.967909161868014, "grad_norm": 1.0338371992111206, "learning_rate": 4.594081569869811e-06, "loss": 0.0569, "step": 269425 }, { "epoch": 3.968277344958101, "grad_norm": 1.7422449588775635, "learning_rate": 4.592445197902826e-06, "loss": 0.0586, "step": 269450 }, { "epoch": 3.968645528048188, "grad_norm": 1.5586791038513184, "learning_rate": 4.590808825935841e-06, "loss": 0.0544, "step": 269475 }, { "epoch": 3.969013711138275, "grad_norm": 1.1228259801864624, "learning_rate": 4.589172453968857e-06, "loss": 0.055, "step": 269500 }, { "epoch": 3.969381894228362, "grad_norm": 1.6707308292388916, "learning_rate": 4.587536082001872e-06, "loss": 0.0485, "step": 269525 }, { "epoch": 3.969750077318449, "grad_norm": 1.7484828233718872, "learning_rate": 4.585899710034888e-06, "loss": 0.0548, "step": 269550 }, { "epoch": 3.970118260408536, "grad_norm": 1.5427132844924927, "learning_rate": 4.584263338067903e-06, "loss": 0.0548, "step": 269575 }, { "epoch": 3.970486443498623, "grad_norm": 1.3491164445877075, "learning_rate": 4.582626966100919e-06, "loss": 0.0542, "step": 269600 }, { "epoch": 3.97085462658871, "grad_norm": 0.9970679879188538, "learning_rate": 4.580990594133934e-06, "loss": 0.0508, "step": 269625 }, { "epoch": 3.971222809678797, "grad_norm": 1.3284558057785034, "learning_rate": 4.579354222166949e-06, "loss": 0.055, "step": 269650 }, { "epoch": 3.971590992768884, "grad_norm": 1.1876049041748047, "learning_rate": 4.577717850199965e-06, "loss": 0.054, "step": 269675 }, { "epoch": 3.971959175858971, "grad_norm": 1.3055858612060547, "learning_rate": 4.576081478232981e-06, "loss": 0.049, "step": 269700 }, { "epoch": 3.972327358949058, "grad_norm": 1.3097436428070068, "learning_rate": 4.574445106265996e-06, "loss": 0.0503, "step": 269725 }, { "epoch": 3.972695542039145, "grad_norm": 0.7987446188926697, "learning_rate": 4.572808734299011e-06, "loss": 0.0566, "step": 269750 }, { "epoch": 3.973063725129232, "grad_norm": 0.9984710216522217, "learning_rate": 4.571172362332027e-06, "loss": 0.0494, "step": 269775 }, { "epoch": 3.9734319082193195, "grad_norm": 1.938915491104126, "learning_rate": 4.569535990365042e-06, "loss": 0.0545, "step": 269800 }, { "epoch": 3.9738000913094065, "grad_norm": 1.6826567649841309, "learning_rate": 4.567899618398058e-06, "loss": 0.0516, "step": 269825 }, { "epoch": 3.9741682743994935, "grad_norm": 1.4236141443252563, "learning_rate": 4.566263246431073e-06, "loss": 0.0538, "step": 269850 }, { "epoch": 3.9745364574895805, "grad_norm": 1.7639985084533691, "learning_rate": 4.564626874464089e-06, "loss": 0.0567, "step": 269875 }, { "epoch": 3.9749046405796675, "grad_norm": 1.604337215423584, "learning_rate": 4.562990502497104e-06, "loss": 0.0555, "step": 269900 }, { "epoch": 3.9752728236697545, "grad_norm": 1.2788645029067993, "learning_rate": 4.561354130530119e-06, "loss": 0.0568, "step": 269925 }, { "epoch": 3.9756410067598416, "grad_norm": 1.5617719888687134, "learning_rate": 4.559717758563135e-06, "loss": 0.0547, "step": 269950 }, { "epoch": 3.9760091898499286, "grad_norm": 1.111019492149353, "learning_rate": 4.558081386596151e-06, "loss": 0.0519, "step": 269975 }, { "epoch": 3.9763773729400156, "grad_norm": 1.796332597732544, "learning_rate": 4.556445014629165e-06, "loss": 0.0556, "step": 270000 }, { "epoch": 3.9767455560301026, "grad_norm": 1.623569130897522, "learning_rate": 4.554808642662181e-06, "loss": 0.0517, "step": 270025 }, { "epoch": 3.9771137391201896, "grad_norm": 0.7278119921684265, "learning_rate": 4.553172270695197e-06, "loss": 0.0524, "step": 270050 }, { "epoch": 3.9774819222102766, "grad_norm": 1.6087461709976196, "learning_rate": 4.551535898728212e-06, "loss": 0.0581, "step": 270075 }, { "epoch": 3.9778501053003636, "grad_norm": 1.0999680757522583, "learning_rate": 4.549899526761227e-06, "loss": 0.0504, "step": 270100 }, { "epoch": 3.9782182883904507, "grad_norm": 1.5412969589233398, "learning_rate": 4.5482631547942426e-06, "loss": 0.0545, "step": 270125 }, { "epoch": 3.9785864714805377, "grad_norm": 1.3886959552764893, "learning_rate": 4.546626782827259e-06, "loss": 0.0625, "step": 270150 }, { "epoch": 3.978954654570625, "grad_norm": 1.2410922050476074, "learning_rate": 4.544990410860274e-06, "loss": 0.0581, "step": 270175 }, { "epoch": 3.979322837660712, "grad_norm": 1.2803741693496704, "learning_rate": 4.543354038893289e-06, "loss": 0.0449, "step": 270200 }, { "epoch": 3.979691020750799, "grad_norm": 0.9015976786613464, "learning_rate": 4.541717666926305e-06, "loss": 0.0501, "step": 270225 }, { "epoch": 3.980059203840886, "grad_norm": 1.4364522695541382, "learning_rate": 4.540081294959321e-06, "loss": 0.0576, "step": 270250 }, { "epoch": 3.980427386930973, "grad_norm": 1.476516842842102, "learning_rate": 4.538444922992335e-06, "loss": 0.0577, "step": 270275 }, { "epoch": 3.98079557002106, "grad_norm": 1.1962724924087524, "learning_rate": 4.536808551025351e-06, "loss": 0.0454, "step": 270300 }, { "epoch": 3.9811637531111472, "grad_norm": 1.4422894716262817, "learning_rate": 4.535172179058367e-06, "loss": 0.0553, "step": 270325 }, { "epoch": 3.9815319362012342, "grad_norm": 1.0398858785629272, "learning_rate": 4.533535807091382e-06, "loss": 0.0487, "step": 270350 }, { "epoch": 3.9819001192913213, "grad_norm": 0.9367135763168335, "learning_rate": 4.531899435124397e-06, "loss": 0.0509, "step": 270375 }, { "epoch": 3.9822683023814083, "grad_norm": 1.9063533544540405, "learning_rate": 4.530263063157413e-06, "loss": 0.0587, "step": 270400 }, { "epoch": 3.9826364854714953, "grad_norm": 1.2517178058624268, "learning_rate": 4.528626691190428e-06, "loss": 0.0542, "step": 270425 }, { "epoch": 3.9830046685615823, "grad_norm": 1.330877423286438, "learning_rate": 4.526990319223444e-06, "loss": 0.0501, "step": 270450 }, { "epoch": 3.9833728516516693, "grad_norm": 1.5600701570510864, "learning_rate": 4.5253539472564585e-06, "loss": 0.0572, "step": 270475 }, { "epoch": 3.9837410347417563, "grad_norm": 1.7591938972473145, "learning_rate": 4.523717575289475e-06, "loss": 0.0562, "step": 270500 }, { "epoch": 3.9841092178318434, "grad_norm": 1.2087832689285278, "learning_rate": 4.52208120332249e-06, "loss": 0.0587, "step": 270525 }, { "epoch": 3.9844774009219304, "grad_norm": 1.0783907175064087, "learning_rate": 4.520444831355505e-06, "loss": 0.0538, "step": 270550 }, { "epoch": 3.9848455840120174, "grad_norm": 1.8448988199234009, "learning_rate": 4.518808459388521e-06, "loss": 0.0566, "step": 270575 }, { "epoch": 3.9852137671021044, "grad_norm": 1.0080883502960205, "learning_rate": 4.517172087421537e-06, "loss": 0.0567, "step": 270600 }, { "epoch": 3.9855819501921914, "grad_norm": 1.3988831043243408, "learning_rate": 4.515535715454552e-06, "loss": 0.0508, "step": 270625 }, { "epoch": 3.9859501332822784, "grad_norm": 1.3583345413208008, "learning_rate": 4.513899343487567e-06, "loss": 0.0602, "step": 270650 }, { "epoch": 3.9863183163723654, "grad_norm": 1.6458958387374878, "learning_rate": 4.512262971520583e-06, "loss": 0.0534, "step": 270675 }, { "epoch": 3.9866864994624525, "grad_norm": 1.4257688522338867, "learning_rate": 4.510626599553598e-06, "loss": 0.0498, "step": 270700 }, { "epoch": 3.9870546825525395, "grad_norm": 0.8840227127075195, "learning_rate": 4.508990227586614e-06, "loss": 0.0555, "step": 270725 }, { "epoch": 3.9874228656426265, "grad_norm": 1.1897773742675781, "learning_rate": 4.5073538556196286e-06, "loss": 0.0562, "step": 270750 }, { "epoch": 3.987791048732714, "grad_norm": 1.2229746580123901, "learning_rate": 4.505717483652645e-06, "loss": 0.0553, "step": 270775 }, { "epoch": 3.988159231822801, "grad_norm": 0.9923685193061829, "learning_rate": 4.50408111168566e-06, "loss": 0.0513, "step": 270800 }, { "epoch": 3.988527414912888, "grad_norm": 0.8561668992042542, "learning_rate": 4.502444739718675e-06, "loss": 0.0501, "step": 270825 }, { "epoch": 3.988895598002975, "grad_norm": 1.9197190999984741, "learning_rate": 4.500808367751691e-06, "loss": 0.057, "step": 270850 }, { "epoch": 3.989263781093062, "grad_norm": 1.3146735429763794, "learning_rate": 4.499171995784706e-06, "loss": 0.0609, "step": 270875 }, { "epoch": 3.989631964183149, "grad_norm": 1.598151683807373, "learning_rate": 4.497535623817721e-06, "loss": 0.0557, "step": 270900 }, { "epoch": 3.990000147273236, "grad_norm": 1.4592117071151733, "learning_rate": 4.495899251850737e-06, "loss": 0.0607, "step": 270925 }, { "epoch": 3.990368330363323, "grad_norm": 1.409018874168396, "learning_rate": 4.494262879883753e-06, "loss": 0.0529, "step": 270950 }, { "epoch": 3.99073651345341, "grad_norm": 1.162003517150879, "learning_rate": 4.492626507916768e-06, "loss": 0.0535, "step": 270975 }, { "epoch": 3.991104696543497, "grad_norm": 1.278775691986084, "learning_rate": 4.490990135949783e-06, "loss": 0.0572, "step": 271000 }, { "epoch": 3.991472879633584, "grad_norm": 0.8224690556526184, "learning_rate": 4.489353763982799e-06, "loss": 0.0527, "step": 271025 }, { "epoch": 3.991841062723671, "grad_norm": 1.4066331386566162, "learning_rate": 4.487717392015814e-06, "loss": 0.0553, "step": 271050 }, { "epoch": 3.992209245813758, "grad_norm": 1.0099693536758423, "learning_rate": 4.48608102004883e-06, "loss": 0.0586, "step": 271075 }, { "epoch": 3.992577428903845, "grad_norm": 1.4977132081985474, "learning_rate": 4.4844446480818445e-06, "loss": 0.0474, "step": 271100 }, { "epoch": 3.9929456119939326, "grad_norm": 1.8170990943908691, "learning_rate": 4.482808276114861e-06, "loss": 0.0636, "step": 271125 }, { "epoch": 3.9933137950840196, "grad_norm": 1.537147879600525, "learning_rate": 4.481171904147876e-06, "loss": 0.0509, "step": 271150 }, { "epoch": 3.9936819781741066, "grad_norm": 1.4248909950256348, "learning_rate": 4.479535532180891e-06, "loss": 0.0543, "step": 271175 }, { "epoch": 3.9940501612641937, "grad_norm": 0.7828039526939392, "learning_rate": 4.4778991602139074e-06, "loss": 0.0486, "step": 271200 }, { "epoch": 3.9944183443542807, "grad_norm": 1.4615932703018188, "learning_rate": 4.476262788246922e-06, "loss": 0.0573, "step": 271225 }, { "epoch": 3.9947865274443677, "grad_norm": 0.7762133479118347, "learning_rate": 4.474626416279938e-06, "loss": 0.0567, "step": 271250 }, { "epoch": 3.9951547105344547, "grad_norm": 1.386361002922058, "learning_rate": 4.472990044312953e-06, "loss": 0.0522, "step": 271275 }, { "epoch": 3.9955228936245417, "grad_norm": 1.1249874830245972, "learning_rate": 4.471353672345969e-06, "loss": 0.0607, "step": 271300 }, { "epoch": 3.9958910767146287, "grad_norm": 0.9048838019371033, "learning_rate": 4.469717300378984e-06, "loss": 0.0491, "step": 271325 }, { "epoch": 3.9962592598047157, "grad_norm": 1.614412546157837, "learning_rate": 4.468080928412e-06, "loss": 0.0507, "step": 271350 }, { "epoch": 3.9966274428948028, "grad_norm": 1.4011964797973633, "learning_rate": 4.4664445564450146e-06, "loss": 0.052, "step": 271375 }, { "epoch": 3.9969956259848898, "grad_norm": 1.6174417734146118, "learning_rate": 4.464808184478031e-06, "loss": 0.0498, "step": 271400 }, { "epoch": 3.997363809074977, "grad_norm": 1.1240805387496948, "learning_rate": 4.463171812511046e-06, "loss": 0.0514, "step": 271425 }, { "epoch": 3.997731992165064, "grad_norm": 0.9346145987510681, "learning_rate": 4.461535440544061e-06, "loss": 0.0478, "step": 271450 }, { "epoch": 3.998100175255151, "grad_norm": 1.1280893087387085, "learning_rate": 4.459899068577077e-06, "loss": 0.05, "step": 271475 }, { "epoch": 3.998468358345238, "grad_norm": 1.5235371589660645, "learning_rate": 4.458262696610092e-06, "loss": 0.0571, "step": 271500 }, { "epoch": 3.998836541435325, "grad_norm": 1.5510966777801514, "learning_rate": 4.456626324643107e-06, "loss": 0.0491, "step": 271525 }, { "epoch": 3.999204724525412, "grad_norm": 1.0778614282608032, "learning_rate": 4.454989952676123e-06, "loss": 0.0483, "step": 271550 }, { "epoch": 3.999572907615499, "grad_norm": 1.274242639541626, "learning_rate": 4.453353580709138e-06, "loss": 0.0556, "step": 271575 }, { "epoch": 3.999941090705586, "grad_norm": 1.295026183128357, "learning_rate": 4.451717208742154e-06, "loss": 0.0519, "step": 271600 }, { "epoch": 4.0, "eval_loss": 0.058289363980293274, "eval_runtime": 112.6241, "eval_samples_per_second": 3149.84, "eval_steps_per_second": 6.153, "step": 271604 }, { "epoch": 4.000309273795673, "grad_norm": 1.0060703754425049, "learning_rate": 4.450080836775169e-06, "loss": 0.0538, "step": 271625 }, { "epoch": 4.00067745688576, "grad_norm": 1.6256768703460693, "learning_rate": 4.448444464808185e-06, "loss": 0.0523, "step": 271650 }, { "epoch": 4.001045639975847, "grad_norm": 1.5601869821548462, "learning_rate": 4.446808092841201e-06, "loss": 0.0566, "step": 271675 }, { "epoch": 4.001413823065934, "grad_norm": 1.1750060319900513, "learning_rate": 4.445171720874216e-06, "loss": 0.0642, "step": 271700 }, { "epoch": 4.001782006156021, "grad_norm": 1.1978039741516113, "learning_rate": 4.443535348907231e-06, "loss": 0.0529, "step": 271725 }, { "epoch": 4.002150189246108, "grad_norm": 1.0152662992477417, "learning_rate": 4.441898976940247e-06, "loss": 0.0509, "step": 271750 }, { "epoch": 4.002518372336195, "grad_norm": 1.405200481414795, "learning_rate": 4.440262604973262e-06, "loss": 0.0546, "step": 271775 }, { "epoch": 4.002886555426282, "grad_norm": 1.8479878902435303, "learning_rate": 4.438626233006277e-06, "loss": 0.0545, "step": 271800 }, { "epoch": 4.003254738516369, "grad_norm": 1.265377163887024, "learning_rate": 4.4369898610392934e-06, "loss": 0.0633, "step": 271825 }, { "epoch": 4.003622921606456, "grad_norm": 1.418918251991272, "learning_rate": 4.435353489072308e-06, "loss": 0.0524, "step": 271850 }, { "epoch": 4.003991104696543, "grad_norm": 1.582596778869629, "learning_rate": 4.433717117105324e-06, "loss": 0.054, "step": 271875 }, { "epoch": 4.004359287786631, "grad_norm": 1.4304360151290894, "learning_rate": 4.432080745138339e-06, "loss": 0.0525, "step": 271900 }, { "epoch": 4.004727470876718, "grad_norm": 1.1909329891204834, "learning_rate": 4.430444373171355e-06, "loss": 0.0545, "step": 271925 }, { "epoch": 4.005095653966805, "grad_norm": 1.5431170463562012, "learning_rate": 4.42880800120437e-06, "loss": 0.046, "step": 271950 }, { "epoch": 4.005463837056892, "grad_norm": 1.464337706565857, "learning_rate": 4.427171629237385e-06, "loss": 0.048, "step": 271975 }, { "epoch": 4.005832020146979, "grad_norm": 0.8372604250907898, "learning_rate": 4.4255352572704006e-06, "loss": 0.0545, "step": 272000 }, { "epoch": 4.006200203237066, "grad_norm": 0.969363808631897, "learning_rate": 4.423898885303417e-06, "loss": 0.0471, "step": 272025 }, { "epoch": 4.006568386327153, "grad_norm": 1.3070334196090698, "learning_rate": 4.422262513336432e-06, "loss": 0.0523, "step": 272050 }, { "epoch": 4.00693656941724, "grad_norm": 1.105718731880188, "learning_rate": 4.420626141369447e-06, "loss": 0.0478, "step": 272075 }, { "epoch": 4.007304752507327, "grad_norm": 1.1749552488327026, "learning_rate": 4.418989769402463e-06, "loss": 0.049, "step": 272100 }, { "epoch": 4.007672935597414, "grad_norm": 1.798275351524353, "learning_rate": 4.417353397435478e-06, "loss": 0.0511, "step": 272125 }, { "epoch": 4.008041118687501, "grad_norm": 1.2194712162017822, "learning_rate": 4.415717025468494e-06, "loss": 0.0469, "step": 272150 }, { "epoch": 4.008409301777588, "grad_norm": 1.7627757787704468, "learning_rate": 4.414080653501509e-06, "loss": 0.0523, "step": 272175 }, { "epoch": 4.008777484867675, "grad_norm": 1.3913440704345703, "learning_rate": 4.412444281534525e-06, "loss": 0.0508, "step": 272200 }, { "epoch": 4.009145667957762, "grad_norm": 1.026561975479126, "learning_rate": 4.41080790956754e-06, "loss": 0.055, "step": 272225 }, { "epoch": 4.009513851047849, "grad_norm": 1.353627324104309, "learning_rate": 4.409171537600555e-06, "loss": 0.0545, "step": 272250 }, { "epoch": 4.009882034137936, "grad_norm": 1.5493100881576538, "learning_rate": 4.407535165633571e-06, "loss": 0.0486, "step": 272275 }, { "epoch": 4.010250217228023, "grad_norm": 1.1753915548324585, "learning_rate": 4.405898793666587e-06, "loss": 0.053, "step": 272300 }, { "epoch": 4.01061840031811, "grad_norm": 1.3746318817138672, "learning_rate": 4.404262421699601e-06, "loss": 0.0486, "step": 272325 }, { "epoch": 4.010986583408197, "grad_norm": 1.4474351406097412, "learning_rate": 4.402626049732617e-06, "loss": 0.053, "step": 272350 }, { "epoch": 4.011354766498284, "grad_norm": 1.0470190048217773, "learning_rate": 4.400989677765633e-06, "loss": 0.0548, "step": 272375 }, { "epoch": 4.011722949588371, "grad_norm": 1.1962851285934448, "learning_rate": 4.399353305798648e-06, "loss": 0.0563, "step": 272400 }, { "epoch": 4.012091132678458, "grad_norm": 1.5802463293075562, "learning_rate": 4.397716933831663e-06, "loss": 0.0536, "step": 272425 }, { "epoch": 4.012459315768545, "grad_norm": 1.2477504014968872, "learning_rate": 4.3960805618646794e-06, "loss": 0.067, "step": 272450 }, { "epoch": 4.012827498858632, "grad_norm": 1.3278979063034058, "learning_rate": 4.394509644776373e-06, "loss": 0.0536, "step": 272475 }, { "epoch": 4.013195681948719, "grad_norm": 1.1894779205322266, "learning_rate": 4.3928732728093895e-06, "loss": 0.0478, "step": 272500 }, { "epoch": 4.013563865038806, "grad_norm": 1.1184048652648926, "learning_rate": 4.391236900842405e-06, "loss": 0.0542, "step": 272525 }, { "epoch": 4.013932048128893, "grad_norm": 1.0860342979431152, "learning_rate": 4.38960052887542e-06, "loss": 0.0477, "step": 272550 }, { "epoch": 4.01430023121898, "grad_norm": 0.7316974997520447, "learning_rate": 4.387964156908435e-06, "loss": 0.0534, "step": 272575 }, { "epoch": 4.014668414309067, "grad_norm": 1.1899856328964233, "learning_rate": 4.386327784941451e-06, "loss": 0.0506, "step": 272600 }, { "epoch": 4.015036597399154, "grad_norm": 1.1814501285552979, "learning_rate": 4.384691412974467e-06, "loss": 0.0557, "step": 272625 }, { "epoch": 4.015404780489241, "grad_norm": 1.3784784078598022, "learning_rate": 4.383055041007482e-06, "loss": 0.0521, "step": 272650 }, { "epoch": 4.0157729635793284, "grad_norm": 0.9269449710845947, "learning_rate": 4.3814186690404974e-06, "loss": 0.0513, "step": 272675 }, { "epoch": 4.0161411466694155, "grad_norm": 1.6109111309051514, "learning_rate": 4.379782297073513e-06, "loss": 0.0481, "step": 272700 }, { "epoch": 4.0165093297595025, "grad_norm": 1.2411295175552368, "learning_rate": 4.378145925106528e-06, "loss": 0.053, "step": 272725 }, { "epoch": 4.0168775128495895, "grad_norm": 1.662144660949707, "learning_rate": 4.376509553139543e-06, "loss": 0.056, "step": 272750 }, { "epoch": 4.0172456959396765, "grad_norm": 1.5705420970916748, "learning_rate": 4.3748731811725595e-06, "loss": 0.055, "step": 272775 }, { "epoch": 4.0176138790297635, "grad_norm": 1.402494192123413, "learning_rate": 4.373236809205574e-06, "loss": 0.0438, "step": 272800 }, { "epoch": 4.0179820621198505, "grad_norm": 1.5950024127960205, "learning_rate": 4.37160043723859e-06, "loss": 0.0467, "step": 272825 }, { "epoch": 4.0183502452099376, "grad_norm": 1.5156582593917847, "learning_rate": 4.369964065271605e-06, "loss": 0.0469, "step": 272850 }, { "epoch": 4.0187184283000255, "grad_norm": 1.172438621520996, "learning_rate": 4.368327693304621e-06, "loss": 0.0486, "step": 272875 }, { "epoch": 4.0190866113901125, "grad_norm": 1.0187807083129883, "learning_rate": 4.366691321337636e-06, "loss": 0.0586, "step": 272900 }, { "epoch": 4.0194547944801995, "grad_norm": 1.3460967540740967, "learning_rate": 4.365054949370652e-06, "loss": 0.054, "step": 272925 }, { "epoch": 4.0198229775702865, "grad_norm": 1.8421919345855713, "learning_rate": 4.363418577403667e-06, "loss": 0.0517, "step": 272950 }, { "epoch": 4.0201911606603735, "grad_norm": 1.0277374982833862, "learning_rate": 4.361782205436683e-06, "loss": 0.0455, "step": 272975 }, { "epoch": 4.0205593437504605, "grad_norm": 0.9868427515029907, "learning_rate": 4.360145833469698e-06, "loss": 0.0535, "step": 273000 }, { "epoch": 4.0209275268405476, "grad_norm": 1.530604600906372, "learning_rate": 4.358509461502713e-06, "loss": 0.0516, "step": 273025 }, { "epoch": 4.021295709930635, "grad_norm": 0.7439272403717041, "learning_rate": 4.356873089535729e-06, "loss": 0.0455, "step": 273050 }, { "epoch": 4.021663893020722, "grad_norm": 1.4210286140441895, "learning_rate": 4.355236717568744e-06, "loss": 0.053, "step": 273075 }, { "epoch": 4.022032076110809, "grad_norm": 1.286408543586731, "learning_rate": 4.35360034560176e-06, "loss": 0.0546, "step": 273100 }, { "epoch": 4.022400259200896, "grad_norm": 1.2621315717697144, "learning_rate": 4.3519639736347755e-06, "loss": 0.05, "step": 273125 }, { "epoch": 4.022768442290983, "grad_norm": 1.1417889595031738, "learning_rate": 4.350327601667791e-06, "loss": 0.0485, "step": 273150 }, { "epoch": 4.02313662538107, "grad_norm": 2.103334665298462, "learning_rate": 4.348691229700806e-06, "loss": 0.0502, "step": 273175 }, { "epoch": 4.023504808471157, "grad_norm": 1.42632257938385, "learning_rate": 4.347054857733821e-06, "loss": 0.0516, "step": 273200 }, { "epoch": 4.023872991561244, "grad_norm": 1.1343106031417847, "learning_rate": 4.345418485766837e-06, "loss": 0.0503, "step": 273225 }, { "epoch": 4.024241174651331, "grad_norm": 1.399665117263794, "learning_rate": 4.343782113799853e-06, "loss": 0.0543, "step": 273250 }, { "epoch": 4.024609357741418, "grad_norm": 1.361912488937378, "learning_rate": 4.342145741832868e-06, "loss": 0.0522, "step": 273275 }, { "epoch": 4.024977540831505, "grad_norm": 1.5529487133026123, "learning_rate": 4.3405093698658834e-06, "loss": 0.055, "step": 273300 }, { "epoch": 4.025345723921592, "grad_norm": 1.268615961074829, "learning_rate": 4.338872997898899e-06, "loss": 0.0511, "step": 273325 }, { "epoch": 4.025713907011679, "grad_norm": 1.5254231691360474, "learning_rate": 4.337236625931914e-06, "loss": 0.0533, "step": 273350 }, { "epoch": 4.026082090101766, "grad_norm": 1.4463733434677124, "learning_rate": 4.335600253964929e-06, "loss": 0.0575, "step": 273375 }, { "epoch": 4.026450273191853, "grad_norm": 1.6330395936965942, "learning_rate": 4.3339638819979455e-06, "loss": 0.057, "step": 273400 }, { "epoch": 4.02681845628194, "grad_norm": 1.4489814043045044, "learning_rate": 4.33232751003096e-06, "loss": 0.0537, "step": 273425 }, { "epoch": 4.027186639372027, "grad_norm": 1.350322961807251, "learning_rate": 4.330691138063976e-06, "loss": 0.0499, "step": 273450 }, { "epoch": 4.027554822462114, "grad_norm": 1.1626771688461304, "learning_rate": 4.329054766096991e-06, "loss": 0.048, "step": 273475 }, { "epoch": 4.027923005552201, "grad_norm": 1.0108155012130737, "learning_rate": 4.327418394130007e-06, "loss": 0.0527, "step": 273500 }, { "epoch": 4.028291188642288, "grad_norm": 1.025947093963623, "learning_rate": 4.325782022163022e-06, "loss": 0.0497, "step": 273525 }, { "epoch": 4.028659371732375, "grad_norm": 0.9710474014282227, "learning_rate": 4.324211105074717e-06, "loss": 0.0526, "step": 273550 }, { "epoch": 4.029027554822462, "grad_norm": 0.8908804655075073, "learning_rate": 4.322574733107733e-06, "loss": 0.0558, "step": 273575 }, { "epoch": 4.029395737912549, "grad_norm": 1.1707205772399902, "learning_rate": 4.320938361140748e-06, "loss": 0.0425, "step": 273600 }, { "epoch": 4.029763921002636, "grad_norm": 1.9893933534622192, "learning_rate": 4.3193019891737635e-06, "loss": 0.0541, "step": 273625 }, { "epoch": 4.030132104092723, "grad_norm": 1.2278904914855957, "learning_rate": 4.317665617206779e-06, "loss": 0.0488, "step": 273650 }, { "epoch": 4.03050028718281, "grad_norm": 1.466922640800476, "learning_rate": 4.316029245239794e-06, "loss": 0.0507, "step": 273675 }, { "epoch": 4.030868470272897, "grad_norm": 1.2266409397125244, "learning_rate": 4.3143928732728094e-06, "loss": 0.0505, "step": 273700 }, { "epoch": 4.031236653362984, "grad_norm": 1.2560009956359863, "learning_rate": 4.312756501305826e-06, "loss": 0.0495, "step": 273725 }, { "epoch": 4.031604836453071, "grad_norm": 1.5691561698913574, "learning_rate": 4.311120129338841e-06, "loss": 0.058, "step": 273750 }, { "epoch": 4.031973019543158, "grad_norm": 1.390404224395752, "learning_rate": 4.309483757371856e-06, "loss": 0.0472, "step": 273775 }, { "epoch": 4.032341202633245, "grad_norm": 1.7149642705917358, "learning_rate": 4.3078473854048715e-06, "loss": 0.0545, "step": 273800 }, { "epoch": 4.032709385723333, "grad_norm": 1.1567271947860718, "learning_rate": 4.306211013437887e-06, "loss": 0.0432, "step": 273825 }, { "epoch": 4.03307756881342, "grad_norm": 1.3016043901443481, "learning_rate": 4.304574641470902e-06, "loss": 0.0502, "step": 273850 }, { "epoch": 4.033445751903507, "grad_norm": 1.5264921188354492, "learning_rate": 4.302938269503918e-06, "loss": 0.0527, "step": 273875 }, { "epoch": 4.033813934993594, "grad_norm": 1.398117184638977, "learning_rate": 4.301301897536933e-06, "loss": 0.0474, "step": 273900 }, { "epoch": 4.034182118083681, "grad_norm": 1.3414297103881836, "learning_rate": 4.299665525569949e-06, "loss": 0.0436, "step": 273925 }, { "epoch": 4.034550301173768, "grad_norm": 1.3082327842712402, "learning_rate": 4.298029153602964e-06, "loss": 0.0564, "step": 273950 }, { "epoch": 4.034918484263855, "grad_norm": 1.8074843883514404, "learning_rate": 4.2963927816359795e-06, "loss": 0.0561, "step": 273975 }, { "epoch": 4.035286667353942, "grad_norm": 1.2525572776794434, "learning_rate": 4.294756409668995e-06, "loss": 0.0516, "step": 274000 }, { "epoch": 4.035654850444029, "grad_norm": 0.9967262744903564, "learning_rate": 4.29312003770201e-06, "loss": 0.0515, "step": 274025 }, { "epoch": 4.036023033534116, "grad_norm": 0.7842008471488953, "learning_rate": 4.291483665735026e-06, "loss": 0.0538, "step": 274050 }, { "epoch": 4.036391216624203, "grad_norm": 1.54563307762146, "learning_rate": 4.2898472937680415e-06, "loss": 0.049, "step": 274075 }, { "epoch": 4.03675939971429, "grad_norm": 1.1523215770721436, "learning_rate": 4.288210921801057e-06, "loss": 0.053, "step": 274100 }, { "epoch": 4.037127582804377, "grad_norm": 1.4274777173995972, "learning_rate": 4.286574549834072e-06, "loss": 0.0516, "step": 274125 }, { "epoch": 4.037495765894464, "grad_norm": 1.737199306488037, "learning_rate": 4.284938177867088e-06, "loss": 0.0494, "step": 274150 }, { "epoch": 4.037863948984551, "grad_norm": 0.4694286584854126, "learning_rate": 4.283301805900103e-06, "loss": 0.0536, "step": 274175 }, { "epoch": 4.038232132074638, "grad_norm": 1.7536449432373047, "learning_rate": 4.281665433933119e-06, "loss": 0.0569, "step": 274200 }, { "epoch": 4.038600315164725, "grad_norm": 1.1667371988296509, "learning_rate": 4.280029061966134e-06, "loss": 0.0511, "step": 274225 }, { "epoch": 4.038968498254812, "grad_norm": 1.3396191596984863, "learning_rate": 4.2783926899991495e-06, "loss": 0.0561, "step": 274250 }, { "epoch": 4.039336681344899, "grad_norm": 1.516628384590149, "learning_rate": 4.276756318032165e-06, "loss": 0.0535, "step": 274275 }, { "epoch": 4.039704864434986, "grad_norm": 1.5235519409179688, "learning_rate": 4.27511994606518e-06, "loss": 0.0545, "step": 274300 }, { "epoch": 4.040073047525073, "grad_norm": 1.9207894802093506, "learning_rate": 4.2734835740981954e-06, "loss": 0.0494, "step": 274325 }, { "epoch": 4.04044123061516, "grad_norm": 1.4713939428329468, "learning_rate": 4.271847202131212e-06, "loss": 0.05, "step": 274350 }, { "epoch": 4.040809413705247, "grad_norm": 1.1645876169204712, "learning_rate": 4.270210830164226e-06, "loss": 0.0486, "step": 274375 }, { "epoch": 4.041177596795334, "grad_norm": 0.8513479232788086, "learning_rate": 4.268574458197242e-06, "loss": 0.0488, "step": 274400 }, { "epoch": 4.041545779885421, "grad_norm": 1.943671464920044, "learning_rate": 4.2669380862302575e-06, "loss": 0.0507, "step": 274425 }, { "epoch": 4.041913962975508, "grad_norm": 1.4613126516342163, "learning_rate": 4.265301714263273e-06, "loss": 0.0495, "step": 274450 }, { "epoch": 4.042282146065595, "grad_norm": 1.0549331903457642, "learning_rate": 4.263665342296288e-06, "loss": 0.0459, "step": 274475 }, { "epoch": 4.042650329155682, "grad_norm": 1.629569411277771, "learning_rate": 4.262028970329304e-06, "loss": 0.0459, "step": 274500 }, { "epoch": 4.043018512245769, "grad_norm": 1.6735188961029053, "learning_rate": 4.2603925983623196e-06, "loss": 0.0529, "step": 274525 }, { "epoch": 4.043386695335856, "grad_norm": 1.1502186059951782, "learning_rate": 4.258756226395335e-06, "loss": 0.0474, "step": 274550 }, { "epoch": 4.043754878425943, "grad_norm": 1.0216065645217896, "learning_rate": 4.25711985442835e-06, "loss": 0.0554, "step": 274575 }, { "epoch": 4.04412306151603, "grad_norm": 1.4073421955108643, "learning_rate": 4.2554834824613655e-06, "loss": 0.0561, "step": 274600 }, { "epoch": 4.044491244606117, "grad_norm": 1.106372594833374, "learning_rate": 4.253847110494382e-06, "loss": 0.0584, "step": 274625 }, { "epoch": 4.044859427696204, "grad_norm": 1.419223427772522, "learning_rate": 4.252210738527396e-06, "loss": 0.0471, "step": 274650 }, { "epoch": 4.0452276107862915, "grad_norm": 1.1996541023254395, "learning_rate": 4.250574366560412e-06, "loss": 0.0497, "step": 274675 }, { "epoch": 4.0455957938763785, "grad_norm": 1.3995702266693115, "learning_rate": 4.2489379945934275e-06, "loss": 0.0493, "step": 274700 }, { "epoch": 4.0459639769664655, "grad_norm": 1.0698034763336182, "learning_rate": 4.247301622626443e-06, "loss": 0.047, "step": 274725 }, { "epoch": 4.0463321600565525, "grad_norm": 1.38143789768219, "learning_rate": 4.245665250659458e-06, "loss": 0.0501, "step": 274750 }, { "epoch": 4.04670034314664, "grad_norm": 1.5964407920837402, "learning_rate": 4.2440288786924734e-06, "loss": 0.0504, "step": 274775 }, { "epoch": 4.047068526236727, "grad_norm": 1.6865134239196777, "learning_rate": 4.242392506725489e-06, "loss": 0.0448, "step": 274800 }, { "epoch": 4.047436709326814, "grad_norm": 1.5240774154663086, "learning_rate": 4.240756134758505e-06, "loss": 0.0509, "step": 274825 }, { "epoch": 4.0478048924169014, "grad_norm": 1.4935787916183472, "learning_rate": 4.23911976279152e-06, "loss": 0.0498, "step": 274850 }, { "epoch": 4.0481730755069885, "grad_norm": 1.2989176511764526, "learning_rate": 4.2374833908245355e-06, "loss": 0.0534, "step": 274875 }, { "epoch": 4.0485412585970755, "grad_norm": 1.2426633834838867, "learning_rate": 4.235847018857551e-06, "loss": 0.055, "step": 274900 }, { "epoch": 4.0489094416871625, "grad_norm": 1.389223575592041, "learning_rate": 4.234210646890566e-06, "loss": 0.054, "step": 274925 }, { "epoch": 4.0492776247772495, "grad_norm": 1.1781851053237915, "learning_rate": 4.2325742749235814e-06, "loss": 0.053, "step": 274950 }, { "epoch": 4.0496458078673365, "grad_norm": 1.1376879215240479, "learning_rate": 4.230937902956598e-06, "loss": 0.0493, "step": 274975 }, { "epoch": 4.0500139909574235, "grad_norm": 1.385263442993164, "learning_rate": 4.229301530989612e-06, "loss": 0.0512, "step": 275000 }, { "epoch": 4.0503821740475106, "grad_norm": 1.1493520736694336, "learning_rate": 4.227665159022628e-06, "loss": 0.0588, "step": 275025 }, { "epoch": 4.050750357137598, "grad_norm": 0.9774383306503296, "learning_rate": 4.2260287870556435e-06, "loss": 0.0561, "step": 275050 }, { "epoch": 4.051118540227685, "grad_norm": 1.7937761545181274, "learning_rate": 4.224392415088659e-06, "loss": 0.0505, "step": 275075 }, { "epoch": 4.051486723317772, "grad_norm": 0.9946050643920898, "learning_rate": 4.222756043121675e-06, "loss": 0.0524, "step": 275100 }, { "epoch": 4.051854906407859, "grad_norm": 1.591372013092041, "learning_rate": 4.221119671154689e-06, "loss": 0.0568, "step": 275125 }, { "epoch": 4.052223089497946, "grad_norm": 1.4732050895690918, "learning_rate": 4.2194832991877056e-06, "loss": 0.0497, "step": 275150 }, { "epoch": 4.052591272588033, "grad_norm": 2.2683539390563965, "learning_rate": 4.217846927220721e-06, "loss": 0.0538, "step": 275175 }, { "epoch": 4.05295945567812, "grad_norm": 1.082350254058838, "learning_rate": 4.216210555253736e-06, "loss": 0.0492, "step": 275200 }, { "epoch": 4.053327638768207, "grad_norm": 1.438417673110962, "learning_rate": 4.2145741832867515e-06, "loss": 0.0474, "step": 275225 }, { "epoch": 4.053695821858294, "grad_norm": 1.1476970911026, "learning_rate": 4.212937811319768e-06, "loss": 0.0513, "step": 275250 }, { "epoch": 4.054064004948381, "grad_norm": 1.1711137294769287, "learning_rate": 4.211301439352782e-06, "loss": 0.0496, "step": 275275 }, { "epoch": 4.054432188038468, "grad_norm": 1.4307100772857666, "learning_rate": 4.209665067385798e-06, "loss": 0.0533, "step": 275300 }, { "epoch": 4.054800371128555, "grad_norm": 1.5378831624984741, "learning_rate": 4.2080286954188135e-06, "loss": 0.055, "step": 275325 }, { "epoch": 4.055168554218642, "grad_norm": 1.559235692024231, "learning_rate": 4.206392323451829e-06, "loss": 0.054, "step": 275350 }, { "epoch": 4.055536737308729, "grad_norm": 1.107239842414856, "learning_rate": 4.204755951484844e-06, "loss": 0.0499, "step": 275375 }, { "epoch": 4.055904920398816, "grad_norm": 1.0033743381500244, "learning_rate": 4.2031195795178594e-06, "loss": 0.0475, "step": 275400 }, { "epoch": 4.056273103488903, "grad_norm": 1.441225290298462, "learning_rate": 4.201483207550875e-06, "loss": 0.0493, "step": 275425 }, { "epoch": 4.05664128657899, "grad_norm": 1.8936614990234375, "learning_rate": 4.199846835583891e-06, "loss": 0.0439, "step": 275450 }, { "epoch": 4.057009469669077, "grad_norm": 1.6256004571914673, "learning_rate": 4.198210463616905e-06, "loss": 0.0531, "step": 275475 }, { "epoch": 4.057377652759164, "grad_norm": 1.1434792280197144, "learning_rate": 4.1965740916499215e-06, "loss": 0.0534, "step": 275500 }, { "epoch": 4.057745835849251, "grad_norm": 1.2260769605636597, "learning_rate": 4.194937719682937e-06, "loss": 0.0438, "step": 275525 }, { "epoch": 4.058114018939338, "grad_norm": 1.2241178750991821, "learning_rate": 4.193301347715952e-06, "loss": 0.051, "step": 275550 }, { "epoch": 4.058482202029425, "grad_norm": 1.7049682140350342, "learning_rate": 4.191664975748968e-06, "loss": 0.0481, "step": 275575 }, { "epoch": 4.058850385119512, "grad_norm": 1.180618166923523, "learning_rate": 4.190028603781984e-06, "loss": 0.0471, "step": 275600 }, { "epoch": 4.059218568209599, "grad_norm": 1.0967459678649902, "learning_rate": 4.188392231814999e-06, "loss": 0.0566, "step": 275625 }, { "epoch": 4.059586751299686, "grad_norm": 1.3460229635238647, "learning_rate": 4.186755859848014e-06, "loss": 0.0489, "step": 275650 }, { "epoch": 4.059954934389773, "grad_norm": 1.5702486038208008, "learning_rate": 4.185184942759709e-06, "loss": 0.0602, "step": 275675 }, { "epoch": 4.06032311747986, "grad_norm": 1.0367343425750732, "learning_rate": 4.183548570792724e-06, "loss": 0.0506, "step": 275700 }, { "epoch": 4.060691300569948, "grad_norm": 0.8412879109382629, "learning_rate": 4.18191219882574e-06, "loss": 0.0564, "step": 275725 }, { "epoch": 4.061059483660035, "grad_norm": 1.5939195156097412, "learning_rate": 4.180275826858755e-06, "loss": 0.0486, "step": 275750 }, { "epoch": 4.061427666750122, "grad_norm": 1.1993752717971802, "learning_rate": 4.178639454891771e-06, "loss": 0.0484, "step": 275775 }, { "epoch": 4.061795849840209, "grad_norm": 1.3193093538284302, "learning_rate": 4.177003082924786e-06, "loss": 0.0497, "step": 275800 }, { "epoch": 4.062164032930296, "grad_norm": 1.7645987272262573, "learning_rate": 4.175366710957802e-06, "loss": 0.0512, "step": 275825 }, { "epoch": 4.062532216020383, "grad_norm": 1.0492877960205078, "learning_rate": 4.173730338990817e-06, "loss": 0.0487, "step": 275850 }, { "epoch": 4.06290039911047, "grad_norm": 1.2446444034576416, "learning_rate": 4.172093967023832e-06, "loss": 0.053, "step": 275875 }, { "epoch": 4.063268582200557, "grad_norm": 0.8729466795921326, "learning_rate": 4.1704575950568475e-06, "loss": 0.0557, "step": 275900 }, { "epoch": 4.063636765290644, "grad_norm": 1.142698884010315, "learning_rate": 4.168821223089864e-06, "loss": 0.0529, "step": 275925 }, { "epoch": 4.064004948380731, "grad_norm": 1.6176023483276367, "learning_rate": 4.167184851122878e-06, "loss": 0.045, "step": 275950 }, { "epoch": 4.064373131470818, "grad_norm": 1.1004976034164429, "learning_rate": 4.165548479155894e-06, "loss": 0.0483, "step": 275975 }, { "epoch": 4.064741314560905, "grad_norm": 0.9486537575721741, "learning_rate": 4.1639121071889096e-06, "loss": 0.0537, "step": 276000 }, { "epoch": 4.065109497650992, "grad_norm": 1.4191198348999023, "learning_rate": 4.162275735221925e-06, "loss": 0.0428, "step": 276025 }, { "epoch": 4.065477680741079, "grad_norm": 1.368203043937683, "learning_rate": 4.160639363254941e-06, "loss": 0.0496, "step": 276050 }, { "epoch": 4.065845863831166, "grad_norm": 1.5861294269561768, "learning_rate": 4.159002991287956e-06, "loss": 0.0529, "step": 276075 }, { "epoch": 4.066214046921253, "grad_norm": 1.3480197191238403, "learning_rate": 4.157366619320972e-06, "loss": 0.058, "step": 276100 }, { "epoch": 4.06658223001134, "grad_norm": 1.3167120218276978, "learning_rate": 4.155730247353987e-06, "loss": 0.0487, "step": 276125 }, { "epoch": 4.066950413101427, "grad_norm": 1.538070797920227, "learning_rate": 4.154093875387002e-06, "loss": 0.0564, "step": 276150 }, { "epoch": 4.067318596191514, "grad_norm": 1.1279892921447754, "learning_rate": 4.1524575034200175e-06, "loss": 0.0475, "step": 276175 }, { "epoch": 4.067686779281601, "grad_norm": 1.28226637840271, "learning_rate": 4.150821131453034e-06, "loss": 0.0518, "step": 276200 }, { "epoch": 4.068054962371688, "grad_norm": 1.1002174615859985, "learning_rate": 4.149184759486048e-06, "loss": 0.0457, "step": 276225 }, { "epoch": 4.068423145461775, "grad_norm": 1.401401162147522, "learning_rate": 4.147548387519064e-06, "loss": 0.0533, "step": 276250 }, { "epoch": 4.068791328551862, "grad_norm": 1.037559151649475, "learning_rate": 4.14591201555208e-06, "loss": 0.0458, "step": 276275 }, { "epoch": 4.069159511641949, "grad_norm": 1.0772476196289062, "learning_rate": 4.144275643585095e-06, "loss": 0.0499, "step": 276300 }, { "epoch": 4.069527694732036, "grad_norm": 0.9725455641746521, "learning_rate": 4.14263927161811e-06, "loss": 0.0546, "step": 276325 }, { "epoch": 4.069895877822123, "grad_norm": 1.4394406080245972, "learning_rate": 4.1410028996511255e-06, "loss": 0.0556, "step": 276350 }, { "epoch": 4.07026406091221, "grad_norm": 1.7116791009902954, "learning_rate": 4.139366527684141e-06, "loss": 0.0476, "step": 276375 }, { "epoch": 4.070632244002297, "grad_norm": 1.3169617652893066, "learning_rate": 4.137730155717157e-06, "loss": 0.05, "step": 276400 }, { "epoch": 4.071000427092384, "grad_norm": 1.8277477025985718, "learning_rate": 4.136093783750172e-06, "loss": 0.0529, "step": 276425 }, { "epoch": 4.071368610182471, "grad_norm": 1.1039022207260132, "learning_rate": 4.134457411783188e-06, "loss": 0.053, "step": 276450 }, { "epoch": 4.071736793272558, "grad_norm": 0.8616538047790527, "learning_rate": 4.132821039816203e-06, "loss": 0.0515, "step": 276475 }, { "epoch": 4.072104976362645, "grad_norm": 0.7401161789894104, "learning_rate": 4.131184667849218e-06, "loss": 0.0474, "step": 276500 }, { "epoch": 4.072473159452732, "grad_norm": 0.8509481549263, "learning_rate": 4.129548295882234e-06, "loss": 0.0539, "step": 276525 }, { "epoch": 4.072841342542819, "grad_norm": 0.9534338712692261, "learning_rate": 4.12791192391525e-06, "loss": 0.053, "step": 276550 }, { "epoch": 4.073209525632906, "grad_norm": 1.070122480392456, "learning_rate": 4.126275551948265e-06, "loss": 0.0465, "step": 276575 }, { "epoch": 4.073577708722993, "grad_norm": 1.021049976348877, "learning_rate": 4.12463917998128e-06, "loss": 0.0511, "step": 276600 }, { "epoch": 4.07394589181308, "grad_norm": 1.3801721334457397, "learning_rate": 4.1230028080142956e-06, "loss": 0.053, "step": 276625 }, { "epoch": 4.074314074903167, "grad_norm": 1.4900864362716675, "learning_rate": 4.121366436047311e-06, "loss": 0.0502, "step": 276650 }, { "epoch": 4.0746822579932545, "grad_norm": 1.3341633081436157, "learning_rate": 4.119730064080327e-06, "loss": 0.0515, "step": 276675 }, { "epoch": 4.075050441083342, "grad_norm": 1.6892269849777222, "learning_rate": 4.1180936921133415e-06, "loss": 0.0586, "step": 276700 }, { "epoch": 4.075418624173429, "grad_norm": 1.7524826526641846, "learning_rate": 4.116457320146358e-06, "loss": 0.0524, "step": 276725 }, { "epoch": 4.075786807263516, "grad_norm": 1.2068977355957031, "learning_rate": 4.114820948179373e-06, "loss": 0.0529, "step": 276750 }, { "epoch": 4.076154990353603, "grad_norm": 1.2106729745864868, "learning_rate": 4.113184576212388e-06, "loss": 0.0515, "step": 276775 }, { "epoch": 4.07652317344369, "grad_norm": 1.0615928173065186, "learning_rate": 4.1115482042454035e-06, "loss": 0.0488, "step": 276800 }, { "epoch": 4.076891356533777, "grad_norm": 0.9959233403205872, "learning_rate": 4.10991183227842e-06, "loss": 0.0593, "step": 276825 }, { "epoch": 4.0772595396238644, "grad_norm": 1.3831136226654053, "learning_rate": 4.108275460311434e-06, "loss": 0.0532, "step": 276850 }, { "epoch": 4.0776277227139515, "grad_norm": 1.4975295066833496, "learning_rate": 4.10663908834445e-06, "loss": 0.0491, "step": 276875 }, { "epoch": 4.0779959058040385, "grad_norm": 0.7060441374778748, "learning_rate": 4.105002716377466e-06, "loss": 0.0475, "step": 276900 }, { "epoch": 4.0783640888941255, "grad_norm": 1.2254325151443481, "learning_rate": 4.103366344410481e-06, "loss": 0.0554, "step": 276925 }, { "epoch": 4.0787322719842125, "grad_norm": 1.3210432529449463, "learning_rate": 4.101729972443497e-06, "loss": 0.0507, "step": 276950 }, { "epoch": 4.0791004550742995, "grad_norm": 1.3877944946289062, "learning_rate": 4.1000936004765115e-06, "loss": 0.0502, "step": 276975 }, { "epoch": 4.0794686381643865, "grad_norm": 1.5770561695098877, "learning_rate": 4.098457228509528e-06, "loss": 0.0502, "step": 277000 }, { "epoch": 4.0798368212544736, "grad_norm": 1.1314924955368042, "learning_rate": 4.096820856542543e-06, "loss": 0.0499, "step": 277025 }, { "epoch": 4.080205004344561, "grad_norm": 1.0255426168441772, "learning_rate": 4.095184484575558e-06, "loss": 0.0513, "step": 277050 }, { "epoch": 4.080573187434648, "grad_norm": 1.539833664894104, "learning_rate": 4.093548112608574e-06, "loss": 0.0566, "step": 277075 }, { "epoch": 4.080941370524735, "grad_norm": 0.6666017770767212, "learning_rate": 4.091911740641589e-06, "loss": 0.0452, "step": 277100 }, { "epoch": 4.081309553614822, "grad_norm": 0.8971357941627502, "learning_rate": 4.090275368674604e-06, "loss": 0.0467, "step": 277125 }, { "epoch": 4.081677736704909, "grad_norm": 1.290385127067566, "learning_rate": 4.08863899670762e-06, "loss": 0.0473, "step": 277150 }, { "epoch": 4.082045919794996, "grad_norm": 1.6305418014526367, "learning_rate": 4.087002624740636e-06, "loss": 0.0545, "step": 277175 }, { "epoch": 4.082414102885083, "grad_norm": 1.1996963024139404, "learning_rate": 4.085366252773651e-06, "loss": 0.0521, "step": 277200 }, { "epoch": 4.08278228597517, "grad_norm": 0.871673583984375, "learning_rate": 4.083729880806666e-06, "loss": 0.0485, "step": 277225 }, { "epoch": 4.083150469065257, "grad_norm": 0.8779425024986267, "learning_rate": 4.0820935088396816e-06, "loss": 0.0519, "step": 277250 }, { "epoch": 4.083518652155344, "grad_norm": 1.308681607246399, "learning_rate": 4.080457136872697e-06, "loss": 0.048, "step": 277275 }, { "epoch": 4.083886835245431, "grad_norm": 1.2734334468841553, "learning_rate": 4.078820764905713e-06, "loss": 0.0516, "step": 277300 }, { "epoch": 4.084255018335518, "grad_norm": 1.5363825559616089, "learning_rate": 4.0771843929387275e-06, "loss": 0.0474, "step": 277325 }, { "epoch": 4.084623201425605, "grad_norm": 1.5124701261520386, "learning_rate": 4.075548020971744e-06, "loss": 0.0508, "step": 277350 }, { "epoch": 4.084991384515692, "grad_norm": 1.2399643659591675, "learning_rate": 4.073911649004759e-06, "loss": 0.0547, "step": 277375 }, { "epoch": 4.085359567605779, "grad_norm": 1.130418062210083, "learning_rate": 4.072275277037774e-06, "loss": 0.0534, "step": 277400 }, { "epoch": 4.085727750695866, "grad_norm": 1.3833637237548828, "learning_rate": 4.0706389050707895e-06, "loss": 0.0524, "step": 277425 }, { "epoch": 4.086095933785953, "grad_norm": 0.9419562220573425, "learning_rate": 4.069002533103805e-06, "loss": 0.0584, "step": 277450 }, { "epoch": 4.08646411687604, "grad_norm": 1.365135669708252, "learning_rate": 4.067366161136821e-06, "loss": 0.0566, "step": 277475 }, { "epoch": 4.086832299966127, "grad_norm": 0.8531058430671692, "learning_rate": 4.065729789169836e-06, "loss": 0.051, "step": 277500 }, { "epoch": 4.087200483056214, "grad_norm": 1.4153810739517212, "learning_rate": 4.064093417202852e-06, "loss": 0.0543, "step": 277525 }, { "epoch": 4.087568666146301, "grad_norm": 0.8011882305145264, "learning_rate": 4.062457045235867e-06, "loss": 0.0449, "step": 277550 }, { "epoch": 4.087936849236388, "grad_norm": 1.3054267168045044, "learning_rate": 4.060820673268882e-06, "loss": 0.0596, "step": 277575 }, { "epoch": 4.088305032326475, "grad_norm": 1.223548173904419, "learning_rate": 4.0591843013018975e-06, "loss": 0.0469, "step": 277600 }, { "epoch": 4.088673215416562, "grad_norm": 1.245463252067566, "learning_rate": 4.057547929334914e-06, "loss": 0.0501, "step": 277625 }, { "epoch": 4.08904139850665, "grad_norm": 1.0151854753494263, "learning_rate": 4.055911557367929e-06, "loss": 0.0552, "step": 277650 }, { "epoch": 4.089409581596737, "grad_norm": 1.369176983833313, "learning_rate": 4.054275185400944e-06, "loss": 0.0495, "step": 277675 }, { "epoch": 4.089777764686824, "grad_norm": 0.9294801950454712, "learning_rate": 4.05263881343396e-06, "loss": 0.0513, "step": 277700 }, { "epoch": 4.090145947776911, "grad_norm": 1.0617151260375977, "learning_rate": 4.051002441466975e-06, "loss": 0.0512, "step": 277725 }, { "epoch": 4.090514130866998, "grad_norm": 1.2690285444259644, "learning_rate": 4.04936606949999e-06, "loss": 0.0534, "step": 277750 }, { "epoch": 4.090882313957085, "grad_norm": 1.0979222059249878, "learning_rate": 4.047729697533006e-06, "loss": 0.0505, "step": 277775 }, { "epoch": 4.091250497047172, "grad_norm": 1.1480523347854614, "learning_rate": 4.046093325566021e-06, "loss": 0.0534, "step": 277800 }, { "epoch": 4.091618680137259, "grad_norm": 1.834741473197937, "learning_rate": 4.044456953599037e-06, "loss": 0.0596, "step": 277825 }, { "epoch": 4.091986863227346, "grad_norm": 1.4808859825134277, "learning_rate": 4.042820581632052e-06, "loss": 0.0496, "step": 277850 }, { "epoch": 4.092355046317433, "grad_norm": 1.0610440969467163, "learning_rate": 4.0411842096650676e-06, "loss": 0.0529, "step": 277875 }, { "epoch": 4.09272322940752, "grad_norm": 1.6741849184036255, "learning_rate": 4.039547837698083e-06, "loss": 0.0515, "step": 277900 }, { "epoch": 4.093091412497607, "grad_norm": 1.0648272037506104, "learning_rate": 4.037911465731099e-06, "loss": 0.0534, "step": 277925 }, { "epoch": 4.093459595587694, "grad_norm": 0.7898022532463074, "learning_rate": 4.036275093764114e-06, "loss": 0.0449, "step": 277950 }, { "epoch": 4.093827778677781, "grad_norm": 1.3693745136260986, "learning_rate": 4.03463872179713e-06, "loss": 0.049, "step": 277975 }, { "epoch": 4.094195961767868, "grad_norm": 0.8926895260810852, "learning_rate": 4.033002349830145e-06, "loss": 0.0468, "step": 278000 }, { "epoch": 4.094564144857955, "grad_norm": 1.4297376871109009, "learning_rate": 4.03136597786316e-06, "loss": 0.0568, "step": 278025 }, { "epoch": 4.094932327948042, "grad_norm": 1.0375431776046753, "learning_rate": 4.029729605896176e-06, "loss": 0.0521, "step": 278050 }, { "epoch": 4.095300511038129, "grad_norm": 1.9625438451766968, "learning_rate": 4.028093233929191e-06, "loss": 0.0533, "step": 278075 }, { "epoch": 4.095668694128216, "grad_norm": 1.1361236572265625, "learning_rate": 4.026456861962207e-06, "loss": 0.048, "step": 278100 }, { "epoch": 4.096036877218303, "grad_norm": 1.4360212087631226, "learning_rate": 4.024820489995222e-06, "loss": 0.056, "step": 278125 }, { "epoch": 4.09640506030839, "grad_norm": 1.3297650814056396, "learning_rate": 4.023184118028238e-06, "loss": 0.0484, "step": 278150 }, { "epoch": 4.096773243398477, "grad_norm": 0.8162236213684082, "learning_rate": 4.021547746061253e-06, "loss": 0.0562, "step": 278175 }, { "epoch": 4.097141426488564, "grad_norm": 1.244158387184143, "learning_rate": 4.019911374094268e-06, "loss": 0.0492, "step": 278200 }, { "epoch": 4.097509609578651, "grad_norm": 1.918629765510559, "learning_rate": 4.0182750021272835e-06, "loss": 0.0518, "step": 278225 }, { "epoch": 4.097877792668738, "grad_norm": 1.3182944059371948, "learning_rate": 4.0166386301603e-06, "loss": 0.0562, "step": 278250 }, { "epoch": 4.098245975758825, "grad_norm": 1.0243265628814697, "learning_rate": 4.015002258193315e-06, "loss": 0.0483, "step": 278275 }, { "epoch": 4.098614158848912, "grad_norm": 1.120998740196228, "learning_rate": 4.01336588622633e-06, "loss": 0.0537, "step": 278300 }, { "epoch": 4.098982341938999, "grad_norm": 0.9357917308807373, "learning_rate": 4.011729514259346e-06, "loss": 0.0553, "step": 278325 }, { "epoch": 4.099350525029086, "grad_norm": 1.3658519983291626, "learning_rate": 4.010093142292361e-06, "loss": 0.053, "step": 278350 }, { "epoch": 4.099718708119173, "grad_norm": 1.928463339805603, "learning_rate": 4.008456770325376e-06, "loss": 0.0541, "step": 278375 }, { "epoch": 4.10008689120926, "grad_norm": 1.170521855354309, "learning_rate": 4.006820398358392e-06, "loss": 0.0487, "step": 278400 }, { "epoch": 4.100455074299347, "grad_norm": 1.3885964155197144, "learning_rate": 4.005184026391408e-06, "loss": 0.0551, "step": 278425 }, { "epoch": 4.100823257389434, "grad_norm": 1.4824920892715454, "learning_rate": 4.003547654424423e-06, "loss": 0.053, "step": 278450 }, { "epoch": 4.101191440479521, "grad_norm": 1.451856017112732, "learning_rate": 4.001911282457438e-06, "loss": 0.0503, "step": 278475 }, { "epoch": 4.101559623569608, "grad_norm": 1.8395278453826904, "learning_rate": 4.0002749104904536e-06, "loss": 0.0504, "step": 278500 }, { "epoch": 4.101927806659695, "grad_norm": 1.5039464235305786, "learning_rate": 3.99863853852347e-06, "loss": 0.0535, "step": 278525 }, { "epoch": 4.102295989749782, "grad_norm": 0.820640504360199, "learning_rate": 3.997002166556484e-06, "loss": 0.0454, "step": 278550 }, { "epoch": 4.102664172839869, "grad_norm": 1.6552753448486328, "learning_rate": 3.9953657945895e-06, "loss": 0.0519, "step": 278575 }, { "epoch": 4.103032355929956, "grad_norm": 1.631272792816162, "learning_rate": 3.993729422622516e-06, "loss": 0.0524, "step": 278600 }, { "epoch": 4.103400539020044, "grad_norm": 1.3876011371612549, "learning_rate": 3.992093050655531e-06, "loss": 0.0533, "step": 278625 }, { "epoch": 4.103768722110131, "grad_norm": 1.0428255796432495, "learning_rate": 3.990456678688546e-06, "loss": 0.0561, "step": 278650 }, { "epoch": 4.104136905200218, "grad_norm": 1.0484966039657593, "learning_rate": 3.9888203067215615e-06, "loss": 0.0495, "step": 278675 }, { "epoch": 4.104505088290305, "grad_norm": 1.3320986032485962, "learning_rate": 3.987183934754577e-06, "loss": 0.047, "step": 278700 }, { "epoch": 4.104873271380392, "grad_norm": 0.9794262647628784, "learning_rate": 3.985547562787593e-06, "loss": 0.0517, "step": 278725 }, { "epoch": 4.105241454470479, "grad_norm": 1.261979103088379, "learning_rate": 3.983911190820608e-06, "loss": 0.0511, "step": 278750 }, { "epoch": 4.105609637560566, "grad_norm": 1.1944228410720825, "learning_rate": 3.982274818853624e-06, "loss": 0.051, "step": 278775 }, { "epoch": 4.105977820650653, "grad_norm": 1.2041842937469482, "learning_rate": 3.980638446886639e-06, "loss": 0.0544, "step": 278800 }, { "epoch": 4.10634600374074, "grad_norm": 1.5027638673782349, "learning_rate": 3.979002074919654e-06, "loss": 0.053, "step": 278825 }, { "epoch": 4.1067141868308275, "grad_norm": 1.4179985523223877, "learning_rate": 3.9773657029526695e-06, "loss": 0.0505, "step": 278850 }, { "epoch": 4.1070823699209145, "grad_norm": 1.3135807514190674, "learning_rate": 3.975729330985686e-06, "loss": 0.0547, "step": 278875 }, { "epoch": 4.1074505530110015, "grad_norm": 1.3814418315887451, "learning_rate": 3.9740929590187e-06, "loss": 0.0517, "step": 278900 }, { "epoch": 4.1078187361010885, "grad_norm": 1.207383394241333, "learning_rate": 3.972456587051716e-06, "loss": 0.0519, "step": 278925 }, { "epoch": 4.1081869191911755, "grad_norm": 1.3203582763671875, "learning_rate": 3.970820215084732e-06, "loss": 0.0626, "step": 278950 }, { "epoch": 4.1085551022812625, "grad_norm": 1.9190857410430908, "learning_rate": 3.969183843117747e-06, "loss": 0.0546, "step": 278975 }, { "epoch": 4.1089232853713495, "grad_norm": 1.5795048475265503, "learning_rate": 3.967547471150763e-06, "loss": 0.0543, "step": 279000 }, { "epoch": 4.109291468461437, "grad_norm": 1.1442686319351196, "learning_rate": 3.965911099183778e-06, "loss": 0.0572, "step": 279025 }, { "epoch": 4.109659651551524, "grad_norm": 1.3957563638687134, "learning_rate": 3.964274727216794e-06, "loss": 0.0546, "step": 279050 }, { "epoch": 4.110027834641611, "grad_norm": 1.3441755771636963, "learning_rate": 3.962638355249809e-06, "loss": 0.0479, "step": 279075 }, { "epoch": 4.110396017731698, "grad_norm": 1.243120551109314, "learning_rate": 3.961001983282824e-06, "loss": 0.0532, "step": 279100 }, { "epoch": 4.110764200821785, "grad_norm": 1.1591179370880127, "learning_rate": 3.9593656113158396e-06, "loss": 0.051, "step": 279125 }, { "epoch": 4.111132383911872, "grad_norm": 1.3041765689849854, "learning_rate": 3.957729239348856e-06, "loss": 0.0517, "step": 279150 }, { "epoch": 4.111500567001959, "grad_norm": 1.2058486938476562, "learning_rate": 3.95609286738187e-06, "loss": 0.0461, "step": 279175 }, { "epoch": 4.111868750092046, "grad_norm": 1.1057459115982056, "learning_rate": 3.954456495414886e-06, "loss": 0.0529, "step": 279200 }, { "epoch": 4.112236933182133, "grad_norm": 1.5318320989608765, "learning_rate": 3.952820123447902e-06, "loss": 0.0525, "step": 279225 }, { "epoch": 4.11260511627222, "grad_norm": 1.34402334690094, "learning_rate": 3.951183751480917e-06, "loss": 0.0534, "step": 279250 }, { "epoch": 4.112973299362307, "grad_norm": 0.8894164562225342, "learning_rate": 3.949547379513932e-06, "loss": 0.052, "step": 279275 }, { "epoch": 4.113341482452394, "grad_norm": 1.1744776964187622, "learning_rate": 3.9479110075469475e-06, "loss": 0.0469, "step": 279300 }, { "epoch": 4.113709665542481, "grad_norm": 0.9439015984535217, "learning_rate": 3.946274635579963e-06, "loss": 0.0522, "step": 279325 }, { "epoch": 4.114077848632568, "grad_norm": 1.3068386316299438, "learning_rate": 3.944638263612979e-06, "loss": 0.0508, "step": 279350 }, { "epoch": 4.114446031722655, "grad_norm": 0.9515126347541809, "learning_rate": 3.943001891645994e-06, "loss": 0.0563, "step": 279375 }, { "epoch": 4.114814214812742, "grad_norm": 1.7591304779052734, "learning_rate": 3.94136551967901e-06, "loss": 0.0522, "step": 279400 }, { "epoch": 4.115182397902829, "grad_norm": 1.4973087310791016, "learning_rate": 3.939729147712025e-06, "loss": 0.0506, "step": 279425 }, { "epoch": 4.115550580992916, "grad_norm": 1.8153070211410522, "learning_rate": 3.93809277574504e-06, "loss": 0.0517, "step": 279450 }, { "epoch": 4.115918764083003, "grad_norm": 1.1951584815979004, "learning_rate": 3.936456403778056e-06, "loss": 0.0539, "step": 279475 }, { "epoch": 4.11628694717309, "grad_norm": 1.7845375537872314, "learning_rate": 3.934820031811072e-06, "loss": 0.0538, "step": 279500 }, { "epoch": 4.116655130263177, "grad_norm": 0.9844855070114136, "learning_rate": 3.933183659844087e-06, "loss": 0.0434, "step": 279525 }, { "epoch": 4.117023313353264, "grad_norm": 1.21844482421875, "learning_rate": 3.931547287877102e-06, "loss": 0.0499, "step": 279550 }, { "epoch": 4.117391496443352, "grad_norm": 1.4345084428787231, "learning_rate": 3.929910915910118e-06, "loss": 0.0498, "step": 279575 }, { "epoch": 4.117759679533439, "grad_norm": 1.4880101680755615, "learning_rate": 3.928274543943133e-06, "loss": 0.0472, "step": 279600 }, { "epoch": 4.118127862623526, "grad_norm": 1.296320915222168, "learning_rate": 3.926638171976149e-06, "loss": 0.0511, "step": 279625 }, { "epoch": 4.118496045713613, "grad_norm": 1.3764183521270752, "learning_rate": 3.9250018000091635e-06, "loss": 0.0473, "step": 279650 }, { "epoch": 4.1188642288037, "grad_norm": 1.530476450920105, "learning_rate": 3.923430882920859e-06, "loss": 0.0589, "step": 279675 }, { "epoch": 4.119232411893787, "grad_norm": 1.2778176069259644, "learning_rate": 3.921794510953874e-06, "loss": 0.05, "step": 279700 }, { "epoch": 4.119600594983874, "grad_norm": 1.380972146987915, "learning_rate": 3.92015813898689e-06, "loss": 0.0483, "step": 279725 }, { "epoch": 4.119968778073961, "grad_norm": 1.468732476234436, "learning_rate": 3.918521767019905e-06, "loss": 0.0538, "step": 279750 }, { "epoch": 4.120336961164048, "grad_norm": 1.6027483940124512, "learning_rate": 3.91688539505292e-06, "loss": 0.0546, "step": 279775 }, { "epoch": 4.120705144254135, "grad_norm": 0.9671803712844849, "learning_rate": 3.915249023085936e-06, "loss": 0.0496, "step": 279800 }, { "epoch": 4.121073327344222, "grad_norm": 1.381050705909729, "learning_rate": 3.913612651118952e-06, "loss": 0.0454, "step": 279825 }, { "epoch": 4.121441510434309, "grad_norm": 0.8141024708747864, "learning_rate": 3.911976279151967e-06, "loss": 0.0513, "step": 279850 }, { "epoch": 4.121809693524396, "grad_norm": 1.382088303565979, "learning_rate": 3.910339907184982e-06, "loss": 0.0521, "step": 279875 }, { "epoch": 4.122177876614483, "grad_norm": 1.3073772192001343, "learning_rate": 3.908703535217998e-06, "loss": 0.0542, "step": 279900 }, { "epoch": 4.12254605970457, "grad_norm": 1.2595528364181519, "learning_rate": 3.907067163251013e-06, "loss": 0.0564, "step": 279925 }, { "epoch": 4.122914242794657, "grad_norm": 0.7806668877601624, "learning_rate": 3.905430791284029e-06, "loss": 0.0516, "step": 279950 }, { "epoch": 4.123282425884744, "grad_norm": 1.7917717695236206, "learning_rate": 3.903794419317044e-06, "loss": 0.0454, "step": 279975 }, { "epoch": 4.123650608974831, "grad_norm": 0.6870917081832886, "learning_rate": 3.90215804735006e-06, "loss": 0.058, "step": 280000 }, { "epoch": 4.124018792064918, "grad_norm": 1.9068914651870728, "learning_rate": 3.900521675383075e-06, "loss": 0.0508, "step": 280025 }, { "epoch": 4.124386975155005, "grad_norm": 1.4279999732971191, "learning_rate": 3.89888530341609e-06, "loss": 0.051, "step": 280050 }, { "epoch": 4.124755158245092, "grad_norm": 1.2259025573730469, "learning_rate": 3.897248931449106e-06, "loss": 0.0484, "step": 280075 }, { "epoch": 4.125123341335179, "grad_norm": 1.1658406257629395, "learning_rate": 3.895612559482122e-06, "loss": 0.0487, "step": 280100 }, { "epoch": 4.125491524425266, "grad_norm": 1.4479068517684937, "learning_rate": 3.893976187515136e-06, "loss": 0.0539, "step": 280125 }, { "epoch": 4.125859707515353, "grad_norm": 1.5897122621536255, "learning_rate": 3.892339815548152e-06, "loss": 0.0564, "step": 280150 }, { "epoch": 4.12622789060544, "grad_norm": 1.1869397163391113, "learning_rate": 3.890703443581168e-06, "loss": 0.052, "step": 280175 }, { "epoch": 4.126596073695527, "grad_norm": 1.2326768636703491, "learning_rate": 3.889067071614183e-06, "loss": 0.0488, "step": 280200 }, { "epoch": 4.126964256785614, "grad_norm": 1.4007612466812134, "learning_rate": 3.887430699647198e-06, "loss": 0.0512, "step": 280225 }, { "epoch": 4.127332439875701, "grad_norm": 1.311048150062561, "learning_rate": 3.885794327680214e-06, "loss": 0.0575, "step": 280250 }, { "epoch": 4.127700622965788, "grad_norm": 1.2580080032348633, "learning_rate": 3.884157955713229e-06, "loss": 0.0552, "step": 280275 }, { "epoch": 4.128068806055875, "grad_norm": 1.7356899976730347, "learning_rate": 3.882521583746245e-06, "loss": 0.0478, "step": 280300 }, { "epoch": 4.128436989145962, "grad_norm": 0.8031937479972839, "learning_rate": 3.88088521177926e-06, "loss": 0.0539, "step": 280325 }, { "epoch": 4.128805172236049, "grad_norm": 1.1134426593780518, "learning_rate": 3.879248839812276e-06, "loss": 0.0445, "step": 280350 }, { "epoch": 4.129173355326136, "grad_norm": 1.4369693994522095, "learning_rate": 3.877612467845292e-06, "loss": 0.0482, "step": 280375 }, { "epoch": 4.129541538416223, "grad_norm": 1.549274206161499, "learning_rate": 3.875976095878306e-06, "loss": 0.047, "step": 280400 }, { "epoch": 4.12990972150631, "grad_norm": 1.5910199880599976, "learning_rate": 3.8743397239113224e-06, "loss": 0.0522, "step": 280425 }, { "epoch": 4.130277904596397, "grad_norm": 1.6414375305175781, "learning_rate": 3.872703351944338e-06, "loss": 0.0538, "step": 280450 }, { "epoch": 4.130646087686484, "grad_norm": 1.5115987062454224, "learning_rate": 3.871066979977353e-06, "loss": 0.0464, "step": 280475 }, { "epoch": 4.131014270776571, "grad_norm": 1.0115933418273926, "learning_rate": 3.869430608010368e-06, "loss": 0.0526, "step": 280500 }, { "epoch": 4.131382453866658, "grad_norm": 1.1249902248382568, "learning_rate": 3.867794236043384e-06, "loss": 0.0546, "step": 280525 }, { "epoch": 4.131750636956746, "grad_norm": 0.9938558340072632, "learning_rate": 3.866157864076399e-06, "loss": 0.0603, "step": 280550 }, { "epoch": 4.132118820046833, "grad_norm": 1.1485445499420166, "learning_rate": 3.864521492109415e-06, "loss": 0.0466, "step": 280575 }, { "epoch": 4.13248700313692, "grad_norm": 1.3947577476501465, "learning_rate": 3.86288512014243e-06, "loss": 0.0562, "step": 280600 }, { "epoch": 4.132855186227007, "grad_norm": 1.539109468460083, "learning_rate": 3.861248748175446e-06, "loss": 0.0477, "step": 280625 }, { "epoch": 4.133223369317094, "grad_norm": 1.8791552782058716, "learning_rate": 3.859612376208461e-06, "loss": 0.0525, "step": 280650 }, { "epoch": 4.133591552407181, "grad_norm": 1.3559869527816772, "learning_rate": 3.857976004241476e-06, "loss": 0.0536, "step": 280675 }, { "epoch": 4.133959735497268, "grad_norm": 1.8116252422332764, "learning_rate": 3.856339632274492e-06, "loss": 0.0521, "step": 280700 }, { "epoch": 4.134327918587355, "grad_norm": 1.504961371421814, "learning_rate": 3.854703260307508e-06, "loss": 0.0488, "step": 280725 }, { "epoch": 4.134696101677442, "grad_norm": 1.4178041219711304, "learning_rate": 3.853066888340522e-06, "loss": 0.0557, "step": 280750 }, { "epoch": 4.135064284767529, "grad_norm": 1.3786544799804688, "learning_rate": 3.851430516373538e-06, "loss": 0.0477, "step": 280775 }, { "epoch": 4.135432467857616, "grad_norm": 0.9783445000648499, "learning_rate": 3.849794144406554e-06, "loss": 0.0527, "step": 280800 }, { "epoch": 4.135800650947703, "grad_norm": 1.117689609527588, "learning_rate": 3.848157772439569e-06, "loss": 0.0536, "step": 280825 }, { "epoch": 4.1361688340377905, "grad_norm": 1.316820740699768, "learning_rate": 3.846521400472584e-06, "loss": 0.0487, "step": 280850 }, { "epoch": 4.1365370171278775, "grad_norm": 1.0695276260375977, "learning_rate": 3.8448850285056e-06, "loss": 0.0513, "step": 280875 }, { "epoch": 4.1369052002179645, "grad_norm": 0.7669237852096558, "learning_rate": 3.843248656538616e-06, "loss": 0.0465, "step": 280900 }, { "epoch": 4.1372733833080515, "grad_norm": 1.5514988899230957, "learning_rate": 3.841612284571631e-06, "loss": 0.0475, "step": 280925 }, { "epoch": 4.1376415663981385, "grad_norm": 1.8899503946304321, "learning_rate": 3.839975912604646e-06, "loss": 0.0518, "step": 280950 }, { "epoch": 4.1380097494882255, "grad_norm": 1.3596923351287842, "learning_rate": 3.838339540637662e-06, "loss": 0.0535, "step": 280975 }, { "epoch": 4.1383779325783125, "grad_norm": 1.0282924175262451, "learning_rate": 3.836703168670677e-06, "loss": 0.0504, "step": 281000 }, { "epoch": 4.1387461156684, "grad_norm": 0.9863817095756531, "learning_rate": 3.835066796703692e-06, "loss": 0.0546, "step": 281025 }, { "epoch": 4.139114298758487, "grad_norm": 1.4705183506011963, "learning_rate": 3.8334304247367084e-06, "loss": 0.0484, "step": 281050 }, { "epoch": 4.139482481848574, "grad_norm": 1.3739672899246216, "learning_rate": 3.831794052769724e-06, "loss": 0.0481, "step": 281075 }, { "epoch": 4.139850664938661, "grad_norm": 1.5501339435577393, "learning_rate": 3.830157680802739e-06, "loss": 0.056, "step": 281100 }, { "epoch": 4.140218848028748, "grad_norm": 1.1224405765533447, "learning_rate": 3.828521308835754e-06, "loss": 0.056, "step": 281125 }, { "epoch": 4.140587031118835, "grad_norm": 1.4552491903305054, "learning_rate": 3.82688493686877e-06, "loss": 0.0528, "step": 281150 }, { "epoch": 4.140955214208922, "grad_norm": 1.668328881263733, "learning_rate": 3.825248564901785e-06, "loss": 0.0459, "step": 281175 }, { "epoch": 4.141323397299009, "grad_norm": 1.903546690940857, "learning_rate": 3.823612192934801e-06, "loss": 0.0577, "step": 281200 }, { "epoch": 4.141691580389096, "grad_norm": 1.1953068971633911, "learning_rate": 3.8219758209678156e-06, "loss": 0.048, "step": 281225 }, { "epoch": 4.142059763479183, "grad_norm": 1.5615509748458862, "learning_rate": 3.820339449000832e-06, "loss": 0.0507, "step": 281250 }, { "epoch": 4.14242794656927, "grad_norm": 1.4437997341156006, "learning_rate": 3.818703077033847e-06, "loss": 0.0469, "step": 281275 }, { "epoch": 4.142796129659357, "grad_norm": 1.070771336555481, "learning_rate": 3.817066705066862e-06, "loss": 0.0509, "step": 281300 }, { "epoch": 4.143164312749444, "grad_norm": 1.5310577154159546, "learning_rate": 3.815430333099878e-06, "loss": 0.0474, "step": 281325 }, { "epoch": 4.143532495839531, "grad_norm": 0.6705294847488403, "learning_rate": 3.8137939611328934e-06, "loss": 0.0467, "step": 281350 }, { "epoch": 4.143900678929618, "grad_norm": 1.2362991571426392, "learning_rate": 3.812157589165909e-06, "loss": 0.0566, "step": 281375 }, { "epoch": 4.144268862019705, "grad_norm": 0.9386043548583984, "learning_rate": 3.8105212171989244e-06, "loss": 0.0529, "step": 281400 }, { "epoch": 4.144637045109792, "grad_norm": 1.6510556936264038, "learning_rate": 3.80888484523194e-06, "loss": 0.0514, "step": 281425 }, { "epoch": 4.145005228199879, "grad_norm": 1.139722466468811, "learning_rate": 3.807248473264955e-06, "loss": 0.054, "step": 281450 }, { "epoch": 4.145373411289967, "grad_norm": 1.3769398927688599, "learning_rate": 3.8056121012979707e-06, "loss": 0.0584, "step": 281475 }, { "epoch": 4.145741594380054, "grad_norm": 1.5605382919311523, "learning_rate": 3.803975729330986e-06, "loss": 0.0593, "step": 281500 }, { "epoch": 4.146109777470141, "grad_norm": 0.9440943002700806, "learning_rate": 3.8023393573640018e-06, "loss": 0.0458, "step": 281525 }, { "epoch": 4.146477960560228, "grad_norm": 0.9544996619224548, "learning_rate": 3.8007029853970166e-06, "loss": 0.0498, "step": 281550 }, { "epoch": 4.146846143650315, "grad_norm": 1.5641615390777588, "learning_rate": 3.7990666134300324e-06, "loss": 0.0515, "step": 281575 }, { "epoch": 4.147214326740402, "grad_norm": 1.2624694108963013, "learning_rate": 3.7974302414630477e-06, "loss": 0.0474, "step": 281600 }, { "epoch": 4.147582509830489, "grad_norm": 1.328643798828125, "learning_rate": 3.7957938694960634e-06, "loss": 0.0571, "step": 281625 }, { "epoch": 4.147950692920576, "grad_norm": 1.238291621208191, "learning_rate": 3.7941574975290783e-06, "loss": 0.0466, "step": 281650 }, { "epoch": 4.148318876010663, "grad_norm": 1.3468068838119507, "learning_rate": 3.792521125562094e-06, "loss": 0.05, "step": 281675 }, { "epoch": 4.14868705910075, "grad_norm": 1.4807982444763184, "learning_rate": 3.7909502084737887e-06, "loss": 0.048, "step": 281700 }, { "epoch": 4.149055242190837, "grad_norm": 1.6809862852096558, "learning_rate": 3.7893138365068045e-06, "loss": 0.0453, "step": 281725 }, { "epoch": 4.149423425280924, "grad_norm": 0.9660292267799377, "learning_rate": 3.7876774645398193e-06, "loss": 0.0534, "step": 281750 }, { "epoch": 4.149791608371011, "grad_norm": 1.4939463138580322, "learning_rate": 3.786041092572835e-06, "loss": 0.0493, "step": 281775 }, { "epoch": 4.150159791461098, "grad_norm": 1.056085467338562, "learning_rate": 3.7844047206058504e-06, "loss": 0.0547, "step": 281800 }, { "epoch": 4.150527974551185, "grad_norm": 1.0270249843597412, "learning_rate": 3.782768348638866e-06, "loss": 0.0548, "step": 281825 }, { "epoch": 4.150896157641272, "grad_norm": 0.9781486988067627, "learning_rate": 3.781131976671882e-06, "loss": 0.051, "step": 281850 }, { "epoch": 4.151264340731359, "grad_norm": 1.783305287361145, "learning_rate": 3.779495604704897e-06, "loss": 0.0494, "step": 281875 }, { "epoch": 4.151632523821446, "grad_norm": 1.3520704507827759, "learning_rate": 3.777859232737913e-06, "loss": 0.0534, "step": 281900 }, { "epoch": 4.152000706911533, "grad_norm": 1.6302826404571533, "learning_rate": 3.7762228607709277e-06, "loss": 0.0593, "step": 281925 }, { "epoch": 4.15236889000162, "grad_norm": 1.2334368228912354, "learning_rate": 3.7745864888039435e-06, "loss": 0.0515, "step": 281950 }, { "epoch": 4.152737073091707, "grad_norm": 1.8050910234451294, "learning_rate": 3.7729501168369588e-06, "loss": 0.0606, "step": 281975 }, { "epoch": 4.153105256181794, "grad_norm": 1.1785519123077393, "learning_rate": 3.7713137448699745e-06, "loss": 0.0569, "step": 282000 }, { "epoch": 4.153473439271881, "grad_norm": 1.6257681846618652, "learning_rate": 3.7696773729029894e-06, "loss": 0.053, "step": 282025 }, { "epoch": 4.153841622361968, "grad_norm": 1.4788684844970703, "learning_rate": 3.768041000936005e-06, "loss": 0.053, "step": 282050 }, { "epoch": 4.154209805452055, "grad_norm": 1.4266949892044067, "learning_rate": 3.7664046289690204e-06, "loss": 0.0541, "step": 282075 }, { "epoch": 4.154577988542142, "grad_norm": 1.146416425704956, "learning_rate": 3.764768257002036e-06, "loss": 0.0556, "step": 282100 }, { "epoch": 4.154946171632229, "grad_norm": 1.081526756286621, "learning_rate": 3.763131885035051e-06, "loss": 0.044, "step": 282125 }, { "epoch": 4.155314354722316, "grad_norm": 1.3343247175216675, "learning_rate": 3.7614955130680668e-06, "loss": 0.0517, "step": 282150 }, { "epoch": 4.155682537812403, "grad_norm": 0.8478697538375854, "learning_rate": 3.7599245959797615e-06, "loss": 0.0556, "step": 282175 }, { "epoch": 4.15605072090249, "grad_norm": 1.4555424451828003, "learning_rate": 3.758288224012777e-06, "loss": 0.0588, "step": 282200 }, { "epoch": 4.156418903992577, "grad_norm": 1.3658803701400757, "learning_rate": 3.7566518520457925e-06, "loss": 0.0528, "step": 282225 }, { "epoch": 4.156787087082664, "grad_norm": 1.5523141622543335, "learning_rate": 3.755015480078808e-06, "loss": 0.05, "step": 282250 }, { "epoch": 4.157155270172751, "grad_norm": 1.4585939645767212, "learning_rate": 3.753379108111823e-06, "loss": 0.0588, "step": 282275 }, { "epoch": 4.157523453262838, "grad_norm": 1.540112018585205, "learning_rate": 3.751742736144839e-06, "loss": 0.0543, "step": 282300 }, { "epoch": 4.157891636352925, "grad_norm": 1.4808543920516968, "learning_rate": 3.7501063641778546e-06, "loss": 0.0517, "step": 282325 }, { "epoch": 4.158259819443012, "grad_norm": 1.9939229488372803, "learning_rate": 3.74846999221087e-06, "loss": 0.0483, "step": 282350 }, { "epoch": 4.158628002533099, "grad_norm": 1.9306612014770508, "learning_rate": 3.7468336202438856e-06, "loss": 0.0544, "step": 282375 }, { "epoch": 4.158996185623186, "grad_norm": 0.961968183517456, "learning_rate": 3.7451972482769005e-06, "loss": 0.0463, "step": 282400 }, { "epoch": 4.159364368713273, "grad_norm": 1.0841031074523926, "learning_rate": 3.7435608763099162e-06, "loss": 0.055, "step": 282425 }, { "epoch": 4.15973255180336, "grad_norm": 1.4258852005004883, "learning_rate": 3.7419245043429315e-06, "loss": 0.0507, "step": 282450 }, { "epoch": 4.160100734893448, "grad_norm": 0.4652763307094574, "learning_rate": 3.7402881323759473e-06, "loss": 0.0538, "step": 282475 }, { "epoch": 4.160468917983535, "grad_norm": 1.4698705673217773, "learning_rate": 3.738651760408962e-06, "loss": 0.0444, "step": 282500 }, { "epoch": 4.160837101073622, "grad_norm": 0.6853455901145935, "learning_rate": 3.737015388441978e-06, "loss": 0.047, "step": 282525 }, { "epoch": 4.161205284163709, "grad_norm": 1.4518296718597412, "learning_rate": 3.735379016474993e-06, "loss": 0.0554, "step": 282550 }, { "epoch": 4.161573467253796, "grad_norm": 1.57321035861969, "learning_rate": 3.733742644508009e-06, "loss": 0.0568, "step": 282575 }, { "epoch": 4.161941650343883, "grad_norm": 1.0352312326431274, "learning_rate": 3.7321062725410238e-06, "loss": 0.0507, "step": 282600 }, { "epoch": 4.16230983343397, "grad_norm": 1.6174521446228027, "learning_rate": 3.7304699005740395e-06, "loss": 0.0541, "step": 282625 }, { "epoch": 4.162678016524057, "grad_norm": 1.185634732246399, "learning_rate": 3.728833528607055e-06, "loss": 0.0472, "step": 282650 }, { "epoch": 4.163046199614144, "grad_norm": 1.0461548566818237, "learning_rate": 3.7271971566400705e-06, "loss": 0.0491, "step": 282675 }, { "epoch": 4.163414382704231, "grad_norm": 2.0412161350250244, "learning_rate": 3.725560784673086e-06, "loss": 0.0483, "step": 282700 }, { "epoch": 4.163782565794318, "grad_norm": 1.139068603515625, "learning_rate": 3.7239244127061016e-06, "loss": 0.0546, "step": 282725 }, { "epoch": 4.164150748884405, "grad_norm": 1.1163042783737183, "learning_rate": 3.7222880407391164e-06, "loss": 0.0493, "step": 282750 }, { "epoch": 4.164518931974492, "grad_norm": 1.081230640411377, "learning_rate": 3.720651668772132e-06, "loss": 0.0481, "step": 282775 }, { "epoch": 4.164887115064579, "grad_norm": 1.8118054866790771, "learning_rate": 3.719015296805148e-06, "loss": 0.0491, "step": 282800 }, { "epoch": 4.165255298154666, "grad_norm": 1.4853826761245728, "learning_rate": 3.717378924838163e-06, "loss": 0.0468, "step": 282825 }, { "epoch": 4.1656234812447535, "grad_norm": 1.6419068574905396, "learning_rate": 3.715742552871179e-06, "loss": 0.0491, "step": 282850 }, { "epoch": 4.1659916643348405, "grad_norm": 1.3722608089447021, "learning_rate": 3.714106180904194e-06, "loss": 0.0527, "step": 282875 }, { "epoch": 4.1663598474249275, "grad_norm": 0.9031792879104614, "learning_rate": 3.7124698089372095e-06, "loss": 0.0489, "step": 282900 }, { "epoch": 4.1667280305150145, "grad_norm": 1.5830729007720947, "learning_rate": 3.710833436970225e-06, "loss": 0.0519, "step": 282925 }, { "epoch": 4.1670962136051015, "grad_norm": 1.047120451927185, "learning_rate": 3.7091970650032406e-06, "loss": 0.0441, "step": 282950 }, { "epoch": 4.1674643966951885, "grad_norm": 0.8555548787117004, "learning_rate": 3.7075606930362555e-06, "loss": 0.0478, "step": 282975 }, { "epoch": 4.1678325797852755, "grad_norm": 1.1296738386154175, "learning_rate": 3.705924321069271e-06, "loss": 0.0483, "step": 283000 }, { "epoch": 4.168200762875363, "grad_norm": 1.3878998756408691, "learning_rate": 3.7042879491022865e-06, "loss": 0.0553, "step": 283025 }, { "epoch": 4.16856894596545, "grad_norm": 1.4197288751602173, "learning_rate": 3.7026515771353022e-06, "loss": 0.0536, "step": 283050 }, { "epoch": 4.168937129055537, "grad_norm": 1.2826813459396362, "learning_rate": 3.7010152051683175e-06, "loss": 0.0496, "step": 283075 }, { "epoch": 4.169305312145624, "grad_norm": 1.5160043239593506, "learning_rate": 3.6993788332013333e-06, "loss": 0.0534, "step": 283100 }, { "epoch": 4.169673495235711, "grad_norm": 1.5410345792770386, "learning_rate": 3.697742461234348e-06, "loss": 0.0488, "step": 283125 }, { "epoch": 4.170041678325798, "grad_norm": 1.1372733116149902, "learning_rate": 3.696106089267364e-06, "loss": 0.0434, "step": 283150 }, { "epoch": 4.170409861415885, "grad_norm": 1.4371058940887451, "learning_rate": 3.694469717300379e-06, "loss": 0.0554, "step": 283175 }, { "epoch": 4.170778044505972, "grad_norm": 1.4943219423294067, "learning_rate": 3.692833345333395e-06, "loss": 0.0533, "step": 283200 }, { "epoch": 4.171146227596059, "grad_norm": 1.2485432624816895, "learning_rate": 3.6911969733664098e-06, "loss": 0.0548, "step": 283225 }, { "epoch": 4.171514410686146, "grad_norm": 1.5465879440307617, "learning_rate": 3.6895606013994255e-06, "loss": 0.0497, "step": 283250 }, { "epoch": 4.171882593776233, "grad_norm": 0.9757363200187683, "learning_rate": 3.687924229432441e-06, "loss": 0.0489, "step": 283275 }, { "epoch": 4.17225077686632, "grad_norm": 1.5635976791381836, "learning_rate": 3.6862878574654565e-06, "loss": 0.0562, "step": 283300 }, { "epoch": 4.172618959956407, "grad_norm": 1.2802245616912842, "learning_rate": 3.6846514854984723e-06, "loss": 0.0537, "step": 283325 }, { "epoch": 4.172987143046494, "grad_norm": 1.107739806175232, "learning_rate": 3.683015113531487e-06, "loss": 0.0481, "step": 283350 }, { "epoch": 4.173355326136581, "grad_norm": 1.1620582342147827, "learning_rate": 3.681378741564503e-06, "loss": 0.0512, "step": 283375 }, { "epoch": 4.173723509226669, "grad_norm": 1.1617927551269531, "learning_rate": 3.679742369597518e-06, "loss": 0.0483, "step": 283400 }, { "epoch": 4.174091692316756, "grad_norm": 1.037222981452942, "learning_rate": 3.678105997630534e-06, "loss": 0.0521, "step": 283425 }, { "epoch": 4.174459875406843, "grad_norm": 1.8881289958953857, "learning_rate": 3.676469625663549e-06, "loss": 0.0548, "step": 283450 }, { "epoch": 4.17482805849693, "grad_norm": 0.6663896441459656, "learning_rate": 3.674833253696565e-06, "loss": 0.0434, "step": 283475 }, { "epoch": 4.175196241587017, "grad_norm": 2.1065988540649414, "learning_rate": 3.67319688172958e-06, "loss": 0.0535, "step": 283500 }, { "epoch": 4.175564424677104, "grad_norm": 1.6611994504928589, "learning_rate": 3.6715605097625955e-06, "loss": 0.0504, "step": 283525 }, { "epoch": 4.175932607767191, "grad_norm": 1.2438486814498901, "learning_rate": 3.669924137795611e-06, "loss": 0.0539, "step": 283550 }, { "epoch": 4.176300790857278, "grad_norm": 1.3907822370529175, "learning_rate": 3.6682877658286266e-06, "loss": 0.0517, "step": 283575 }, { "epoch": 4.176668973947365, "grad_norm": 1.1284421682357788, "learning_rate": 3.6666513938616415e-06, "loss": 0.0522, "step": 283600 }, { "epoch": 4.177037157037452, "grad_norm": 1.4884239435195923, "learning_rate": 3.665015021894657e-06, "loss": 0.0595, "step": 283625 }, { "epoch": 4.177405340127539, "grad_norm": 1.5920871496200562, "learning_rate": 3.6633786499276725e-06, "loss": 0.0522, "step": 283650 }, { "epoch": 4.177773523217626, "grad_norm": 1.177476406097412, "learning_rate": 3.6617422779606882e-06, "loss": 0.0556, "step": 283675 }, { "epoch": 4.178141706307713, "grad_norm": 0.9358722567558289, "learning_rate": 3.660105905993703e-06, "loss": 0.0521, "step": 283700 }, { "epoch": 4.1785098893978, "grad_norm": 1.1437467336654663, "learning_rate": 3.658469534026719e-06, "loss": 0.0509, "step": 283725 }, { "epoch": 4.178878072487887, "grad_norm": 0.9306302666664124, "learning_rate": 3.656833162059734e-06, "loss": 0.0546, "step": 283750 }, { "epoch": 4.179246255577974, "grad_norm": 1.3671047687530518, "learning_rate": 3.65519679009275e-06, "loss": 0.0467, "step": 283775 }, { "epoch": 4.179614438668061, "grad_norm": 1.409212350845337, "learning_rate": 3.6535604181257656e-06, "loss": 0.0466, "step": 283800 }, { "epoch": 4.179982621758148, "grad_norm": 1.377893090248108, "learning_rate": 3.651924046158781e-06, "loss": 0.0567, "step": 283825 }, { "epoch": 4.180350804848235, "grad_norm": 1.5031005144119263, "learning_rate": 3.6502876741917966e-06, "loss": 0.0531, "step": 283850 }, { "epoch": 4.180718987938322, "grad_norm": 1.449777603149414, "learning_rate": 3.6486513022248115e-06, "loss": 0.0527, "step": 283875 }, { "epoch": 4.181087171028409, "grad_norm": 1.289481282234192, "learning_rate": 3.6470149302578272e-06, "loss": 0.0471, "step": 283900 }, { "epoch": 4.181455354118496, "grad_norm": 0.9766039252281189, "learning_rate": 3.6453785582908425e-06, "loss": 0.0474, "step": 283925 }, { "epoch": 4.181823537208583, "grad_norm": 1.397099494934082, "learning_rate": 3.6437421863238583e-06, "loss": 0.0487, "step": 283950 }, { "epoch": 4.18219172029867, "grad_norm": 1.3325046300888062, "learning_rate": 3.642105814356873e-06, "loss": 0.0547, "step": 283975 }, { "epoch": 4.182559903388757, "grad_norm": 0.7550286650657654, "learning_rate": 3.640469442389889e-06, "loss": 0.0505, "step": 284000 }, { "epoch": 4.182928086478844, "grad_norm": 1.2550404071807861, "learning_rate": 3.638833070422904e-06, "loss": 0.0571, "step": 284025 }, { "epoch": 4.183296269568931, "grad_norm": 1.4222699403762817, "learning_rate": 3.63719669845592e-06, "loss": 0.0497, "step": 284050 }, { "epoch": 4.183664452659018, "grad_norm": 1.6280872821807861, "learning_rate": 3.6355603264889348e-06, "loss": 0.054, "step": 284075 }, { "epoch": 4.184032635749105, "grad_norm": 0.9154566526412964, "learning_rate": 3.6339239545219505e-06, "loss": 0.0521, "step": 284100 }, { "epoch": 4.184400818839192, "grad_norm": 1.3835906982421875, "learning_rate": 3.632287582554966e-06, "loss": 0.055, "step": 284125 }, { "epoch": 4.184769001929279, "grad_norm": 1.424846887588501, "learning_rate": 3.6306512105879815e-06, "loss": 0.0504, "step": 284150 }, { "epoch": 4.185137185019366, "grad_norm": 1.2080756425857544, "learning_rate": 3.629014838620997e-06, "loss": 0.0547, "step": 284175 }, { "epoch": 4.185505368109453, "grad_norm": 1.2325810194015503, "learning_rate": 3.6273784666540126e-06, "loss": 0.0459, "step": 284200 }, { "epoch": 4.18587355119954, "grad_norm": 1.4504190683364868, "learning_rate": 3.6257420946870275e-06, "loss": 0.0446, "step": 284225 }, { "epoch": 4.186241734289627, "grad_norm": 0.8496175408363342, "learning_rate": 3.624105722720043e-06, "loss": 0.051, "step": 284250 }, { "epoch": 4.186609917379714, "grad_norm": 1.655601978302002, "learning_rate": 3.622469350753059e-06, "loss": 0.0564, "step": 284275 }, { "epoch": 4.186978100469801, "grad_norm": 1.0685817003250122, "learning_rate": 3.6208329787860742e-06, "loss": 0.0496, "step": 284300 }, { "epoch": 4.187346283559888, "grad_norm": 1.3044464588165283, "learning_rate": 3.61919660681909e-06, "loss": 0.0498, "step": 284325 }, { "epoch": 4.187714466649975, "grad_norm": 1.1293905973434448, "learning_rate": 3.617560234852105e-06, "loss": 0.0518, "step": 284350 }, { "epoch": 4.188082649740063, "grad_norm": 1.2364485263824463, "learning_rate": 3.6159893177638e-06, "loss": 0.049, "step": 284375 }, { "epoch": 4.18845083283015, "grad_norm": 1.1875444650650024, "learning_rate": 3.6143529457968153e-06, "loss": 0.0555, "step": 284400 }, { "epoch": 4.188819015920237, "grad_norm": 1.6219980716705322, "learning_rate": 3.612716573829831e-06, "loss": 0.0468, "step": 284425 }, { "epoch": 4.189187199010324, "grad_norm": 1.4333478212356567, "learning_rate": 3.611080201862846e-06, "loss": 0.0527, "step": 284450 }, { "epoch": 4.189555382100411, "grad_norm": 1.323794960975647, "learning_rate": 3.6094438298958616e-06, "loss": 0.0519, "step": 284475 }, { "epoch": 4.189923565190498, "grad_norm": 1.1227856874465942, "learning_rate": 3.607807457928877e-06, "loss": 0.0518, "step": 284500 }, { "epoch": 4.190291748280585, "grad_norm": 1.4547703266143799, "learning_rate": 3.6061710859618926e-06, "loss": 0.052, "step": 284525 }, { "epoch": 4.190659931370672, "grad_norm": 2.2386605739593506, "learning_rate": 3.6045347139949075e-06, "loss": 0.0551, "step": 284550 }, { "epoch": 4.191028114460759, "grad_norm": 1.3500272035598755, "learning_rate": 3.6028983420279233e-06, "loss": 0.0535, "step": 284575 }, { "epoch": 4.191396297550846, "grad_norm": 1.15129816532135, "learning_rate": 3.6012619700609386e-06, "loss": 0.0556, "step": 284600 }, { "epoch": 4.191764480640933, "grad_norm": 1.3669862747192383, "learning_rate": 3.5996255980939543e-06, "loss": 0.0541, "step": 284625 }, { "epoch": 4.19213266373102, "grad_norm": 1.4275031089782715, "learning_rate": 3.5979892261269696e-06, "loss": 0.0509, "step": 284650 }, { "epoch": 4.192500846821107, "grad_norm": 1.125692367553711, "learning_rate": 3.5963528541599853e-06, "loss": 0.0494, "step": 284675 }, { "epoch": 4.192869029911194, "grad_norm": 1.472734808921814, "learning_rate": 3.594716482193e-06, "loss": 0.0452, "step": 284700 }, { "epoch": 4.193237213001281, "grad_norm": 1.5592925548553467, "learning_rate": 3.593080110226016e-06, "loss": 0.0498, "step": 284725 }, { "epoch": 4.193605396091368, "grad_norm": 1.3757771253585815, "learning_rate": 3.5914437382590317e-06, "loss": 0.054, "step": 284750 }, { "epoch": 4.193973579181455, "grad_norm": 1.4402618408203125, "learning_rate": 3.589807366292047e-06, "loss": 0.0445, "step": 284775 }, { "epoch": 4.194341762271542, "grad_norm": 1.5237786769866943, "learning_rate": 3.5881709943250627e-06, "loss": 0.0544, "step": 284800 }, { "epoch": 4.194709945361629, "grad_norm": 1.1614034175872803, "learning_rate": 3.5865346223580776e-06, "loss": 0.0479, "step": 284825 }, { "epoch": 4.1950781284517165, "grad_norm": 1.3960483074188232, "learning_rate": 3.5848982503910933e-06, "loss": 0.0549, "step": 284850 }, { "epoch": 4.1954463115418035, "grad_norm": 1.4039913415908813, "learning_rate": 3.5832618784241086e-06, "loss": 0.05, "step": 284875 }, { "epoch": 4.1958144946318905, "grad_norm": 0.9151574969291687, "learning_rate": 3.5816255064571243e-06, "loss": 0.05, "step": 284900 }, { "epoch": 4.1961826777219775, "grad_norm": 1.6494172811508179, "learning_rate": 3.5799891344901392e-06, "loss": 0.0517, "step": 284925 }, { "epoch": 4.1965508608120645, "grad_norm": 1.4494985342025757, "learning_rate": 3.578352762523155e-06, "loss": 0.0533, "step": 284950 }, { "epoch": 4.1969190439021515, "grad_norm": 0.9448962211608887, "learning_rate": 3.5767163905561702e-06, "loss": 0.0572, "step": 284975 }, { "epoch": 4.1972872269922386, "grad_norm": 1.1785589456558228, "learning_rate": 3.575080018589186e-06, "loss": 0.0555, "step": 285000 }, { "epoch": 4.197655410082326, "grad_norm": 1.3010982275009155, "learning_rate": 3.5734436466222013e-06, "loss": 0.0525, "step": 285025 }, { "epoch": 4.198023593172413, "grad_norm": 1.6826128959655762, "learning_rate": 3.571807274655217e-06, "loss": 0.0543, "step": 285050 }, { "epoch": 4.1983917762625, "grad_norm": 0.929847002029419, "learning_rate": 3.570170902688232e-06, "loss": 0.0522, "step": 285075 }, { "epoch": 4.198759959352587, "grad_norm": 1.49109947681427, "learning_rate": 3.5685345307212476e-06, "loss": 0.0489, "step": 285100 }, { "epoch": 4.199128142442674, "grad_norm": 0.9287909865379333, "learning_rate": 3.566898158754263e-06, "loss": 0.0509, "step": 285125 }, { "epoch": 4.199496325532761, "grad_norm": 1.263285517692566, "learning_rate": 3.5652617867872786e-06, "loss": 0.0587, "step": 285150 }, { "epoch": 4.199864508622848, "grad_norm": 0.84065181016922, "learning_rate": 3.5636254148202935e-06, "loss": 0.0575, "step": 285175 }, { "epoch": 4.200232691712935, "grad_norm": 1.635359525680542, "learning_rate": 3.5619890428533093e-06, "loss": 0.0481, "step": 285200 }, { "epoch": 4.200600874803022, "grad_norm": 1.077462077140808, "learning_rate": 3.560352670886325e-06, "loss": 0.0508, "step": 285225 }, { "epoch": 4.200969057893109, "grad_norm": 1.0449047088623047, "learning_rate": 3.5587162989193403e-06, "loss": 0.0507, "step": 285250 }, { "epoch": 4.201337240983196, "grad_norm": 1.7464070320129395, "learning_rate": 3.557079926952356e-06, "loss": 0.0468, "step": 285275 }, { "epoch": 4.201705424073283, "grad_norm": 1.5482630729675293, "learning_rate": 3.555443554985371e-06, "loss": 0.0554, "step": 285300 }, { "epoch": 4.202073607163371, "grad_norm": 1.3082911968231201, "learning_rate": 3.5538071830183866e-06, "loss": 0.0518, "step": 285325 }, { "epoch": 4.202441790253458, "grad_norm": 1.0178221464157104, "learning_rate": 3.552170811051402e-06, "loss": 0.0479, "step": 285350 }, { "epoch": 4.202809973343545, "grad_norm": 1.0560585260391235, "learning_rate": 3.5505344390844177e-06, "loss": 0.0512, "step": 285375 }, { "epoch": 4.203178156433632, "grad_norm": 1.4004467725753784, "learning_rate": 3.548898067117433e-06, "loss": 0.0557, "step": 285400 }, { "epoch": 4.203546339523719, "grad_norm": 1.5529333353042603, "learning_rate": 3.5472616951504487e-06, "loss": 0.0498, "step": 285425 }, { "epoch": 4.203914522613806, "grad_norm": 1.1342477798461914, "learning_rate": 3.5456253231834636e-06, "loss": 0.0541, "step": 285450 }, { "epoch": 4.204282705703893, "grad_norm": 1.4639195203781128, "learning_rate": 3.5439889512164793e-06, "loss": 0.0545, "step": 285475 }, { "epoch": 4.20465088879398, "grad_norm": 1.3467246294021606, "learning_rate": 3.5423525792494946e-06, "loss": 0.0494, "step": 285500 }, { "epoch": 4.205019071884067, "grad_norm": 1.3431875705718994, "learning_rate": 3.5407162072825103e-06, "loss": 0.0549, "step": 285525 }, { "epoch": 4.205387254974154, "grad_norm": 0.9523720741271973, "learning_rate": 3.5390798353155252e-06, "loss": 0.0509, "step": 285550 }, { "epoch": 4.205755438064241, "grad_norm": 1.1625728607177734, "learning_rate": 3.537443463348541e-06, "loss": 0.0514, "step": 285575 }, { "epoch": 4.206123621154328, "grad_norm": 1.586590051651001, "learning_rate": 3.5358070913815562e-06, "loss": 0.0492, "step": 285600 }, { "epoch": 4.206491804244415, "grad_norm": 1.7192764282226562, "learning_rate": 3.534170719414572e-06, "loss": 0.0527, "step": 285625 }, { "epoch": 4.206859987334502, "grad_norm": 0.8004681468009949, "learning_rate": 3.532534347447587e-06, "loss": 0.0478, "step": 285650 }, { "epoch": 4.207228170424589, "grad_norm": 0.976959228515625, "learning_rate": 3.5308979754806026e-06, "loss": 0.0536, "step": 285675 }, { "epoch": 4.207596353514676, "grad_norm": 1.2287458181381226, "learning_rate": 3.529261603513618e-06, "loss": 0.0519, "step": 285700 }, { "epoch": 4.207964536604763, "grad_norm": 1.6525061130523682, "learning_rate": 3.5276252315466336e-06, "loss": 0.0564, "step": 285725 }, { "epoch": 4.20833271969485, "grad_norm": 1.0059435367584229, "learning_rate": 3.5259888595796493e-06, "loss": 0.053, "step": 285750 }, { "epoch": 4.208700902784937, "grad_norm": 1.0858955383300781, "learning_rate": 3.5243524876126646e-06, "loss": 0.0458, "step": 285775 }, { "epoch": 4.209069085875024, "grad_norm": 0.9993805289268494, "learning_rate": 3.5227161156456804e-06, "loss": 0.0493, "step": 285800 }, { "epoch": 4.209437268965111, "grad_norm": 1.0551505088806152, "learning_rate": 3.5210797436786953e-06, "loss": 0.0514, "step": 285825 }, { "epoch": 4.209805452055198, "grad_norm": 0.9285268187522888, "learning_rate": 3.519443371711711e-06, "loss": 0.0492, "step": 285850 }, { "epoch": 4.210173635145285, "grad_norm": 0.6509037017822266, "learning_rate": 3.5178069997447263e-06, "loss": 0.0492, "step": 285875 }, { "epoch": 4.210541818235372, "grad_norm": 1.442220687866211, "learning_rate": 3.516170627777742e-06, "loss": 0.051, "step": 285900 }, { "epoch": 4.210910001325459, "grad_norm": 1.7208672761917114, "learning_rate": 3.514534255810757e-06, "loss": 0.0512, "step": 285925 }, { "epoch": 4.211278184415546, "grad_norm": 1.4775081872940063, "learning_rate": 3.5128978838437726e-06, "loss": 0.0509, "step": 285950 }, { "epoch": 4.211646367505633, "grad_norm": 1.3374617099761963, "learning_rate": 3.511261511876788e-06, "loss": 0.0586, "step": 285975 }, { "epoch": 4.21201455059572, "grad_norm": 1.54880690574646, "learning_rate": 3.5096251399098037e-06, "loss": 0.0491, "step": 286000 }, { "epoch": 4.212382733685807, "grad_norm": 1.2430872917175293, "learning_rate": 3.5079887679428185e-06, "loss": 0.0569, "step": 286025 }, { "epoch": 4.212750916775894, "grad_norm": 1.08426833152771, "learning_rate": 3.5063523959758343e-06, "loss": 0.0508, "step": 286050 }, { "epoch": 4.213119099865981, "grad_norm": 1.4624437093734741, "learning_rate": 3.5047160240088496e-06, "loss": 0.0487, "step": 286075 }, { "epoch": 4.213487282956068, "grad_norm": 1.0623691082000732, "learning_rate": 3.5030796520418653e-06, "loss": 0.053, "step": 286100 }, { "epoch": 4.213855466046155, "grad_norm": 1.3077738285064697, "learning_rate": 3.5014432800748806e-06, "loss": 0.0495, "step": 286125 }, { "epoch": 4.214223649136242, "grad_norm": 1.6256091594696045, "learning_rate": 3.4998069081078963e-06, "loss": 0.0486, "step": 286150 }, { "epoch": 4.214591832226329, "grad_norm": 1.4712008237838745, "learning_rate": 3.4981705361409112e-06, "loss": 0.047, "step": 286175 }, { "epoch": 4.214960015316416, "grad_norm": 1.390589952468872, "learning_rate": 3.496534164173927e-06, "loss": 0.0507, "step": 286200 }, { "epoch": 4.215328198406503, "grad_norm": 1.317811131477356, "learning_rate": 3.4948977922069427e-06, "loss": 0.0506, "step": 286225 }, { "epoch": 4.21569638149659, "grad_norm": 0.9666112065315247, "learning_rate": 3.493261420239958e-06, "loss": 0.0484, "step": 286250 }, { "epoch": 4.216064564586677, "grad_norm": 1.3020298480987549, "learning_rate": 3.4916250482729737e-06, "loss": 0.0521, "step": 286275 }, { "epoch": 4.216432747676765, "grad_norm": 1.4063552618026733, "learning_rate": 3.4899886763059886e-06, "loss": 0.0509, "step": 286300 }, { "epoch": 4.216800930766852, "grad_norm": 1.3392831087112427, "learning_rate": 3.4883523043390043e-06, "loss": 0.0502, "step": 286325 }, { "epoch": 4.217169113856939, "grad_norm": 1.3368010520935059, "learning_rate": 3.4867159323720196e-06, "loss": 0.0532, "step": 286350 }, { "epoch": 4.217537296947026, "grad_norm": 1.1430208683013916, "learning_rate": 3.4850795604050353e-06, "loss": 0.059, "step": 286375 }, { "epoch": 4.217905480037113, "grad_norm": 1.1475892066955566, "learning_rate": 3.4834431884380502e-06, "loss": 0.0501, "step": 286400 }, { "epoch": 4.2182736631272, "grad_norm": 1.3156911134719849, "learning_rate": 3.481806816471066e-06, "loss": 0.0544, "step": 286425 }, { "epoch": 4.218641846217287, "grad_norm": 1.2806178331375122, "learning_rate": 3.4801704445040813e-06, "loss": 0.054, "step": 286450 }, { "epoch": 4.219010029307374, "grad_norm": 1.2796999216079712, "learning_rate": 3.478534072537097e-06, "loss": 0.0483, "step": 286475 }, { "epoch": 4.219378212397461, "grad_norm": 0.8412948250770569, "learning_rate": 3.4768977005701123e-06, "loss": 0.05, "step": 286500 }, { "epoch": 4.219746395487548, "grad_norm": 1.1998164653778076, "learning_rate": 3.475261328603128e-06, "loss": 0.0564, "step": 286525 }, { "epoch": 4.220114578577635, "grad_norm": 1.5432463884353638, "learning_rate": 3.473624956636143e-06, "loss": 0.0571, "step": 286550 }, { "epoch": 4.220482761667722, "grad_norm": 1.593366265296936, "learning_rate": 3.4719885846691586e-06, "loss": 0.0594, "step": 286575 }, { "epoch": 4.220850944757809, "grad_norm": 1.1081656217575073, "learning_rate": 3.470352212702174e-06, "loss": 0.0482, "step": 286600 }, { "epoch": 4.221219127847896, "grad_norm": 1.377360463142395, "learning_rate": 3.4687158407351897e-06, "loss": 0.0547, "step": 286625 }, { "epoch": 4.221587310937983, "grad_norm": 1.5743542909622192, "learning_rate": 3.4670794687682045e-06, "loss": 0.0519, "step": 286650 }, { "epoch": 4.22195549402807, "grad_norm": 1.342956304550171, "learning_rate": 3.4654430968012203e-06, "loss": 0.0505, "step": 286675 }, { "epoch": 4.222323677118157, "grad_norm": 1.1712003946304321, "learning_rate": 3.463806724834236e-06, "loss": 0.0556, "step": 286700 }, { "epoch": 4.222691860208244, "grad_norm": 1.3126769065856934, "learning_rate": 3.4621703528672513e-06, "loss": 0.0515, "step": 286725 }, { "epoch": 4.223060043298331, "grad_norm": 1.6280109882354736, "learning_rate": 3.460533980900267e-06, "loss": 0.0562, "step": 286750 }, { "epoch": 4.223428226388418, "grad_norm": 1.3670576810836792, "learning_rate": 3.458897608933282e-06, "loss": 0.0525, "step": 286775 }, { "epoch": 4.223796409478505, "grad_norm": 1.4677999019622803, "learning_rate": 3.4572612369662976e-06, "loss": 0.0455, "step": 286800 }, { "epoch": 4.2241645925685924, "grad_norm": 1.2276616096496582, "learning_rate": 3.455624864999313e-06, "loss": 0.0477, "step": 286825 }, { "epoch": 4.2245327756586795, "grad_norm": 1.565685510635376, "learning_rate": 3.4539884930323287e-06, "loss": 0.0573, "step": 286850 }, { "epoch": 4.2249009587487665, "grad_norm": 1.4173812866210938, "learning_rate": 3.452352121065344e-06, "loss": 0.0501, "step": 286875 }, { "epoch": 4.2252691418388535, "grad_norm": 1.1952711343765259, "learning_rate": 3.4507157490983597e-06, "loss": 0.0485, "step": 286900 }, { "epoch": 4.2256373249289405, "grad_norm": 1.399548053741455, "learning_rate": 3.4490793771313746e-06, "loss": 0.0563, "step": 286925 }, { "epoch": 4.2260055080190275, "grad_norm": 1.5876163244247437, "learning_rate": 3.4474430051643903e-06, "loss": 0.0471, "step": 286950 }, { "epoch": 4.2263736911091145, "grad_norm": 1.351813793182373, "learning_rate": 3.4458066331974056e-06, "loss": 0.0533, "step": 286975 }, { "epoch": 4.2267418741992016, "grad_norm": 1.292729139328003, "learning_rate": 3.4441702612304213e-06, "loss": 0.0451, "step": 287000 }, { "epoch": 4.227110057289289, "grad_norm": 1.1002082824707031, "learning_rate": 3.4425338892634362e-06, "loss": 0.0495, "step": 287025 }, { "epoch": 4.227478240379376, "grad_norm": 1.2477836608886719, "learning_rate": 3.440897517296452e-06, "loss": 0.0536, "step": 287050 }, { "epoch": 4.227846423469463, "grad_norm": 0.9137217998504639, "learning_rate": 3.4392611453294673e-06, "loss": 0.0458, "step": 287075 }, { "epoch": 4.22821460655955, "grad_norm": 0.747760534286499, "learning_rate": 3.437624773362483e-06, "loss": 0.0497, "step": 287100 }, { "epoch": 4.228582789649637, "grad_norm": 1.180064082145691, "learning_rate": 3.435988401395498e-06, "loss": 0.0536, "step": 287125 }, { "epoch": 4.228950972739724, "grad_norm": 1.5068246126174927, "learning_rate": 3.4343520294285136e-06, "loss": 0.0498, "step": 287150 }, { "epoch": 4.229319155829811, "grad_norm": 1.101651668548584, "learning_rate": 3.432715657461529e-06, "loss": 0.0536, "step": 287175 }, { "epoch": 4.229687338919898, "grad_norm": 1.7168573141098022, "learning_rate": 3.431144740373224e-06, "loss": 0.0529, "step": 287200 }, { "epoch": 4.230055522009986, "grad_norm": 1.4192029237747192, "learning_rate": 3.4295083684062398e-06, "loss": 0.0465, "step": 287225 }, { "epoch": 4.230423705100073, "grad_norm": 1.5027098655700684, "learning_rate": 3.4278719964392547e-06, "loss": 0.0549, "step": 287250 }, { "epoch": 4.23079188819016, "grad_norm": 1.5143821239471436, "learning_rate": 3.4262356244722704e-06, "loss": 0.0535, "step": 287275 }, { "epoch": 4.231160071280247, "grad_norm": 1.3449275493621826, "learning_rate": 3.4245992525052857e-06, "loss": 0.047, "step": 287300 }, { "epoch": 4.231528254370334, "grad_norm": 1.2649414539337158, "learning_rate": 3.4229628805383014e-06, "loss": 0.0526, "step": 287325 }, { "epoch": 4.231896437460421, "grad_norm": 1.6541857719421387, "learning_rate": 3.4213265085713167e-06, "loss": 0.0535, "step": 287350 }, { "epoch": 4.232264620550508, "grad_norm": 1.4935722351074219, "learning_rate": 3.4196901366043324e-06, "loss": 0.0615, "step": 287375 }, { "epoch": 4.232632803640595, "grad_norm": 1.4454498291015625, "learning_rate": 3.4180537646373473e-06, "loss": 0.0546, "step": 287400 }, { "epoch": 4.233000986730682, "grad_norm": 0.9084637761116028, "learning_rate": 3.416417392670363e-06, "loss": 0.0535, "step": 287425 }, { "epoch": 4.233369169820769, "grad_norm": 1.5626475811004639, "learning_rate": 3.4147810207033784e-06, "loss": 0.0495, "step": 287450 }, { "epoch": 4.233737352910856, "grad_norm": 1.3954142332077026, "learning_rate": 3.413144648736394e-06, "loss": 0.0537, "step": 287475 }, { "epoch": 4.234105536000943, "grad_norm": 1.2264585494995117, "learning_rate": 3.411508276769409e-06, "loss": 0.0523, "step": 287500 }, { "epoch": 4.23447371909103, "grad_norm": 1.78122878074646, "learning_rate": 3.4098719048024247e-06, "loss": 0.0535, "step": 287525 }, { "epoch": 4.234841902181117, "grad_norm": 1.4941461086273193, "learning_rate": 3.40823553283544e-06, "loss": 0.0474, "step": 287550 }, { "epoch": 4.235210085271204, "grad_norm": 1.4036809206008911, "learning_rate": 3.4065991608684557e-06, "loss": 0.053, "step": 287575 }, { "epoch": 4.235578268361291, "grad_norm": 1.3087482452392578, "learning_rate": 3.4049627889014706e-06, "loss": 0.0598, "step": 287600 }, { "epoch": 4.235946451451378, "grad_norm": 1.2929563522338867, "learning_rate": 3.4033264169344863e-06, "loss": 0.0502, "step": 287625 }, { "epoch": 4.236314634541465, "grad_norm": 1.1535840034484863, "learning_rate": 3.4016900449675016e-06, "loss": 0.0509, "step": 287650 }, { "epoch": 4.236682817631552, "grad_norm": 1.151566743850708, "learning_rate": 3.4000536730005174e-06, "loss": 0.0499, "step": 287675 }, { "epoch": 4.237051000721639, "grad_norm": 1.2608799934387207, "learning_rate": 3.398417301033533e-06, "loss": 0.054, "step": 287700 }, { "epoch": 4.237419183811726, "grad_norm": 1.074532389640808, "learning_rate": 3.3967809290665484e-06, "loss": 0.0517, "step": 287725 }, { "epoch": 4.237787366901813, "grad_norm": 1.3132317066192627, "learning_rate": 3.395144557099564e-06, "loss": 0.0536, "step": 287750 }, { "epoch": 4.2381555499919, "grad_norm": 1.4164564609527588, "learning_rate": 3.393508185132579e-06, "loss": 0.0569, "step": 287775 }, { "epoch": 4.238523733081987, "grad_norm": 0.8150655031204224, "learning_rate": 3.3918718131655947e-06, "loss": 0.0475, "step": 287800 }, { "epoch": 4.238891916172074, "grad_norm": 1.7023588418960571, "learning_rate": 3.39023544119861e-06, "loss": 0.0533, "step": 287825 }, { "epoch": 4.239260099262161, "grad_norm": 1.1218279600143433, "learning_rate": 3.3885990692316258e-06, "loss": 0.0591, "step": 287850 }, { "epoch": 4.239628282352248, "grad_norm": 1.4748696088790894, "learning_rate": 3.3869626972646407e-06, "loss": 0.0513, "step": 287875 }, { "epoch": 4.239996465442335, "grad_norm": 1.154908299446106, "learning_rate": 3.3853263252976564e-06, "loss": 0.052, "step": 287900 }, { "epoch": 4.240364648532422, "grad_norm": 1.170432448387146, "learning_rate": 3.3836899533306717e-06, "loss": 0.0453, "step": 287925 }, { "epoch": 4.240732831622509, "grad_norm": 1.052100658416748, "learning_rate": 3.3820535813636874e-06, "loss": 0.0595, "step": 287950 }, { "epoch": 4.241101014712596, "grad_norm": 1.3566515445709229, "learning_rate": 3.3804172093967023e-06, "loss": 0.0505, "step": 287975 }, { "epoch": 4.241469197802683, "grad_norm": 2.012906551361084, "learning_rate": 3.378780837429718e-06, "loss": 0.0473, "step": 288000 }, { "epoch": 4.24183738089277, "grad_norm": 1.811879277229309, "learning_rate": 3.3771444654627333e-06, "loss": 0.0533, "step": 288025 }, { "epoch": 4.242205563982857, "grad_norm": 1.1512092351913452, "learning_rate": 3.375508093495749e-06, "loss": 0.0489, "step": 288050 }, { "epoch": 4.242573747072944, "grad_norm": 1.3555552959442139, "learning_rate": 3.3738717215287644e-06, "loss": 0.0543, "step": 288075 }, { "epoch": 4.242941930163031, "grad_norm": 1.0794495344161987, "learning_rate": 3.37223534956178e-06, "loss": 0.0523, "step": 288100 }, { "epoch": 4.243310113253118, "grad_norm": 1.7101385593414307, "learning_rate": 3.370598977594795e-06, "loss": 0.0592, "step": 288125 }, { "epoch": 4.243678296343205, "grad_norm": 1.3256570100784302, "learning_rate": 3.3689626056278107e-06, "loss": 0.0571, "step": 288150 }, { "epoch": 4.244046479433292, "grad_norm": 1.3084527254104614, "learning_rate": 3.3673262336608264e-06, "loss": 0.057, "step": 288175 }, { "epoch": 4.244414662523379, "grad_norm": 0.7994515299797058, "learning_rate": 3.3656898616938417e-06, "loss": 0.052, "step": 288200 }, { "epoch": 4.244782845613467, "grad_norm": 1.5635790824890137, "learning_rate": 3.3640534897268575e-06, "loss": 0.0573, "step": 288225 }, { "epoch": 4.245151028703554, "grad_norm": 1.1247919797897339, "learning_rate": 3.3624171177598723e-06, "loss": 0.055, "step": 288250 }, { "epoch": 4.245519211793641, "grad_norm": 1.2731386423110962, "learning_rate": 3.360780745792888e-06, "loss": 0.0541, "step": 288275 }, { "epoch": 4.245887394883728, "grad_norm": 1.2959508895874023, "learning_rate": 3.3591443738259034e-06, "loss": 0.0426, "step": 288300 }, { "epoch": 4.246255577973815, "grad_norm": 1.116158127784729, "learning_rate": 3.357508001858919e-06, "loss": 0.0561, "step": 288325 }, { "epoch": 4.246623761063902, "grad_norm": 0.6781604290008545, "learning_rate": 3.355871629891934e-06, "loss": 0.0398, "step": 288350 }, { "epoch": 4.246991944153989, "grad_norm": 1.5362576246261597, "learning_rate": 3.3542352579249497e-06, "loss": 0.0574, "step": 288375 }, { "epoch": 4.247360127244076, "grad_norm": 1.4149580001831055, "learning_rate": 3.352598885957965e-06, "loss": 0.0517, "step": 288400 }, { "epoch": 4.247728310334163, "grad_norm": 1.4112199544906616, "learning_rate": 3.3509625139909807e-06, "loss": 0.0545, "step": 288425 }, { "epoch": 4.24809649342425, "grad_norm": 0.7767438888549805, "learning_rate": 3.349326142023996e-06, "loss": 0.0446, "step": 288450 }, { "epoch": 4.248464676514337, "grad_norm": 1.3089207410812378, "learning_rate": 3.3476897700570118e-06, "loss": 0.0492, "step": 288475 }, { "epoch": 4.248832859604424, "grad_norm": 1.3279422521591187, "learning_rate": 3.3460533980900267e-06, "loss": 0.054, "step": 288500 }, { "epoch": 4.249201042694511, "grad_norm": 1.2852131128311157, "learning_rate": 3.3444170261230424e-06, "loss": 0.0488, "step": 288525 }, { "epoch": 4.249569225784598, "grad_norm": 1.088971495628357, "learning_rate": 3.3427806541560577e-06, "loss": 0.0527, "step": 288550 }, { "epoch": 4.249937408874685, "grad_norm": 1.6746782064437866, "learning_rate": 3.3411442821890734e-06, "loss": 0.0581, "step": 288575 }, { "epoch": 4.250305591964772, "grad_norm": 1.1501498222351074, "learning_rate": 3.3395079102220883e-06, "loss": 0.0525, "step": 288600 }, { "epoch": 4.250673775054859, "grad_norm": 1.5668296813964844, "learning_rate": 3.337871538255104e-06, "loss": 0.048, "step": 288625 }, { "epoch": 4.251041958144946, "grad_norm": 1.3431293964385986, "learning_rate": 3.3362351662881197e-06, "loss": 0.0458, "step": 288650 }, { "epoch": 4.251410141235033, "grad_norm": 1.4084354639053345, "learning_rate": 3.334598794321135e-06, "loss": 0.0539, "step": 288675 }, { "epoch": 4.25177832432512, "grad_norm": 1.9039839506149292, "learning_rate": 3.3329624223541508e-06, "loss": 0.0532, "step": 288700 }, { "epoch": 4.252146507415207, "grad_norm": 1.0529685020446777, "learning_rate": 3.3313260503871657e-06, "loss": 0.0555, "step": 288725 }, { "epoch": 4.252514690505294, "grad_norm": 1.086586356163025, "learning_rate": 3.3296896784201814e-06, "loss": 0.0485, "step": 288750 }, { "epoch": 4.252882873595381, "grad_norm": 0.8603990077972412, "learning_rate": 3.3280533064531967e-06, "loss": 0.0555, "step": 288775 }, { "epoch": 4.253251056685468, "grad_norm": 1.4172135591506958, "learning_rate": 3.3264169344862124e-06, "loss": 0.0494, "step": 288800 }, { "epoch": 4.2536192397755554, "grad_norm": 1.1183111667633057, "learning_rate": 3.3247805625192277e-06, "loss": 0.0562, "step": 288825 }, { "epoch": 4.2539874228656425, "grad_norm": 1.5140810012817383, "learning_rate": 3.3231441905522435e-06, "loss": 0.0507, "step": 288850 }, { "epoch": 4.2543556059557295, "grad_norm": 1.7189756631851196, "learning_rate": 3.3215078185852583e-06, "loss": 0.0505, "step": 288875 }, { "epoch": 4.2547237890458165, "grad_norm": 0.8740991353988647, "learning_rate": 3.319871446618274e-06, "loss": 0.0513, "step": 288900 }, { "epoch": 4.2550919721359035, "grad_norm": 1.5767619609832764, "learning_rate": 3.3182350746512894e-06, "loss": 0.0513, "step": 288925 }, { "epoch": 4.2554601552259905, "grad_norm": 1.383811593055725, "learning_rate": 3.316598702684305e-06, "loss": 0.0604, "step": 288950 }, { "epoch": 4.2558283383160775, "grad_norm": 1.1832149028778076, "learning_rate": 3.31496233071732e-06, "loss": 0.0531, "step": 288975 }, { "epoch": 4.256196521406165, "grad_norm": 1.482468605041504, "learning_rate": 3.3133259587503357e-06, "loss": 0.0528, "step": 289000 }, { "epoch": 4.256564704496252, "grad_norm": 1.53411865234375, "learning_rate": 3.311689586783351e-06, "loss": 0.0499, "step": 289025 }, { "epoch": 4.256932887586339, "grad_norm": 1.00519597530365, "learning_rate": 3.3100532148163667e-06, "loss": 0.0533, "step": 289050 }, { "epoch": 4.257301070676426, "grad_norm": 1.3609029054641724, "learning_rate": 3.3084168428493816e-06, "loss": 0.0557, "step": 289075 }, { "epoch": 4.257669253766513, "grad_norm": 1.0265083312988281, "learning_rate": 3.3067804708823973e-06, "loss": 0.0508, "step": 289100 }, { "epoch": 4.2580374368566005, "grad_norm": 1.1739802360534668, "learning_rate": 3.3051440989154127e-06, "loss": 0.0551, "step": 289125 }, { "epoch": 4.2584056199466875, "grad_norm": 0.864486038684845, "learning_rate": 3.3035077269484284e-06, "loss": 0.0504, "step": 289150 }, { "epoch": 4.2587738030367746, "grad_norm": 1.1211373805999756, "learning_rate": 3.301871354981444e-06, "loss": 0.0467, "step": 289175 }, { "epoch": 4.259141986126862, "grad_norm": 1.3712124824523926, "learning_rate": 3.3002349830144594e-06, "loss": 0.0519, "step": 289200 }, { "epoch": 4.259510169216949, "grad_norm": 1.7577154636383057, "learning_rate": 3.2985986110474747e-06, "loss": 0.0527, "step": 289225 }, { "epoch": 4.259878352307036, "grad_norm": 1.3306009769439697, "learning_rate": 3.29696223908049e-06, "loss": 0.0524, "step": 289250 }, { "epoch": 4.260246535397123, "grad_norm": 1.6227326393127441, "learning_rate": 3.2953258671135057e-06, "loss": 0.0522, "step": 289275 }, { "epoch": 4.26061471848721, "grad_norm": 0.8738011717796326, "learning_rate": 3.293689495146521e-06, "loss": 0.0472, "step": 289300 }, { "epoch": 4.260982901577297, "grad_norm": 1.1673873662948608, "learning_rate": 3.2920531231795368e-06, "loss": 0.0488, "step": 289325 }, { "epoch": 4.261351084667384, "grad_norm": 2.038783073425293, "learning_rate": 3.2904167512125517e-06, "loss": 0.0544, "step": 289350 }, { "epoch": 4.261719267757471, "grad_norm": 1.1564655303955078, "learning_rate": 3.2887803792455674e-06, "loss": 0.0568, "step": 289375 }, { "epoch": 4.262087450847558, "grad_norm": 1.0990837812423706, "learning_rate": 3.2871440072785827e-06, "loss": 0.05, "step": 289400 }, { "epoch": 4.262455633937645, "grad_norm": 0.9731162786483765, "learning_rate": 3.2855076353115984e-06, "loss": 0.0587, "step": 289425 }, { "epoch": 4.262823817027732, "grad_norm": 1.4380793571472168, "learning_rate": 3.2838712633446133e-06, "loss": 0.0554, "step": 289450 }, { "epoch": 4.263192000117819, "grad_norm": 1.3496243953704834, "learning_rate": 3.282234891377629e-06, "loss": 0.0509, "step": 289475 }, { "epoch": 4.263560183207906, "grad_norm": 0.9886029958724976, "learning_rate": 3.2805985194106443e-06, "loss": 0.0513, "step": 289500 }, { "epoch": 4.263928366297993, "grad_norm": 1.2706990242004395, "learning_rate": 3.27896214744366e-06, "loss": 0.056, "step": 289525 }, { "epoch": 4.26429654938808, "grad_norm": 1.5971804857254028, "learning_rate": 3.2773257754766754e-06, "loss": 0.0473, "step": 289550 }, { "epoch": 4.264664732478167, "grad_norm": 1.1203545331954956, "learning_rate": 3.275689403509691e-06, "loss": 0.0518, "step": 289575 }, { "epoch": 4.265032915568254, "grad_norm": 1.5526471138000488, "learning_rate": 3.274053031542706e-06, "loss": 0.0499, "step": 289600 }, { "epoch": 4.265401098658341, "grad_norm": 1.7908813953399658, "learning_rate": 3.2724166595757217e-06, "loss": 0.0546, "step": 289625 }, { "epoch": 4.265769281748428, "grad_norm": 0.9146240949630737, "learning_rate": 3.2707802876087374e-06, "loss": 0.0497, "step": 289650 }, { "epoch": 4.266137464838515, "grad_norm": 1.3962552547454834, "learning_rate": 3.2691439156417527e-06, "loss": 0.0518, "step": 289675 }, { "epoch": 4.266505647928602, "grad_norm": 1.0758328437805176, "learning_rate": 3.2675075436747685e-06, "loss": 0.0508, "step": 289700 }, { "epoch": 4.266873831018689, "grad_norm": 1.6994529962539673, "learning_rate": 3.2658711717077833e-06, "loss": 0.052, "step": 289725 }, { "epoch": 4.267242014108776, "grad_norm": 1.4449158906936646, "learning_rate": 3.264234799740799e-06, "loss": 0.046, "step": 289750 }, { "epoch": 4.267610197198863, "grad_norm": 1.1649595499038696, "learning_rate": 3.2625984277738144e-06, "loss": 0.05, "step": 289775 }, { "epoch": 4.26797838028895, "grad_norm": 1.8054165840148926, "learning_rate": 3.26096205580683e-06, "loss": 0.0518, "step": 289800 }, { "epoch": 4.268346563379037, "grad_norm": 1.7467937469482422, "learning_rate": 3.259325683839845e-06, "loss": 0.0448, "step": 289825 }, { "epoch": 4.268714746469124, "grad_norm": 1.802024006843567, "learning_rate": 3.2576893118728607e-06, "loss": 0.0512, "step": 289850 }, { "epoch": 4.269082929559211, "grad_norm": 1.099867582321167, "learning_rate": 3.256052939905876e-06, "loss": 0.0514, "step": 289875 }, { "epoch": 4.269451112649298, "grad_norm": 1.5718309879302979, "learning_rate": 3.2544165679388917e-06, "loss": 0.0531, "step": 289900 }, { "epoch": 4.269819295739385, "grad_norm": 1.1007776260375977, "learning_rate": 3.252780195971907e-06, "loss": 0.0473, "step": 289925 }, { "epoch": 4.270187478829472, "grad_norm": 1.678215503692627, "learning_rate": 3.2511438240049224e-06, "loss": 0.0522, "step": 289950 }, { "epoch": 4.270555661919559, "grad_norm": 1.1936955451965332, "learning_rate": 3.2495074520379377e-06, "loss": 0.048, "step": 289975 }, { "epoch": 4.270923845009646, "grad_norm": 1.3668286800384521, "learning_rate": 3.2478710800709534e-06, "loss": 0.0544, "step": 290000 }, { "epoch": 4.271292028099733, "grad_norm": 1.1381443738937378, "learning_rate": 3.2462347081039687e-06, "loss": 0.0463, "step": 290025 }, { "epoch": 4.27166021118982, "grad_norm": 1.5934035778045654, "learning_rate": 3.2445983361369844e-06, "loss": 0.0562, "step": 290050 }, { "epoch": 4.272028394279907, "grad_norm": 1.540291666984558, "learning_rate": 3.2429619641699993e-06, "loss": 0.047, "step": 290075 }, { "epoch": 4.272396577369994, "grad_norm": 1.237172245979309, "learning_rate": 3.241325592203015e-06, "loss": 0.0585, "step": 290100 }, { "epoch": 4.272764760460081, "grad_norm": 2.0267882347106934, "learning_rate": 3.2396892202360308e-06, "loss": 0.051, "step": 290125 }, { "epoch": 4.273132943550169, "grad_norm": 1.0281363725662231, "learning_rate": 3.238052848269046e-06, "loss": 0.0505, "step": 290150 }, { "epoch": 4.273501126640256, "grad_norm": 1.1336170434951782, "learning_rate": 3.236416476302062e-06, "loss": 0.0536, "step": 290175 }, { "epoch": 4.273869309730343, "grad_norm": 1.7467669248580933, "learning_rate": 3.2347801043350767e-06, "loss": 0.0541, "step": 290200 }, { "epoch": 4.27423749282043, "grad_norm": 0.6383174061775208, "learning_rate": 3.233209187246772e-06, "loss": 0.0587, "step": 290225 }, { "epoch": 4.274605675910517, "grad_norm": 1.2602483034133911, "learning_rate": 3.231572815279787e-06, "loss": 0.056, "step": 290250 }, { "epoch": 4.274973859000604, "grad_norm": 1.03874933719635, "learning_rate": 3.229936443312803e-06, "loss": 0.0537, "step": 290275 }, { "epoch": 4.275342042090691, "grad_norm": 1.1729580163955688, "learning_rate": 3.2283000713458177e-06, "loss": 0.0477, "step": 290300 }, { "epoch": 4.275710225180778, "grad_norm": 0.864612877368927, "learning_rate": 3.2266636993788335e-06, "loss": 0.0489, "step": 290325 }, { "epoch": 4.276078408270865, "grad_norm": 1.0861895084381104, "learning_rate": 3.2250273274118488e-06, "loss": 0.0451, "step": 290350 }, { "epoch": 4.276446591360952, "grad_norm": 1.7863030433654785, "learning_rate": 3.2233909554448645e-06, "loss": 0.0501, "step": 290375 }, { "epoch": 4.276814774451039, "grad_norm": 1.3945504426956177, "learning_rate": 3.22175458347788e-06, "loss": 0.0533, "step": 290400 }, { "epoch": 4.277182957541126, "grad_norm": 1.0132365226745605, "learning_rate": 3.2201182115108955e-06, "loss": 0.0545, "step": 290425 }, { "epoch": 4.277551140631213, "grad_norm": 1.4781955480575562, "learning_rate": 3.2184818395439104e-06, "loss": 0.0472, "step": 290450 }, { "epoch": 4.2779193237213, "grad_norm": 1.2693023681640625, "learning_rate": 3.216845467576926e-06, "loss": 0.0581, "step": 290475 }, { "epoch": 4.278287506811387, "grad_norm": 1.305580496788025, "learning_rate": 3.2152090956099414e-06, "loss": 0.0518, "step": 290500 }, { "epoch": 4.278655689901474, "grad_norm": 1.3671510219573975, "learning_rate": 3.213572723642957e-06, "loss": 0.0466, "step": 290525 }, { "epoch": 4.279023872991561, "grad_norm": 1.381828784942627, "learning_rate": 3.211936351675972e-06, "loss": 0.0572, "step": 290550 }, { "epoch": 4.279392056081648, "grad_norm": 1.5436588525772095, "learning_rate": 3.2102999797089878e-06, "loss": 0.0461, "step": 290575 }, { "epoch": 4.279760239171735, "grad_norm": 1.1214696168899536, "learning_rate": 3.2086636077420035e-06, "loss": 0.0508, "step": 290600 }, { "epoch": 4.280128422261822, "grad_norm": 1.500473141670227, "learning_rate": 3.207027235775019e-06, "loss": 0.0493, "step": 290625 }, { "epoch": 4.280496605351909, "grad_norm": 1.0295681953430176, "learning_rate": 3.2053908638080345e-06, "loss": 0.044, "step": 290650 }, { "epoch": 4.280864788441996, "grad_norm": 1.176110863685608, "learning_rate": 3.2037544918410494e-06, "loss": 0.0536, "step": 290675 }, { "epoch": 4.281232971532083, "grad_norm": 1.3825201988220215, "learning_rate": 3.202118119874065e-06, "loss": 0.0532, "step": 290700 }, { "epoch": 4.28160115462217, "grad_norm": 1.329816460609436, "learning_rate": 3.2004817479070804e-06, "loss": 0.051, "step": 290725 }, { "epoch": 4.281969337712257, "grad_norm": 1.1210793256759644, "learning_rate": 3.198845375940096e-06, "loss": 0.054, "step": 290750 }, { "epoch": 4.282337520802344, "grad_norm": 0.8068587779998779, "learning_rate": 3.1972090039731115e-06, "loss": 0.0564, "step": 290775 }, { "epoch": 4.282705703892431, "grad_norm": 1.3938854932785034, "learning_rate": 3.1955726320061268e-06, "loss": 0.0519, "step": 290800 }, { "epoch": 4.2830738869825185, "grad_norm": 1.214111328125, "learning_rate": 3.193936260039142e-06, "loss": 0.0508, "step": 290825 }, { "epoch": 4.2834420700726055, "grad_norm": 1.3451392650604248, "learning_rate": 3.192299888072158e-06, "loss": 0.0543, "step": 290850 }, { "epoch": 4.2838102531626925, "grad_norm": 1.3039880990982056, "learning_rate": 3.190663516105173e-06, "loss": 0.0494, "step": 290875 }, { "epoch": 4.2841784362527795, "grad_norm": 1.424365520477295, "learning_rate": 3.189027144138189e-06, "loss": 0.0526, "step": 290900 }, { "epoch": 4.2845466193428665, "grad_norm": 1.1638636589050293, "learning_rate": 3.1873907721712037e-06, "loss": 0.0499, "step": 290925 }, { "epoch": 4.2849148024329535, "grad_norm": 1.5044561624526978, "learning_rate": 3.1857544002042195e-06, "loss": 0.0549, "step": 290950 }, { "epoch": 4.2852829855230405, "grad_norm": 1.4784969091415405, "learning_rate": 3.1841180282372348e-06, "loss": 0.0525, "step": 290975 }, { "epoch": 4.285651168613128, "grad_norm": 1.465497612953186, "learning_rate": 3.1824816562702505e-06, "loss": 0.0513, "step": 291000 }, { "epoch": 4.286019351703215, "grad_norm": 1.3716065883636475, "learning_rate": 3.1808452843032654e-06, "loss": 0.0503, "step": 291025 }, { "epoch": 4.2863875347933025, "grad_norm": 1.2589154243469238, "learning_rate": 3.179208912336281e-06, "loss": 0.0562, "step": 291050 }, { "epoch": 4.2867557178833895, "grad_norm": 1.7774900197982788, "learning_rate": 3.1775725403692964e-06, "loss": 0.0551, "step": 291075 }, { "epoch": 4.2871239009734765, "grad_norm": 1.2855439186096191, "learning_rate": 3.175936168402312e-06, "loss": 0.0465, "step": 291100 }, { "epoch": 4.2874920840635635, "grad_norm": 1.6866042613983154, "learning_rate": 3.174299796435328e-06, "loss": 0.0497, "step": 291125 }, { "epoch": 4.2878602671536505, "grad_norm": 1.8169424533843994, "learning_rate": 3.172663424468343e-06, "loss": 0.0517, "step": 291150 }, { "epoch": 4.2882284502437376, "grad_norm": 1.103255033493042, "learning_rate": 3.1710270525013585e-06, "loss": 0.0538, "step": 291175 }, { "epoch": 4.288596633333825, "grad_norm": 1.1007641553878784, "learning_rate": 3.1693906805343738e-06, "loss": 0.0483, "step": 291200 }, { "epoch": 4.288964816423912, "grad_norm": 1.074811339378357, "learning_rate": 3.1677543085673895e-06, "loss": 0.0486, "step": 291225 }, { "epoch": 4.289332999513999, "grad_norm": 1.32887864112854, "learning_rate": 3.166117936600405e-06, "loss": 0.0455, "step": 291250 }, { "epoch": 4.289701182604086, "grad_norm": 0.8534607291221619, "learning_rate": 3.1644815646334205e-06, "loss": 0.0539, "step": 291275 }, { "epoch": 4.290069365694173, "grad_norm": 1.475752353668213, "learning_rate": 3.1628451926664354e-06, "loss": 0.0547, "step": 291300 }, { "epoch": 4.29043754878426, "grad_norm": 0.9513261318206787, "learning_rate": 3.161208820699451e-06, "loss": 0.05, "step": 291325 }, { "epoch": 4.290805731874347, "grad_norm": 1.2565200328826904, "learning_rate": 3.1595724487324664e-06, "loss": 0.0515, "step": 291350 }, { "epoch": 4.291173914964434, "grad_norm": 1.3913260698318481, "learning_rate": 3.157936076765482e-06, "loss": 0.0509, "step": 291375 }, { "epoch": 4.291542098054521, "grad_norm": 0.9858401417732239, "learning_rate": 3.156299704798497e-06, "loss": 0.0537, "step": 291400 }, { "epoch": 4.291910281144608, "grad_norm": 1.2199232578277588, "learning_rate": 3.1546633328315128e-06, "loss": 0.0462, "step": 291425 }, { "epoch": 4.292278464234695, "grad_norm": 0.9362841248512268, "learning_rate": 3.153026960864528e-06, "loss": 0.0518, "step": 291450 }, { "epoch": 4.292646647324782, "grad_norm": 1.0767419338226318, "learning_rate": 3.151390588897544e-06, "loss": 0.0493, "step": 291475 }, { "epoch": 4.293014830414869, "grad_norm": 1.5425621271133423, "learning_rate": 3.149754216930559e-06, "loss": 0.052, "step": 291500 }, { "epoch": 4.293383013504956, "grad_norm": 1.0434218645095825, "learning_rate": 3.148117844963575e-06, "loss": 0.0555, "step": 291525 }, { "epoch": 4.293751196595043, "grad_norm": 1.1621745824813843, "learning_rate": 3.1464814729965897e-06, "loss": 0.048, "step": 291550 }, { "epoch": 4.29411937968513, "grad_norm": 0.9913632273674011, "learning_rate": 3.1448451010296055e-06, "loss": 0.0553, "step": 291575 }, { "epoch": 4.294487562775217, "grad_norm": 1.3171082735061646, "learning_rate": 3.143208729062621e-06, "loss": 0.0506, "step": 291600 }, { "epoch": 4.294855745865304, "grad_norm": 1.4461736679077148, "learning_rate": 3.1415723570956365e-06, "loss": 0.0535, "step": 291625 }, { "epoch": 4.295223928955391, "grad_norm": 1.1955137252807617, "learning_rate": 3.1399359851286522e-06, "loss": 0.0508, "step": 291650 }, { "epoch": 4.295592112045478, "grad_norm": 1.2526497840881348, "learning_rate": 3.138299613161667e-06, "loss": 0.0507, "step": 291675 }, { "epoch": 4.295960295135565, "grad_norm": 1.2451164722442627, "learning_rate": 3.136663241194683e-06, "loss": 0.0563, "step": 291700 }, { "epoch": 4.296328478225652, "grad_norm": 1.0359917879104614, "learning_rate": 3.135026869227698e-06, "loss": 0.0457, "step": 291725 }, { "epoch": 4.296696661315739, "grad_norm": 1.4772802591323853, "learning_rate": 3.133390497260714e-06, "loss": 0.0493, "step": 291750 }, { "epoch": 4.297064844405826, "grad_norm": 1.2262344360351562, "learning_rate": 3.1317541252937287e-06, "loss": 0.0498, "step": 291775 }, { "epoch": 4.297433027495913, "grad_norm": 0.5714721083641052, "learning_rate": 3.1301177533267445e-06, "loss": 0.0545, "step": 291800 }, { "epoch": 4.297801210586, "grad_norm": 1.4304720163345337, "learning_rate": 3.1284813813597598e-06, "loss": 0.0518, "step": 291825 }, { "epoch": 4.298169393676087, "grad_norm": 0.8661518096923828, "learning_rate": 3.1268450093927755e-06, "loss": 0.051, "step": 291850 }, { "epoch": 4.298537576766174, "grad_norm": 1.01231050491333, "learning_rate": 3.125208637425791e-06, "loss": 0.0448, "step": 291875 }, { "epoch": 4.298905759856261, "grad_norm": 1.9943183660507202, "learning_rate": 3.123572265458806e-06, "loss": 0.064, "step": 291900 }, { "epoch": 4.299273942946348, "grad_norm": 1.2635256052017212, "learning_rate": 3.1219358934918214e-06, "loss": 0.052, "step": 291925 }, { "epoch": 4.299642126036435, "grad_norm": 1.0197324752807617, "learning_rate": 3.120299521524837e-06, "loss": 0.0449, "step": 291950 }, { "epoch": 4.300010309126522, "grad_norm": 1.2457128763198853, "learning_rate": 3.1186631495578524e-06, "loss": 0.0415, "step": 291975 }, { "epoch": 4.300378492216609, "grad_norm": 1.4862420558929443, "learning_rate": 3.117026777590868e-06, "loss": 0.054, "step": 292000 }, { "epoch": 4.300746675306696, "grad_norm": 1.4623264074325562, "learning_rate": 3.115390405623883e-06, "loss": 0.0482, "step": 292025 }, { "epoch": 4.301114858396783, "grad_norm": 1.565063714981079, "learning_rate": 3.1137540336568988e-06, "loss": 0.0478, "step": 292050 }, { "epoch": 4.301483041486871, "grad_norm": 1.4657037258148193, "learning_rate": 3.1121176616899145e-06, "loss": 0.0474, "step": 292075 }, { "epoch": 4.301851224576958, "grad_norm": 1.2396297454833984, "learning_rate": 3.11048128972293e-06, "loss": 0.0496, "step": 292100 }, { "epoch": 4.302219407667045, "grad_norm": 1.2692071199417114, "learning_rate": 3.1088449177559455e-06, "loss": 0.0449, "step": 292125 }, { "epoch": 4.302587590757132, "grad_norm": 1.3308403491973877, "learning_rate": 3.1072085457889604e-06, "loss": 0.0601, "step": 292150 }, { "epoch": 4.302955773847219, "grad_norm": 1.3547325134277344, "learning_rate": 3.105572173821976e-06, "loss": 0.0604, "step": 292175 }, { "epoch": 4.303323956937306, "grad_norm": 1.7643423080444336, "learning_rate": 3.1039358018549915e-06, "loss": 0.0498, "step": 292200 }, { "epoch": 4.303692140027393, "grad_norm": 1.1142398118972778, "learning_rate": 3.102299429888007e-06, "loss": 0.0549, "step": 292225 }, { "epoch": 4.30406032311748, "grad_norm": 1.2440831661224365, "learning_rate": 3.1006630579210225e-06, "loss": 0.0506, "step": 292250 }, { "epoch": 4.304428506207567, "grad_norm": 1.3940349817276, "learning_rate": 3.099026685954038e-06, "loss": 0.0546, "step": 292275 }, { "epoch": 4.304796689297654, "grad_norm": 1.4349170923233032, "learning_rate": 3.097390313987053e-06, "loss": 0.0523, "step": 292300 }, { "epoch": 4.305164872387741, "grad_norm": 1.044699788093567, "learning_rate": 3.095753942020069e-06, "loss": 0.0517, "step": 292325 }, { "epoch": 4.305533055477828, "grad_norm": 1.0127317905426025, "learning_rate": 3.094117570053084e-06, "loss": 0.055, "step": 292350 }, { "epoch": 4.305901238567915, "grad_norm": 1.1746273040771484, "learning_rate": 3.0924811980861e-06, "loss": 0.0516, "step": 292375 }, { "epoch": 4.306269421658002, "grad_norm": 1.809509038925171, "learning_rate": 3.0908448261191147e-06, "loss": 0.0524, "step": 292400 }, { "epoch": 4.306637604748089, "grad_norm": 1.101275086402893, "learning_rate": 3.0892084541521305e-06, "loss": 0.0476, "step": 292425 }, { "epoch": 4.307005787838176, "grad_norm": 1.2939157485961914, "learning_rate": 3.0875720821851458e-06, "loss": 0.0553, "step": 292450 }, { "epoch": 4.307373970928263, "grad_norm": 0.8187004327774048, "learning_rate": 3.0859357102181615e-06, "loss": 0.0508, "step": 292475 }, { "epoch": 4.30774215401835, "grad_norm": 1.361149787902832, "learning_rate": 3.0842993382511764e-06, "loss": 0.0509, "step": 292500 }, { "epoch": 4.308110337108437, "grad_norm": 1.3855130672454834, "learning_rate": 3.082662966284192e-06, "loss": 0.0563, "step": 292525 }, { "epoch": 4.308478520198524, "grad_norm": 1.2144083976745605, "learning_rate": 3.0810265943172074e-06, "loss": 0.0488, "step": 292550 }, { "epoch": 4.308846703288611, "grad_norm": 1.3592517375946045, "learning_rate": 3.079390222350223e-06, "loss": 0.0549, "step": 292575 }, { "epoch": 4.309214886378698, "grad_norm": 1.1568927764892578, "learning_rate": 3.077753850383239e-06, "loss": 0.0546, "step": 292600 }, { "epoch": 4.309583069468785, "grad_norm": 0.8282021880149841, "learning_rate": 3.076117478416254e-06, "loss": 0.0491, "step": 292625 }, { "epoch": 4.309951252558872, "grad_norm": 1.4004698991775513, "learning_rate": 3.0744811064492695e-06, "loss": 0.0565, "step": 292650 }, { "epoch": 4.310319435648959, "grad_norm": 1.5502421855926514, "learning_rate": 3.0728447344822848e-06, "loss": 0.0481, "step": 292675 }, { "epoch": 4.310687618739046, "grad_norm": 1.4700853824615479, "learning_rate": 3.0712083625153005e-06, "loss": 0.0453, "step": 292700 }, { "epoch": 4.311055801829133, "grad_norm": 1.9841800928115845, "learning_rate": 3.069571990548316e-06, "loss": 0.0528, "step": 292725 }, { "epoch": 4.31142398491922, "grad_norm": 1.8632556200027466, "learning_rate": 3.0679356185813315e-06, "loss": 0.0571, "step": 292750 }, { "epoch": 4.311792168009307, "grad_norm": 1.39838707447052, "learning_rate": 3.0662992466143464e-06, "loss": 0.0509, "step": 292775 }, { "epoch": 4.312160351099394, "grad_norm": 0.8109171986579895, "learning_rate": 3.064662874647362e-06, "loss": 0.0543, "step": 292800 }, { "epoch": 4.3125285341894815, "grad_norm": 1.4002578258514404, "learning_rate": 3.0630265026803775e-06, "loss": 0.0584, "step": 292825 }, { "epoch": 4.3128967172795685, "grad_norm": 1.2867463827133179, "learning_rate": 3.061390130713393e-06, "loss": 0.0468, "step": 292850 }, { "epoch": 4.3132649003696555, "grad_norm": 1.6728713512420654, "learning_rate": 3.059753758746408e-06, "loss": 0.0572, "step": 292875 }, { "epoch": 4.3136330834597425, "grad_norm": 1.7604358196258545, "learning_rate": 3.058117386779424e-06, "loss": 0.0528, "step": 292900 }, { "epoch": 4.3140012665498295, "grad_norm": 1.296737790107727, "learning_rate": 3.056481014812439e-06, "loss": 0.0534, "step": 292925 }, { "epoch": 4.3143694496399165, "grad_norm": 1.0273759365081787, "learning_rate": 3.054844642845455e-06, "loss": 0.0469, "step": 292950 }, { "epoch": 4.314737632730004, "grad_norm": 1.5726728439331055, "learning_rate": 3.05320827087847e-06, "loss": 0.057, "step": 292975 }, { "epoch": 4.3151058158200915, "grad_norm": 1.0067387819290161, "learning_rate": 3.0515718989114854e-06, "loss": 0.0513, "step": 293000 }, { "epoch": 4.3154739989101785, "grad_norm": 1.455614447593689, "learning_rate": 3.0499355269445007e-06, "loss": 0.0522, "step": 293025 }, { "epoch": 4.3158421820002655, "grad_norm": 1.105013370513916, "learning_rate": 3.0482991549775165e-06, "loss": 0.0487, "step": 293050 }, { "epoch": 4.3162103650903525, "grad_norm": 1.3728548288345337, "learning_rate": 3.046662783010532e-06, "loss": 0.0498, "step": 293075 }, { "epoch": 4.3165785481804395, "grad_norm": 1.3198384046554565, "learning_rate": 3.0450264110435475e-06, "loss": 0.0517, "step": 293100 }, { "epoch": 4.3169467312705265, "grad_norm": 1.5928163528442383, "learning_rate": 3.0433900390765632e-06, "loss": 0.0512, "step": 293125 }, { "epoch": 4.3173149143606135, "grad_norm": 1.511946439743042, "learning_rate": 3.041753667109578e-06, "loss": 0.0521, "step": 293150 }, { "epoch": 4.317683097450701, "grad_norm": 1.5115729570388794, "learning_rate": 3.040117295142594e-06, "loss": 0.055, "step": 293175 }, { "epoch": 4.318051280540788, "grad_norm": 1.5589832067489624, "learning_rate": 3.038480923175609e-06, "loss": 0.0489, "step": 293200 }, { "epoch": 4.318419463630875, "grad_norm": 1.6328973770141602, "learning_rate": 3.036844551208625e-06, "loss": 0.0504, "step": 293225 }, { "epoch": 4.318787646720962, "grad_norm": 1.2181569337844849, "learning_rate": 3.0352081792416398e-06, "loss": 0.0507, "step": 293250 }, { "epoch": 4.319155829811049, "grad_norm": 1.2631444931030273, "learning_rate": 3.0335718072746555e-06, "loss": 0.0496, "step": 293275 }, { "epoch": 4.319524012901136, "grad_norm": 1.3769088983535767, "learning_rate": 3.0319354353076708e-06, "loss": 0.0548, "step": 293300 }, { "epoch": 4.319892195991223, "grad_norm": 1.1551789045333862, "learning_rate": 3.0302990633406865e-06, "loss": 0.0478, "step": 293325 }, { "epoch": 4.32026037908131, "grad_norm": 1.0351506471633911, "learning_rate": 3.028662691373702e-06, "loss": 0.0451, "step": 293350 }, { "epoch": 4.320628562171397, "grad_norm": 1.35616934299469, "learning_rate": 3.027026319406717e-06, "loss": 0.0458, "step": 293375 }, { "epoch": 4.320996745261484, "grad_norm": 1.03713858127594, "learning_rate": 3.0253899474397324e-06, "loss": 0.0511, "step": 293400 }, { "epoch": 4.321364928351571, "grad_norm": 1.6304552555084229, "learning_rate": 3.023753575472748e-06, "loss": 0.0605, "step": 293425 }, { "epoch": 4.321733111441658, "grad_norm": 0.953269362449646, "learning_rate": 3.0221172035057635e-06, "loss": 0.0505, "step": 293450 }, { "epoch": 4.322101294531745, "grad_norm": 1.4370427131652832, "learning_rate": 3.020480831538779e-06, "loss": 0.0475, "step": 293475 }, { "epoch": 4.322469477621832, "grad_norm": 1.8312782049179077, "learning_rate": 3.018844459571794e-06, "loss": 0.0509, "step": 293500 }, { "epoch": 4.322837660711919, "grad_norm": 1.1360297203063965, "learning_rate": 3.01720808760481e-06, "loss": 0.049, "step": 293525 }, { "epoch": 4.323205843802006, "grad_norm": 1.1346262693405151, "learning_rate": 3.0155717156378255e-06, "loss": 0.0483, "step": 293550 }, { "epoch": 4.323574026892093, "grad_norm": 0.7781646251678467, "learning_rate": 3.013935343670841e-06, "loss": 0.0488, "step": 293575 }, { "epoch": 4.32394220998218, "grad_norm": 1.7196122407913208, "learning_rate": 3.0122989717038566e-06, "loss": 0.0469, "step": 293600 }, { "epoch": 4.324310393072267, "grad_norm": 1.5313682556152344, "learning_rate": 3.0106625997368714e-06, "loss": 0.0511, "step": 293625 }, { "epoch": 4.324678576162354, "grad_norm": 1.2286125421524048, "learning_rate": 3.009026227769887e-06, "loss": 0.0537, "step": 293650 }, { "epoch": 4.325046759252441, "grad_norm": 1.605811357498169, "learning_rate": 3.0073898558029025e-06, "loss": 0.0443, "step": 293675 }, { "epoch": 4.325414942342528, "grad_norm": 1.1447017192840576, "learning_rate": 3.005753483835918e-06, "loss": 0.0576, "step": 293700 }, { "epoch": 4.325783125432615, "grad_norm": 1.6054688692092896, "learning_rate": 3.0041171118689335e-06, "loss": 0.0501, "step": 293725 }, { "epoch": 4.326151308522702, "grad_norm": 1.2128782272338867, "learning_rate": 3.002480739901949e-06, "loss": 0.0519, "step": 293750 }, { "epoch": 4.326519491612789, "grad_norm": 0.9511484503746033, "learning_rate": 3.000844367934964e-06, "loss": 0.0536, "step": 293775 }, { "epoch": 4.326887674702876, "grad_norm": 1.5446316003799438, "learning_rate": 2.99920799596798e-06, "loss": 0.0502, "step": 293800 }, { "epoch": 4.327255857792963, "grad_norm": 1.140790581703186, "learning_rate": 2.997571624000995e-06, "loss": 0.049, "step": 293825 }, { "epoch": 4.32762404088305, "grad_norm": 1.5022300481796265, "learning_rate": 2.995935252034011e-06, "loss": 0.0524, "step": 293850 }, { "epoch": 4.327992223973137, "grad_norm": 1.4234492778778076, "learning_rate": 2.9942988800670257e-06, "loss": 0.0564, "step": 293875 }, { "epoch": 4.328360407063224, "grad_norm": 1.4587119817733765, "learning_rate": 2.9926625081000415e-06, "loss": 0.0522, "step": 293900 }, { "epoch": 4.328728590153311, "grad_norm": 1.490440845489502, "learning_rate": 2.991091591011736e-06, "loss": 0.0499, "step": 293925 }, { "epoch": 4.329096773243398, "grad_norm": 1.5111546516418457, "learning_rate": 2.989455219044752e-06, "loss": 0.0504, "step": 293950 }, { "epoch": 4.329464956333485, "grad_norm": 1.2817327976226807, "learning_rate": 2.987818847077767e-06, "loss": 0.0525, "step": 293975 }, { "epoch": 4.329833139423573, "grad_norm": 1.8078248500823975, "learning_rate": 2.9861824751107825e-06, "loss": 0.0565, "step": 294000 }, { "epoch": 4.33020132251366, "grad_norm": 1.2433573007583618, "learning_rate": 2.9845461031437983e-06, "loss": 0.0575, "step": 294025 }, { "epoch": 4.330569505603747, "grad_norm": 1.7191497087478638, "learning_rate": 2.9829097311768136e-06, "loss": 0.057, "step": 294050 }, { "epoch": 4.330937688693834, "grad_norm": 1.2274821996688843, "learning_rate": 2.9812733592098293e-06, "loss": 0.042, "step": 294075 }, { "epoch": 4.331305871783921, "grad_norm": 1.2338813543319702, "learning_rate": 2.979636987242844e-06, "loss": 0.0453, "step": 294100 }, { "epoch": 4.331674054874008, "grad_norm": 1.0689059495925903, "learning_rate": 2.97800061527586e-06, "loss": 0.0563, "step": 294125 }, { "epoch": 4.332042237964095, "grad_norm": 1.1198010444641113, "learning_rate": 2.9763642433088752e-06, "loss": 0.0507, "step": 294150 }, { "epoch": 4.332410421054182, "grad_norm": 0.9678905010223389, "learning_rate": 2.974727871341891e-06, "loss": 0.0494, "step": 294175 }, { "epoch": 4.332778604144269, "grad_norm": 1.0130997896194458, "learning_rate": 2.9730914993749062e-06, "loss": 0.0457, "step": 294200 }, { "epoch": 4.333146787234356, "grad_norm": 1.324289083480835, "learning_rate": 2.9714551274079216e-06, "loss": 0.0534, "step": 294225 }, { "epoch": 4.333514970324443, "grad_norm": 1.6521437168121338, "learning_rate": 2.969818755440937e-06, "loss": 0.0514, "step": 294250 }, { "epoch": 4.33388315341453, "grad_norm": 1.2321314811706543, "learning_rate": 2.9681823834739526e-06, "loss": 0.0511, "step": 294275 }, { "epoch": 4.334251336504617, "grad_norm": 0.857231616973877, "learning_rate": 2.966546011506968e-06, "loss": 0.0479, "step": 294300 }, { "epoch": 4.334619519594704, "grad_norm": 1.4737480878829956, "learning_rate": 2.9649096395399836e-06, "loss": 0.0473, "step": 294325 }, { "epoch": 4.334987702684791, "grad_norm": 1.3005889654159546, "learning_rate": 2.9632732675729985e-06, "loss": 0.0481, "step": 294350 }, { "epoch": 4.335355885774878, "grad_norm": 1.1469502449035645, "learning_rate": 2.9616368956060142e-06, "loss": 0.052, "step": 294375 }, { "epoch": 4.335724068864965, "grad_norm": 1.7334963083267212, "learning_rate": 2.9600005236390295e-06, "loss": 0.0558, "step": 294400 }, { "epoch": 4.336092251955052, "grad_norm": 1.4991530179977417, "learning_rate": 2.9583641516720453e-06, "loss": 0.0485, "step": 294425 }, { "epoch": 4.336460435045139, "grad_norm": 0.9723924398422241, "learning_rate": 2.95672777970506e-06, "loss": 0.0521, "step": 294450 }, { "epoch": 4.336828618135226, "grad_norm": 1.8658486604690552, "learning_rate": 2.955091407738076e-06, "loss": 0.0581, "step": 294475 }, { "epoch": 4.337196801225313, "grad_norm": 1.2224810123443604, "learning_rate": 2.9534550357710916e-06, "loss": 0.0535, "step": 294500 }, { "epoch": 4.3375649843154, "grad_norm": 1.3490447998046875, "learning_rate": 2.951818663804107e-06, "loss": 0.056, "step": 294525 }, { "epoch": 4.337933167405487, "grad_norm": 1.211398720741272, "learning_rate": 2.9501822918371226e-06, "loss": 0.0463, "step": 294550 }, { "epoch": 4.338301350495574, "grad_norm": 1.1779062747955322, "learning_rate": 2.948545919870138e-06, "loss": 0.0514, "step": 294575 }, { "epoch": 4.338669533585661, "grad_norm": 1.1999893188476562, "learning_rate": 2.9469095479031532e-06, "loss": 0.0471, "step": 294600 }, { "epoch": 4.339037716675748, "grad_norm": 1.1773226261138916, "learning_rate": 2.9452731759361685e-06, "loss": 0.0483, "step": 294625 }, { "epoch": 4.339405899765835, "grad_norm": 1.5120632648468018, "learning_rate": 2.9436368039691843e-06, "loss": 0.0504, "step": 294650 }, { "epoch": 4.339774082855922, "grad_norm": 0.8460533618927002, "learning_rate": 2.9420004320021996e-06, "loss": 0.0518, "step": 294675 }, { "epoch": 4.340142265946009, "grad_norm": 0.9781943559646606, "learning_rate": 2.9403640600352153e-06, "loss": 0.0478, "step": 294700 }, { "epoch": 4.340510449036096, "grad_norm": 0.6635684967041016, "learning_rate": 2.93872768806823e-06, "loss": 0.0525, "step": 294725 }, { "epoch": 4.340878632126183, "grad_norm": 1.8739856481552124, "learning_rate": 2.937091316101246e-06, "loss": 0.052, "step": 294750 }, { "epoch": 4.34124681521627, "grad_norm": 1.3742173910140991, "learning_rate": 2.9354549441342612e-06, "loss": 0.0461, "step": 294775 }, { "epoch": 4.341614998306357, "grad_norm": 1.1041861772537231, "learning_rate": 2.933818572167277e-06, "loss": 0.0459, "step": 294800 }, { "epoch": 4.3419831813964445, "grad_norm": 1.3814668655395508, "learning_rate": 2.932182200200292e-06, "loss": 0.0498, "step": 294825 }, { "epoch": 4.3423513644865315, "grad_norm": 1.1698280572891235, "learning_rate": 2.9305458282333075e-06, "loss": 0.0521, "step": 294850 }, { "epoch": 4.3427195475766185, "grad_norm": 1.7867683172225952, "learning_rate": 2.928909456266323e-06, "loss": 0.0484, "step": 294875 }, { "epoch": 4.343087730666706, "grad_norm": 1.2471885681152344, "learning_rate": 2.9272730842993386e-06, "loss": 0.0532, "step": 294900 }, { "epoch": 4.343455913756793, "grad_norm": 1.2834385633468628, "learning_rate": 2.925636712332354e-06, "loss": 0.0535, "step": 294925 }, { "epoch": 4.34382409684688, "grad_norm": 1.1170151233673096, "learning_rate": 2.924000340365369e-06, "loss": 0.0503, "step": 294950 }, { "epoch": 4.344192279936967, "grad_norm": 1.467023491859436, "learning_rate": 2.9223639683983845e-06, "loss": 0.0597, "step": 294975 }, { "epoch": 4.3445604630270545, "grad_norm": 0.9308691620826721, "learning_rate": 2.9207275964314002e-06, "loss": 0.0544, "step": 295000 }, { "epoch": 4.3449286461171415, "grad_norm": 1.2788670063018799, "learning_rate": 2.919091224464416e-06, "loss": 0.0556, "step": 295025 }, { "epoch": 4.3452968292072285, "grad_norm": 1.2762491703033447, "learning_rate": 2.9174548524974313e-06, "loss": 0.051, "step": 295050 }, { "epoch": 4.3456650122973155, "grad_norm": 1.1978625059127808, "learning_rate": 2.915818480530447e-06, "loss": 0.0515, "step": 295075 }, { "epoch": 4.3460331953874025, "grad_norm": 0.8507020473480225, "learning_rate": 2.914182108563462e-06, "loss": 0.0526, "step": 295100 }, { "epoch": 4.3464013784774895, "grad_norm": 1.7318373918533325, "learning_rate": 2.9125457365964776e-06, "loss": 0.0464, "step": 295125 }, { "epoch": 4.3467695615675765, "grad_norm": 1.1228421926498413, "learning_rate": 2.910909364629493e-06, "loss": 0.0501, "step": 295150 }, { "epoch": 4.347137744657664, "grad_norm": 2.1247904300689697, "learning_rate": 2.9092729926625086e-06, "loss": 0.0619, "step": 295175 }, { "epoch": 4.347505927747751, "grad_norm": 1.4659314155578613, "learning_rate": 2.9076366206955235e-06, "loss": 0.0502, "step": 295200 }, { "epoch": 4.347874110837838, "grad_norm": 1.657680630683899, "learning_rate": 2.9060002487285392e-06, "loss": 0.0481, "step": 295225 }, { "epoch": 4.348242293927925, "grad_norm": 1.4498772621154785, "learning_rate": 2.9043638767615545e-06, "loss": 0.0475, "step": 295250 }, { "epoch": 4.348610477018012, "grad_norm": 1.4223194122314453, "learning_rate": 2.9027275047945703e-06, "loss": 0.0523, "step": 295275 }, { "epoch": 4.348978660108099, "grad_norm": 1.369748830795288, "learning_rate": 2.9010911328275856e-06, "loss": 0.0518, "step": 295300 }, { "epoch": 4.349346843198186, "grad_norm": 0.855309247970581, "learning_rate": 2.899454760860601e-06, "loss": 0.0518, "step": 295325 }, { "epoch": 4.349715026288273, "grad_norm": 1.7137930393218994, "learning_rate": 2.897818388893616e-06, "loss": 0.0505, "step": 295350 }, { "epoch": 4.35008320937836, "grad_norm": 1.4124717712402344, "learning_rate": 2.896182016926632e-06, "loss": 0.0566, "step": 295375 }, { "epoch": 4.350451392468447, "grad_norm": 1.562554121017456, "learning_rate": 2.894545644959647e-06, "loss": 0.0497, "step": 295400 }, { "epoch": 4.350819575558534, "grad_norm": 1.7596133947372437, "learning_rate": 2.892909272992663e-06, "loss": 0.0506, "step": 295425 }, { "epoch": 4.351187758648621, "grad_norm": 1.0120108127593994, "learning_rate": 2.891272901025678e-06, "loss": 0.0533, "step": 295450 }, { "epoch": 4.351555941738708, "grad_norm": 1.1392792463302612, "learning_rate": 2.8896365290586935e-06, "loss": 0.0519, "step": 295475 }, { "epoch": 4.351924124828795, "grad_norm": 1.0277249813079834, "learning_rate": 2.8880001570917093e-06, "loss": 0.0444, "step": 295500 }, { "epoch": 4.352292307918882, "grad_norm": 1.5045995712280273, "learning_rate": 2.8863637851247246e-06, "loss": 0.0473, "step": 295525 }, { "epoch": 4.352660491008969, "grad_norm": 1.440407633781433, "learning_rate": 2.8847274131577403e-06, "loss": 0.0544, "step": 295550 }, { "epoch": 4.353028674099056, "grad_norm": 1.6547504663467407, "learning_rate": 2.883091041190755e-06, "loss": 0.058, "step": 295575 }, { "epoch": 4.353396857189143, "grad_norm": 1.3485205173492432, "learning_rate": 2.881454669223771e-06, "loss": 0.0484, "step": 295600 }, { "epoch": 4.35376504027923, "grad_norm": 1.4512834548950195, "learning_rate": 2.8798182972567862e-06, "loss": 0.0565, "step": 295625 }, { "epoch": 4.354133223369317, "grad_norm": 1.4781733751296997, "learning_rate": 2.878181925289802e-06, "loss": 0.0512, "step": 295650 }, { "epoch": 4.354501406459404, "grad_norm": 0.9898490309715271, "learning_rate": 2.876545553322817e-06, "loss": 0.0546, "step": 295675 }, { "epoch": 4.354869589549491, "grad_norm": 1.4517611265182495, "learning_rate": 2.8749091813558326e-06, "loss": 0.054, "step": 295700 }, { "epoch": 4.355237772639578, "grad_norm": 1.4266328811645508, "learning_rate": 2.873272809388848e-06, "loss": 0.0442, "step": 295725 }, { "epoch": 4.355605955729665, "grad_norm": 1.6748931407928467, "learning_rate": 2.8716364374218636e-06, "loss": 0.0491, "step": 295750 }, { "epoch": 4.355974138819752, "grad_norm": 1.054490089416504, "learning_rate": 2.870000065454879e-06, "loss": 0.0582, "step": 295775 }, { "epoch": 4.356342321909839, "grad_norm": 1.383265733718872, "learning_rate": 2.8683636934878946e-06, "loss": 0.0434, "step": 295800 }, { "epoch": 4.356710504999926, "grad_norm": 1.3003169298171997, "learning_rate": 2.8667273215209095e-06, "loss": 0.0503, "step": 295825 }, { "epoch": 4.357078688090013, "grad_norm": 1.1027840375900269, "learning_rate": 2.8650909495539252e-06, "loss": 0.0531, "step": 295850 }, { "epoch": 4.3574468711801, "grad_norm": 1.10606050491333, "learning_rate": 2.8634545775869405e-06, "loss": 0.053, "step": 295875 }, { "epoch": 4.357815054270188, "grad_norm": 1.530905842781067, "learning_rate": 2.8618182056199563e-06, "loss": 0.0571, "step": 295900 }, { "epoch": 4.358183237360275, "grad_norm": 1.0879781246185303, "learning_rate": 2.860181833652971e-06, "loss": 0.0533, "step": 295925 }, { "epoch": 4.358551420450362, "grad_norm": 1.706228256225586, "learning_rate": 2.858545461685987e-06, "loss": 0.0567, "step": 295950 }, { "epoch": 4.358919603540449, "grad_norm": 1.7201368808746338, "learning_rate": 2.8569090897190026e-06, "loss": 0.0504, "step": 295975 }, { "epoch": 4.359287786630536, "grad_norm": 1.1667488813400269, "learning_rate": 2.855272717752018e-06, "loss": 0.0538, "step": 296000 }, { "epoch": 4.359655969720623, "grad_norm": 1.4120999574661255, "learning_rate": 2.8536363457850336e-06, "loss": 0.0501, "step": 296025 }, { "epoch": 4.36002415281071, "grad_norm": 1.6748450994491577, "learning_rate": 2.852065428696728e-06, "loss": 0.0581, "step": 296050 }, { "epoch": 4.360392335900797, "grad_norm": 1.5026227235794067, "learning_rate": 2.8504290567297437e-06, "loss": 0.0541, "step": 296075 }, { "epoch": 4.360760518990884, "grad_norm": 1.3919445276260376, "learning_rate": 2.848792684762759e-06, "loss": 0.0542, "step": 296100 }, { "epoch": 4.361128702080971, "grad_norm": 1.1815283298492432, "learning_rate": 2.8471563127957747e-06, "loss": 0.0596, "step": 296125 }, { "epoch": 4.361496885171058, "grad_norm": 1.4525161981582642, "learning_rate": 2.84551994082879e-06, "loss": 0.0529, "step": 296150 }, { "epoch": 4.361865068261145, "grad_norm": 1.4406558275222778, "learning_rate": 2.8438835688618053e-06, "loss": 0.0586, "step": 296175 }, { "epoch": 4.362233251351232, "grad_norm": 1.7876112461090088, "learning_rate": 2.8422471968948206e-06, "loss": 0.0545, "step": 296200 }, { "epoch": 4.362601434441319, "grad_norm": 1.2497589588165283, "learning_rate": 2.8406108249278363e-06, "loss": 0.0538, "step": 296225 }, { "epoch": 4.362969617531406, "grad_norm": 1.3769502639770508, "learning_rate": 2.8389744529608516e-06, "loss": 0.0524, "step": 296250 }, { "epoch": 4.363337800621493, "grad_norm": 1.0861910581588745, "learning_rate": 2.8373380809938674e-06, "loss": 0.0514, "step": 296275 }, { "epoch": 4.36370598371158, "grad_norm": 1.4146472215652466, "learning_rate": 2.8357017090268823e-06, "loss": 0.0558, "step": 296300 }, { "epoch": 4.364074166801667, "grad_norm": 0.7640154361724854, "learning_rate": 2.834065337059898e-06, "loss": 0.0633, "step": 296325 }, { "epoch": 4.364442349891754, "grad_norm": 1.4597136974334717, "learning_rate": 2.8324289650929133e-06, "loss": 0.0492, "step": 296350 }, { "epoch": 4.364810532981841, "grad_norm": 1.7000768184661865, "learning_rate": 2.830792593125929e-06, "loss": 0.0476, "step": 296375 }, { "epoch": 4.365178716071928, "grad_norm": 1.114599347114563, "learning_rate": 2.829156221158944e-06, "loss": 0.0589, "step": 296400 }, { "epoch": 4.365546899162015, "grad_norm": 1.2643077373504639, "learning_rate": 2.8275198491919596e-06, "loss": 0.0506, "step": 296425 }, { "epoch": 4.365915082252102, "grad_norm": 1.7858588695526123, "learning_rate": 2.8258834772249753e-06, "loss": 0.0585, "step": 296450 }, { "epoch": 4.366283265342189, "grad_norm": 1.3012501001358032, "learning_rate": 2.8242471052579907e-06, "loss": 0.0539, "step": 296475 }, { "epoch": 4.366651448432276, "grad_norm": 1.6165803670883179, "learning_rate": 2.8226107332910064e-06, "loss": 0.0523, "step": 296500 }, { "epoch": 4.367019631522363, "grad_norm": 1.2451139688491821, "learning_rate": 2.8209743613240213e-06, "loss": 0.0562, "step": 296525 }, { "epoch": 4.36738781461245, "grad_norm": 1.2229077816009521, "learning_rate": 2.819337989357037e-06, "loss": 0.0508, "step": 296550 }, { "epoch": 4.367755997702537, "grad_norm": 1.5834873914718628, "learning_rate": 2.8177016173900523e-06, "loss": 0.0521, "step": 296575 }, { "epoch": 4.368124180792624, "grad_norm": 1.4526238441467285, "learning_rate": 2.816065245423068e-06, "loss": 0.0511, "step": 296600 }, { "epoch": 4.368492363882711, "grad_norm": 0.9935227632522583, "learning_rate": 2.8144288734560833e-06, "loss": 0.052, "step": 296625 }, { "epoch": 4.368860546972798, "grad_norm": 1.1918072700500488, "learning_rate": 2.812792501489099e-06, "loss": 0.0478, "step": 296650 }, { "epoch": 4.369228730062885, "grad_norm": 1.8841264247894287, "learning_rate": 2.811156129522114e-06, "loss": 0.0499, "step": 296675 }, { "epoch": 4.369596913152972, "grad_norm": 1.350530982017517, "learning_rate": 2.8095197575551297e-06, "loss": 0.0448, "step": 296700 }, { "epoch": 4.369965096243059, "grad_norm": 1.1562445163726807, "learning_rate": 2.807883385588145e-06, "loss": 0.0511, "step": 296725 }, { "epoch": 4.370333279333146, "grad_norm": 1.1276297569274902, "learning_rate": 2.8062470136211607e-06, "loss": 0.0488, "step": 296750 }, { "epoch": 4.370701462423233, "grad_norm": 1.4236958026885986, "learning_rate": 2.8046106416541756e-06, "loss": 0.0593, "step": 296775 }, { "epoch": 4.371069645513321, "grad_norm": 1.2665432691574097, "learning_rate": 2.8029742696871913e-06, "loss": 0.0543, "step": 296800 }, { "epoch": 4.371437828603408, "grad_norm": 1.5597598552703857, "learning_rate": 2.8013378977202066e-06, "loss": 0.0512, "step": 296825 }, { "epoch": 4.371806011693495, "grad_norm": 0.6048399806022644, "learning_rate": 2.7997015257532223e-06, "loss": 0.0517, "step": 296850 }, { "epoch": 4.372174194783582, "grad_norm": 1.1398652791976929, "learning_rate": 2.7980651537862376e-06, "loss": 0.0476, "step": 296875 }, { "epoch": 4.372542377873669, "grad_norm": 1.0361242294311523, "learning_rate": 2.796428781819253e-06, "loss": 0.0544, "step": 296900 }, { "epoch": 4.372910560963756, "grad_norm": 1.6299474239349365, "learning_rate": 2.7947924098522683e-06, "loss": 0.0549, "step": 296925 }, { "epoch": 4.373278744053843, "grad_norm": 1.4158477783203125, "learning_rate": 2.793156037885284e-06, "loss": 0.0554, "step": 296950 }, { "epoch": 4.37364692714393, "grad_norm": 1.4037063121795654, "learning_rate": 2.7915196659182997e-06, "loss": 0.0497, "step": 296975 }, { "epoch": 4.3740151102340175, "grad_norm": 1.048384428024292, "learning_rate": 2.789883293951315e-06, "loss": 0.0581, "step": 297000 }, { "epoch": 4.3743832933241045, "grad_norm": 1.5785359144210815, "learning_rate": 2.7882469219843307e-06, "loss": 0.0541, "step": 297025 }, { "epoch": 4.3747514764141915, "grad_norm": 0.9885494112968445, "learning_rate": 2.7866105500173456e-06, "loss": 0.0488, "step": 297050 }, { "epoch": 4.3751196595042785, "grad_norm": 1.119652271270752, "learning_rate": 2.7849741780503613e-06, "loss": 0.054, "step": 297075 }, { "epoch": 4.3754878425943655, "grad_norm": 1.4240087270736694, "learning_rate": 2.7833378060833767e-06, "loss": 0.0523, "step": 297100 }, { "epoch": 4.3758560256844525, "grad_norm": 1.3366814851760864, "learning_rate": 2.7817014341163924e-06, "loss": 0.0538, "step": 297125 }, { "epoch": 4.3762242087745395, "grad_norm": 1.2877370119094849, "learning_rate": 2.7800650621494073e-06, "loss": 0.0583, "step": 297150 }, { "epoch": 4.376592391864627, "grad_norm": 0.7757311463356018, "learning_rate": 2.778428690182423e-06, "loss": 0.0466, "step": 297175 }, { "epoch": 4.376960574954714, "grad_norm": 1.0005812644958496, "learning_rate": 2.7767923182154383e-06, "loss": 0.0568, "step": 297200 }, { "epoch": 4.377328758044801, "grad_norm": 1.170225977897644, "learning_rate": 2.775155946248454e-06, "loss": 0.0439, "step": 297225 }, { "epoch": 4.377696941134888, "grad_norm": 1.5662139654159546, "learning_rate": 2.7735195742814693e-06, "loss": 0.0462, "step": 297250 }, { "epoch": 4.378065124224975, "grad_norm": 1.189277172088623, "learning_rate": 2.7718832023144846e-06, "loss": 0.048, "step": 297275 }, { "epoch": 4.378433307315062, "grad_norm": 1.0078911781311035, "learning_rate": 2.7702468303475e-06, "loss": 0.0522, "step": 297300 }, { "epoch": 4.378801490405149, "grad_norm": 1.310375452041626, "learning_rate": 2.7686104583805157e-06, "loss": 0.0504, "step": 297325 }, { "epoch": 4.379169673495236, "grad_norm": 1.2654945850372314, "learning_rate": 2.766974086413531e-06, "loss": 0.0478, "step": 297350 }, { "epoch": 4.379537856585323, "grad_norm": 1.029016137123108, "learning_rate": 2.7653377144465467e-06, "loss": 0.0525, "step": 297375 }, { "epoch": 4.37990603967541, "grad_norm": 1.1610445976257324, "learning_rate": 2.7637013424795616e-06, "loss": 0.0483, "step": 297400 }, { "epoch": 4.380274222765497, "grad_norm": 1.5103274583816528, "learning_rate": 2.7620649705125773e-06, "loss": 0.0506, "step": 297425 }, { "epoch": 4.380642405855584, "grad_norm": 1.6109086275100708, "learning_rate": 2.760428598545593e-06, "loss": 0.053, "step": 297450 }, { "epoch": 4.381010588945671, "grad_norm": 1.547471284866333, "learning_rate": 2.7587922265786083e-06, "loss": 0.0517, "step": 297475 }, { "epoch": 4.381378772035758, "grad_norm": 1.4506932497024536, "learning_rate": 2.757155854611624e-06, "loss": 0.0461, "step": 297500 }, { "epoch": 4.381746955125845, "grad_norm": 1.1375908851623535, "learning_rate": 2.755519482644639e-06, "loss": 0.0472, "step": 297525 }, { "epoch": 4.382115138215932, "grad_norm": 1.2325059175491333, "learning_rate": 2.7538831106776547e-06, "loss": 0.0563, "step": 297550 }, { "epoch": 4.382483321306019, "grad_norm": 1.5177874565124512, "learning_rate": 2.75224673871067e-06, "loss": 0.0511, "step": 297575 }, { "epoch": 4.382851504396106, "grad_norm": 1.8435479402542114, "learning_rate": 2.7506103667436857e-06, "loss": 0.0505, "step": 297600 }, { "epoch": 4.383219687486193, "grad_norm": 1.104292392730713, "learning_rate": 2.7489739947767006e-06, "loss": 0.0587, "step": 297625 }, { "epoch": 4.38358787057628, "grad_norm": 1.0239830017089844, "learning_rate": 2.7473376228097163e-06, "loss": 0.0481, "step": 297650 }, { "epoch": 4.383956053666367, "grad_norm": 1.1710364818572998, "learning_rate": 2.7457012508427316e-06, "loss": 0.0511, "step": 297675 }, { "epoch": 4.384324236756454, "grad_norm": 1.2445671558380127, "learning_rate": 2.7440648788757473e-06, "loss": 0.0509, "step": 297700 }, { "epoch": 4.384692419846541, "grad_norm": 0.7922903299331665, "learning_rate": 2.7424285069087627e-06, "loss": 0.0456, "step": 297725 }, { "epoch": 4.385060602936628, "grad_norm": 1.3949757814407349, "learning_rate": 2.7407921349417784e-06, "loss": 0.0484, "step": 297750 }, { "epoch": 4.385428786026715, "grad_norm": 1.1927595138549805, "learning_rate": 2.7391557629747933e-06, "loss": 0.0538, "step": 297775 }, { "epoch": 4.385796969116802, "grad_norm": 1.561627984046936, "learning_rate": 2.737519391007809e-06, "loss": 0.0484, "step": 297800 }, { "epoch": 4.38616515220689, "grad_norm": 1.2204314470291138, "learning_rate": 2.7358830190408243e-06, "loss": 0.0511, "step": 297825 }, { "epoch": 4.386533335296977, "grad_norm": 1.379150629043579, "learning_rate": 2.73424664707384e-06, "loss": 0.0572, "step": 297850 }, { "epoch": 4.386901518387064, "grad_norm": 1.0495593547821045, "learning_rate": 2.732610275106855e-06, "loss": 0.0511, "step": 297875 }, { "epoch": 4.387269701477151, "grad_norm": 1.3866804838180542, "learning_rate": 2.7309739031398706e-06, "loss": 0.0552, "step": 297900 }, { "epoch": 4.387637884567238, "grad_norm": 1.5815870761871338, "learning_rate": 2.7293375311728864e-06, "loss": 0.055, "step": 297925 }, { "epoch": 4.388006067657325, "grad_norm": 1.0184849500656128, "learning_rate": 2.7277011592059017e-06, "loss": 0.0517, "step": 297950 }, { "epoch": 4.388374250747412, "grad_norm": 1.3937221765518188, "learning_rate": 2.7260647872389174e-06, "loss": 0.048, "step": 297975 }, { "epoch": 4.388742433837499, "grad_norm": 1.2227100133895874, "learning_rate": 2.7244284152719323e-06, "loss": 0.0495, "step": 298000 }, { "epoch": 4.389110616927586, "grad_norm": 1.2483971118927002, "learning_rate": 2.722792043304948e-06, "loss": 0.0513, "step": 298025 }, { "epoch": 4.389478800017673, "grad_norm": 1.6177209615707397, "learning_rate": 2.7211556713379633e-06, "loss": 0.0486, "step": 298050 }, { "epoch": 4.38984698310776, "grad_norm": 1.45516836643219, "learning_rate": 2.719519299370979e-06, "loss": 0.0532, "step": 298075 }, { "epoch": 4.390215166197847, "grad_norm": 1.3236064910888672, "learning_rate": 2.7179483822826738e-06, "loss": 0.0606, "step": 298100 }, { "epoch": 4.390583349287934, "grad_norm": 1.5390287637710571, "learning_rate": 2.716312010315689e-06, "loss": 0.0544, "step": 298125 }, { "epoch": 4.390951532378021, "grad_norm": 1.111795425415039, "learning_rate": 2.7146756383487044e-06, "loss": 0.0546, "step": 298150 }, { "epoch": 4.391319715468108, "grad_norm": 1.445644736289978, "learning_rate": 2.71303926638172e-06, "loss": 0.0552, "step": 298175 }, { "epoch": 4.391687898558195, "grad_norm": 0.9250501394271851, "learning_rate": 2.7114028944147354e-06, "loss": 0.0521, "step": 298200 }, { "epoch": 4.392056081648282, "grad_norm": 1.3016377687454224, "learning_rate": 2.709766522447751e-06, "loss": 0.0526, "step": 298225 }, { "epoch": 4.392424264738369, "grad_norm": 1.109683632850647, "learning_rate": 2.708130150480766e-06, "loss": 0.053, "step": 298250 }, { "epoch": 4.392792447828456, "grad_norm": 1.369711995124817, "learning_rate": 2.7064937785137817e-06, "loss": 0.0501, "step": 298275 }, { "epoch": 4.393160630918543, "grad_norm": 1.0356179475784302, "learning_rate": 2.704857406546797e-06, "loss": 0.0504, "step": 298300 }, { "epoch": 4.39352881400863, "grad_norm": 1.7696492671966553, "learning_rate": 2.7032210345798128e-06, "loss": 0.0522, "step": 298325 }, { "epoch": 4.393896997098717, "grad_norm": 1.1351957321166992, "learning_rate": 2.7015846626128276e-06, "loss": 0.0515, "step": 298350 }, { "epoch": 4.394265180188804, "grad_norm": 1.5372394323349, "learning_rate": 2.6999482906458434e-06, "loss": 0.057, "step": 298375 }, { "epoch": 4.394633363278891, "grad_norm": 1.4864375591278076, "learning_rate": 2.698311918678859e-06, "loss": 0.0532, "step": 298400 }, { "epoch": 4.395001546368978, "grad_norm": 1.473694920539856, "learning_rate": 2.6966755467118744e-06, "loss": 0.0524, "step": 298425 }, { "epoch": 4.395369729459065, "grad_norm": 1.9480584859848022, "learning_rate": 2.69503917474489e-06, "loss": 0.0558, "step": 298450 }, { "epoch": 4.395737912549152, "grad_norm": 1.272112488746643, "learning_rate": 2.693402802777905e-06, "loss": 0.0504, "step": 298475 }, { "epoch": 4.396106095639239, "grad_norm": 1.0264941453933716, "learning_rate": 2.6917664308109207e-06, "loss": 0.0451, "step": 298500 }, { "epoch": 4.396474278729326, "grad_norm": 1.3120458126068115, "learning_rate": 2.690130058843936e-06, "loss": 0.0506, "step": 298525 }, { "epoch": 4.396842461819413, "grad_norm": 1.693663477897644, "learning_rate": 2.6884936868769518e-06, "loss": 0.0527, "step": 298550 }, { "epoch": 4.3972106449095, "grad_norm": 0.8481330871582031, "learning_rate": 2.686857314909967e-06, "loss": 0.0471, "step": 298575 }, { "epoch": 4.397578827999587, "grad_norm": 1.2794184684753418, "learning_rate": 2.685220942942983e-06, "loss": 0.0561, "step": 298600 }, { "epoch": 4.397947011089674, "grad_norm": 0.4349769353866577, "learning_rate": 2.6835845709759977e-06, "loss": 0.0498, "step": 298625 }, { "epoch": 4.398315194179761, "grad_norm": 1.0860872268676758, "learning_rate": 2.6819481990090134e-06, "loss": 0.0487, "step": 298650 }, { "epoch": 4.398683377269848, "grad_norm": 1.555217981338501, "learning_rate": 2.6803118270420287e-06, "loss": 0.0584, "step": 298675 }, { "epoch": 4.399051560359935, "grad_norm": 1.3808329105377197, "learning_rate": 2.6786754550750445e-06, "loss": 0.0576, "step": 298700 }, { "epoch": 4.399419743450023, "grad_norm": 1.217711329460144, "learning_rate": 2.6770390831080593e-06, "loss": 0.055, "step": 298725 }, { "epoch": 4.39978792654011, "grad_norm": 1.384169101715088, "learning_rate": 2.675402711141075e-06, "loss": 0.0506, "step": 298750 }, { "epoch": 4.400156109630197, "grad_norm": 0.9789212942123413, "learning_rate": 2.6737663391740904e-06, "loss": 0.046, "step": 298775 }, { "epoch": 4.400524292720284, "grad_norm": 2.006904125213623, "learning_rate": 2.672129967207106e-06, "loss": 0.0534, "step": 298800 }, { "epoch": 4.400892475810371, "grad_norm": 0.9747607707977295, "learning_rate": 2.6704935952401214e-06, "loss": 0.0531, "step": 298825 }, { "epoch": 4.401260658900458, "grad_norm": 0.8829006552696228, "learning_rate": 2.6688572232731367e-06, "loss": 0.0511, "step": 298850 }, { "epoch": 4.401628841990545, "grad_norm": 1.4461387395858765, "learning_rate": 2.667220851306152e-06, "loss": 0.0517, "step": 298875 }, { "epoch": 4.401997025080632, "grad_norm": 1.2935779094696045, "learning_rate": 2.6655844793391677e-06, "loss": 0.0504, "step": 298900 }, { "epoch": 4.402365208170719, "grad_norm": 1.8071701526641846, "learning_rate": 2.6639481073721835e-06, "loss": 0.0502, "step": 298925 }, { "epoch": 4.402733391260806, "grad_norm": 1.2152225971221924, "learning_rate": 2.6623117354051988e-06, "loss": 0.0516, "step": 298950 }, { "epoch": 4.403101574350893, "grad_norm": 1.7807745933532715, "learning_rate": 2.6606753634382145e-06, "loss": 0.0545, "step": 298975 }, { "epoch": 4.4034697574409805, "grad_norm": 1.3733631372451782, "learning_rate": 2.6590389914712294e-06, "loss": 0.0563, "step": 299000 }, { "epoch": 4.4038379405310675, "grad_norm": 1.5601617097854614, "learning_rate": 2.657402619504245e-06, "loss": 0.0576, "step": 299025 }, { "epoch": 4.4042061236211545, "grad_norm": 1.0742367506027222, "learning_rate": 2.6557662475372604e-06, "loss": 0.0582, "step": 299050 }, { "epoch": 4.4045743067112415, "grad_norm": 0.9975115656852722, "learning_rate": 2.654129875570276e-06, "loss": 0.0557, "step": 299075 }, { "epoch": 4.4049424898013285, "grad_norm": 1.9738733768463135, "learning_rate": 2.652493503603291e-06, "loss": 0.0523, "step": 299100 }, { "epoch": 4.4053106728914155, "grad_norm": 1.4237779378890991, "learning_rate": 2.6508571316363067e-06, "loss": 0.0516, "step": 299125 }, { "epoch": 4.4056788559815026, "grad_norm": 1.1410068273544312, "learning_rate": 2.649220759669322e-06, "loss": 0.0545, "step": 299150 }, { "epoch": 4.40604703907159, "grad_norm": 1.1840661764144897, "learning_rate": 2.6475843877023378e-06, "loss": 0.0532, "step": 299175 }, { "epoch": 4.406415222161677, "grad_norm": 1.254218339920044, "learning_rate": 2.645948015735353e-06, "loss": 0.0572, "step": 299200 }, { "epoch": 4.406783405251764, "grad_norm": 1.343315839767456, "learning_rate": 2.6443116437683684e-06, "loss": 0.0548, "step": 299225 }, { "epoch": 4.407151588341851, "grad_norm": 1.2422288656234741, "learning_rate": 2.6426752718013837e-06, "loss": 0.052, "step": 299250 }, { "epoch": 4.407519771431938, "grad_norm": 1.7081353664398193, "learning_rate": 2.6410388998343994e-06, "loss": 0.0539, "step": 299275 }, { "epoch": 4.407887954522025, "grad_norm": 1.272098183631897, "learning_rate": 2.6394025278674147e-06, "loss": 0.0461, "step": 299300 }, { "epoch": 4.408256137612112, "grad_norm": 1.390230417251587, "learning_rate": 2.6377661559004305e-06, "loss": 0.0561, "step": 299325 }, { "epoch": 4.408624320702199, "grad_norm": 1.0581073760986328, "learning_rate": 2.6361297839334453e-06, "loss": 0.0426, "step": 299350 }, { "epoch": 4.408992503792286, "grad_norm": 0.8326212167739868, "learning_rate": 2.634493411966461e-06, "loss": 0.0501, "step": 299375 }, { "epoch": 4.409360686882373, "grad_norm": 0.8531941771507263, "learning_rate": 2.6328570399994768e-06, "loss": 0.0529, "step": 299400 }, { "epoch": 4.40972886997246, "grad_norm": 2.0169854164123535, "learning_rate": 2.631220668032492e-06, "loss": 0.0551, "step": 299425 }, { "epoch": 4.410097053062547, "grad_norm": 1.4882276058197021, "learning_rate": 2.629584296065508e-06, "loss": 0.0519, "step": 299450 }, { "epoch": 4.410465236152634, "grad_norm": 1.610654592514038, "learning_rate": 2.6279479240985227e-06, "loss": 0.0532, "step": 299475 }, { "epoch": 4.410833419242721, "grad_norm": 1.629957675933838, "learning_rate": 2.6263115521315384e-06, "loss": 0.0539, "step": 299500 }, { "epoch": 4.411201602332808, "grad_norm": 1.3898857831954956, "learning_rate": 2.6246751801645537e-06, "loss": 0.0477, "step": 299525 }, { "epoch": 4.411569785422895, "grad_norm": 1.2525744438171387, "learning_rate": 2.6230388081975695e-06, "loss": 0.0543, "step": 299550 }, { "epoch": 4.411937968512982, "grad_norm": 0.7920873761177063, "learning_rate": 2.6214024362305843e-06, "loss": 0.0507, "step": 299575 }, { "epoch": 4.412306151603069, "grad_norm": 1.3093024492263794, "learning_rate": 2.6197660642636e-06, "loss": 0.0532, "step": 299600 }, { "epoch": 4.412674334693156, "grad_norm": 1.4931198358535767, "learning_rate": 2.6181296922966154e-06, "loss": 0.051, "step": 299625 }, { "epoch": 4.413042517783243, "grad_norm": 1.462104082107544, "learning_rate": 2.616493320329631e-06, "loss": 0.0631, "step": 299650 }, { "epoch": 4.41341070087333, "grad_norm": 1.5805273056030273, "learning_rate": 2.6148569483626464e-06, "loss": 0.0508, "step": 299675 }, { "epoch": 4.413778883963417, "grad_norm": 1.3844873905181885, "learning_rate": 2.613220576395662e-06, "loss": 0.0505, "step": 299700 }, { "epoch": 4.414147067053504, "grad_norm": 1.4826583862304688, "learning_rate": 2.611584204428677e-06, "loss": 0.0536, "step": 299725 }, { "epoch": 4.414515250143592, "grad_norm": 1.5089002847671509, "learning_rate": 2.6099478324616927e-06, "loss": 0.0525, "step": 299750 }, { "epoch": 4.414883433233679, "grad_norm": 1.598009467124939, "learning_rate": 2.608311460494708e-06, "loss": 0.0574, "step": 299775 }, { "epoch": 4.415251616323766, "grad_norm": 1.2918604612350464, "learning_rate": 2.6066750885277238e-06, "loss": 0.0497, "step": 299800 }, { "epoch": 4.415619799413853, "grad_norm": 1.3681979179382324, "learning_rate": 2.6050387165607387e-06, "loss": 0.0492, "step": 299825 }, { "epoch": 4.41598798250394, "grad_norm": 1.422539234161377, "learning_rate": 2.6034023445937544e-06, "loss": 0.0504, "step": 299850 }, { "epoch": 4.416356165594027, "grad_norm": 1.8677875995635986, "learning_rate": 2.60176597262677e-06, "loss": 0.0562, "step": 299875 }, { "epoch": 4.416724348684114, "grad_norm": 1.4954921007156372, "learning_rate": 2.6001296006597854e-06, "loss": 0.0495, "step": 299900 }, { "epoch": 4.417092531774201, "grad_norm": 0.998248815536499, "learning_rate": 2.598493228692801e-06, "loss": 0.0475, "step": 299925 }, { "epoch": 4.417460714864288, "grad_norm": 1.0428069829940796, "learning_rate": 2.596856856725816e-06, "loss": 0.0476, "step": 299950 }, { "epoch": 4.417828897954375, "grad_norm": 1.4951359033584595, "learning_rate": 2.5952204847588318e-06, "loss": 0.0563, "step": 299975 }, { "epoch": 4.418197081044462, "grad_norm": 1.082573413848877, "learning_rate": 2.593584112791847e-06, "loss": 0.0516, "step": 300000 }, { "epoch": 4.418565264134549, "grad_norm": 1.1197725534439087, "learning_rate": 2.5919477408248628e-06, "loss": 0.048, "step": 300025 }, { "epoch": 4.418933447224636, "grad_norm": 1.5714365243911743, "learning_rate": 2.590311368857878e-06, "loss": 0.0472, "step": 300050 }, { "epoch": 4.419301630314723, "grad_norm": 1.0920747518539429, "learning_rate": 2.588674996890894e-06, "loss": 0.0513, "step": 300075 }, { "epoch": 4.41966981340481, "grad_norm": 0.701291024684906, "learning_rate": 2.5870386249239087e-06, "loss": 0.0459, "step": 300100 }, { "epoch": 4.420037996494897, "grad_norm": 1.2260955572128296, "learning_rate": 2.5854022529569244e-06, "loss": 0.0532, "step": 300125 }, { "epoch": 4.420406179584984, "grad_norm": 1.5864808559417725, "learning_rate": 2.5837658809899397e-06, "loss": 0.0575, "step": 300150 }, { "epoch": 4.420774362675071, "grad_norm": 1.6229859590530396, "learning_rate": 2.5821295090229555e-06, "loss": 0.0479, "step": 300175 }, { "epoch": 4.421142545765158, "grad_norm": 1.4057663679122925, "learning_rate": 2.5804931370559703e-06, "loss": 0.0584, "step": 300200 }, { "epoch": 4.421510728855245, "grad_norm": 1.6148444414138794, "learning_rate": 2.578856765088986e-06, "loss": 0.0518, "step": 300225 }, { "epoch": 4.421878911945332, "grad_norm": 0.7885890603065491, "learning_rate": 2.5772203931220014e-06, "loss": 0.0474, "step": 300250 }, { "epoch": 4.422247095035419, "grad_norm": 1.4228276014328003, "learning_rate": 2.575584021155017e-06, "loss": 0.0495, "step": 300275 }, { "epoch": 4.422615278125506, "grad_norm": 1.3355226516723633, "learning_rate": 2.5739476491880324e-06, "loss": 0.0519, "step": 300300 }, { "epoch": 4.422983461215593, "grad_norm": 0.9256533980369568, "learning_rate": 2.5723112772210477e-06, "loss": 0.055, "step": 300325 }, { "epoch": 4.42335164430568, "grad_norm": 1.1552969217300415, "learning_rate": 2.570674905254063e-06, "loss": 0.0492, "step": 300350 }, { "epoch": 4.423719827395767, "grad_norm": 0.8856759071350098, "learning_rate": 2.5690385332870787e-06, "loss": 0.0494, "step": 300375 }, { "epoch": 4.424088010485854, "grad_norm": 0.7557277679443359, "learning_rate": 2.5674021613200945e-06, "loss": 0.048, "step": 300400 }, { "epoch": 4.424456193575941, "grad_norm": 1.100771188735962, "learning_rate": 2.5657657893531098e-06, "loss": 0.0521, "step": 300425 }, { "epoch": 4.424824376666028, "grad_norm": 1.4789375066757202, "learning_rate": 2.5641294173861255e-06, "loss": 0.0531, "step": 300450 }, { "epoch": 4.425192559756115, "grad_norm": 1.3899985551834106, "learning_rate": 2.5624930454191404e-06, "loss": 0.0465, "step": 300475 }, { "epoch": 4.425560742846202, "grad_norm": 1.5925854444503784, "learning_rate": 2.560856673452156e-06, "loss": 0.0512, "step": 300500 }, { "epoch": 4.425928925936289, "grad_norm": 1.3255113363265991, "learning_rate": 2.5592203014851714e-06, "loss": 0.0537, "step": 300525 }, { "epoch": 4.426297109026376, "grad_norm": 1.81946861743927, "learning_rate": 2.557583929518187e-06, "loss": 0.0564, "step": 300550 }, { "epoch": 4.426665292116463, "grad_norm": 1.4383327960968018, "learning_rate": 2.555947557551202e-06, "loss": 0.0471, "step": 300575 }, { "epoch": 4.42703347520655, "grad_norm": 1.5087847709655762, "learning_rate": 2.5543111855842178e-06, "loss": 0.0584, "step": 300600 }, { "epoch": 4.427401658296637, "grad_norm": 1.9436042308807373, "learning_rate": 2.552674813617233e-06, "loss": 0.054, "step": 300625 }, { "epoch": 4.427769841386725, "grad_norm": 0.8835351467132568, "learning_rate": 2.5510384416502488e-06, "loss": 0.048, "step": 300650 }, { "epoch": 4.428138024476812, "grad_norm": 1.235834002494812, "learning_rate": 2.5494020696832637e-06, "loss": 0.0468, "step": 300675 }, { "epoch": 4.428506207566899, "grad_norm": 1.4335917234420776, "learning_rate": 2.5477656977162794e-06, "loss": 0.0549, "step": 300700 }, { "epoch": 4.428874390656986, "grad_norm": 1.2068747282028198, "learning_rate": 2.5461293257492947e-06, "loss": 0.0568, "step": 300725 }, { "epoch": 4.429242573747073, "grad_norm": 1.0815385580062866, "learning_rate": 2.5444929537823104e-06, "loss": 0.0509, "step": 300750 }, { "epoch": 4.42961075683716, "grad_norm": 0.9886804223060608, "learning_rate": 2.5428565818153257e-06, "loss": 0.0536, "step": 300775 }, { "epoch": 4.429978939927247, "grad_norm": 1.6844570636749268, "learning_rate": 2.5412202098483415e-06, "loss": 0.0512, "step": 300800 }, { "epoch": 4.430347123017334, "grad_norm": 1.2598366737365723, "learning_rate": 2.5395838378813563e-06, "loss": 0.0509, "step": 300825 }, { "epoch": 4.430715306107421, "grad_norm": 1.3276710510253906, "learning_rate": 2.537947465914372e-06, "loss": 0.0495, "step": 300850 }, { "epoch": 4.431083489197508, "grad_norm": 0.822670578956604, "learning_rate": 2.536311093947388e-06, "loss": 0.0536, "step": 300875 }, { "epoch": 4.431451672287595, "grad_norm": 1.4422811269760132, "learning_rate": 2.5347401768590825e-06, "loss": 0.0577, "step": 300900 }, { "epoch": 4.431819855377682, "grad_norm": 1.4851795434951782, "learning_rate": 2.5331038048920982e-06, "loss": 0.057, "step": 300925 }, { "epoch": 4.432188038467769, "grad_norm": 1.595252513885498, "learning_rate": 2.531467432925113e-06, "loss": 0.047, "step": 300950 }, { "epoch": 4.4325562215578564, "grad_norm": 1.4419323205947876, "learning_rate": 2.529831060958129e-06, "loss": 0.0517, "step": 300975 }, { "epoch": 4.4329244046479435, "grad_norm": 1.0129940509796143, "learning_rate": 2.528194688991144e-06, "loss": 0.05, "step": 301000 }, { "epoch": 4.4332925877380305, "grad_norm": 1.3844878673553467, "learning_rate": 2.52655831702416e-06, "loss": 0.0524, "step": 301025 }, { "epoch": 4.4336607708281175, "grad_norm": 1.4275884628295898, "learning_rate": 2.5249219450571748e-06, "loss": 0.0441, "step": 301050 }, { "epoch": 4.4340289539182045, "grad_norm": 1.6299229860305786, "learning_rate": 2.5232855730901905e-06, "loss": 0.0533, "step": 301075 }, { "epoch": 4.4343971370082915, "grad_norm": 0.8308444023132324, "learning_rate": 2.521649201123206e-06, "loss": 0.0507, "step": 301100 }, { "epoch": 4.4347653200983785, "grad_norm": 1.664947509765625, "learning_rate": 2.5200128291562215e-06, "loss": 0.0465, "step": 301125 }, { "epoch": 4.4351335031884656, "grad_norm": 1.0470316410064697, "learning_rate": 2.5183764571892364e-06, "loss": 0.0442, "step": 301150 }, { "epoch": 4.435501686278553, "grad_norm": 0.891488790512085, "learning_rate": 2.516740085222252e-06, "loss": 0.0522, "step": 301175 }, { "epoch": 4.43586986936864, "grad_norm": 1.3064780235290527, "learning_rate": 2.5151037132552674e-06, "loss": 0.0455, "step": 301200 }, { "epoch": 4.436238052458727, "grad_norm": 1.6625359058380127, "learning_rate": 2.513467341288283e-06, "loss": 0.0641, "step": 301225 }, { "epoch": 4.436606235548814, "grad_norm": 1.353169560432434, "learning_rate": 2.5118309693212985e-06, "loss": 0.0588, "step": 301250 }, { "epoch": 4.436974418638901, "grad_norm": 1.3421375751495361, "learning_rate": 2.510194597354314e-06, "loss": 0.0512, "step": 301275 }, { "epoch": 4.437342601728988, "grad_norm": 0.7791436314582825, "learning_rate": 2.508558225387329e-06, "loss": 0.049, "step": 301300 }, { "epoch": 4.437710784819075, "grad_norm": 1.415158748626709, "learning_rate": 2.506921853420345e-06, "loss": 0.0603, "step": 301325 }, { "epoch": 4.438078967909162, "grad_norm": 1.0226030349731445, "learning_rate": 2.5052854814533605e-06, "loss": 0.048, "step": 301350 }, { "epoch": 4.438447150999249, "grad_norm": 1.190688967704773, "learning_rate": 2.503649109486376e-06, "loss": 0.0462, "step": 301375 }, { "epoch": 4.438815334089336, "grad_norm": 1.3343232870101929, "learning_rate": 2.5020127375193916e-06, "loss": 0.0551, "step": 301400 }, { "epoch": 4.439183517179423, "grad_norm": 1.429953932762146, "learning_rate": 2.5003763655524065e-06, "loss": 0.0508, "step": 301425 }, { "epoch": 4.43955170026951, "grad_norm": 1.3824851512908936, "learning_rate": 2.498739993585422e-06, "loss": 0.0502, "step": 301450 }, { "epoch": 4.439919883359597, "grad_norm": 1.5845098495483398, "learning_rate": 2.4971036216184375e-06, "loss": 0.0587, "step": 301475 }, { "epoch": 4.440288066449684, "grad_norm": 1.8588379621505737, "learning_rate": 2.495467249651453e-06, "loss": 0.0493, "step": 301500 }, { "epoch": 4.440656249539771, "grad_norm": 1.6703959703445435, "learning_rate": 2.493830877684468e-06, "loss": 0.0535, "step": 301525 }, { "epoch": 4.441024432629858, "grad_norm": 1.5778106451034546, "learning_rate": 2.492194505717484e-06, "loss": 0.0554, "step": 301550 }, { "epoch": 4.441392615719945, "grad_norm": 1.471972942352295, "learning_rate": 2.4905581337504996e-06, "loss": 0.0484, "step": 301575 }, { "epoch": 4.441760798810032, "grad_norm": 1.1855192184448242, "learning_rate": 2.488921761783515e-06, "loss": 0.0498, "step": 301600 }, { "epoch": 4.442128981900119, "grad_norm": 1.611812949180603, "learning_rate": 2.48728538981653e-06, "loss": 0.0459, "step": 301625 }, { "epoch": 4.442497164990206, "grad_norm": 1.5399442911148071, "learning_rate": 2.485649017849546e-06, "loss": 0.061, "step": 301650 }, { "epoch": 4.442865348080294, "grad_norm": 1.0918532609939575, "learning_rate": 2.484012645882561e-06, "loss": 0.0498, "step": 301675 }, { "epoch": 4.443233531170381, "grad_norm": 0.7599321007728577, "learning_rate": 2.4823762739155765e-06, "loss": 0.0529, "step": 301700 }, { "epoch": 4.443601714260468, "grad_norm": 1.6148830652236938, "learning_rate": 2.480739901948592e-06, "loss": 0.0517, "step": 301725 }, { "epoch": 4.443969897350555, "grad_norm": 1.2528841495513916, "learning_rate": 2.4791035299816075e-06, "loss": 0.0491, "step": 301750 }, { "epoch": 4.444338080440642, "grad_norm": 1.618980884552002, "learning_rate": 2.477467158014623e-06, "loss": 0.0486, "step": 301775 }, { "epoch": 4.444706263530729, "grad_norm": 0.9234166145324707, "learning_rate": 2.475830786047638e-06, "loss": 0.0524, "step": 301800 }, { "epoch": 4.445074446620816, "grad_norm": 1.7744756937026978, "learning_rate": 2.474194414080654e-06, "loss": 0.0505, "step": 301825 }, { "epoch": 4.445442629710903, "grad_norm": 1.1677004098892212, "learning_rate": 2.472558042113669e-06, "loss": 0.0591, "step": 301850 }, { "epoch": 4.44581081280099, "grad_norm": 1.5444965362548828, "learning_rate": 2.4709216701466845e-06, "loss": 0.0512, "step": 301875 }, { "epoch": 4.446178995891077, "grad_norm": 1.2752526998519897, "learning_rate": 2.4692852981796998e-06, "loss": 0.0531, "step": 301900 }, { "epoch": 4.446547178981164, "grad_norm": 1.4353508949279785, "learning_rate": 2.4676489262127155e-06, "loss": 0.0484, "step": 301925 }, { "epoch": 4.446915362071251, "grad_norm": 1.9133244752883911, "learning_rate": 2.466012554245731e-06, "loss": 0.0505, "step": 301950 }, { "epoch": 4.447283545161338, "grad_norm": 1.2566324472427368, "learning_rate": 2.464376182278746e-06, "loss": 0.0502, "step": 301975 }, { "epoch": 4.447651728251425, "grad_norm": 1.4808827638626099, "learning_rate": 2.462739810311762e-06, "loss": 0.0492, "step": 302000 }, { "epoch": 4.448019911341512, "grad_norm": 0.9286792874336243, "learning_rate": 2.461103438344777e-06, "loss": 0.0472, "step": 302025 }, { "epoch": 4.448388094431599, "grad_norm": 0.6995263695716858, "learning_rate": 2.4594670663777925e-06, "loss": 0.058, "step": 302050 }, { "epoch": 4.448756277521686, "grad_norm": 1.1706923246383667, "learning_rate": 2.457830694410808e-06, "loss": 0.0463, "step": 302075 }, { "epoch": 4.449124460611773, "grad_norm": 0.8549727201461792, "learning_rate": 2.4561943224438235e-06, "loss": 0.0556, "step": 302100 }, { "epoch": 4.44949264370186, "grad_norm": 0.8848899602890015, "learning_rate": 2.4545579504768392e-06, "loss": 0.0546, "step": 302125 }, { "epoch": 4.449860826791947, "grad_norm": 1.4121531248092651, "learning_rate": 2.4529215785098545e-06, "loss": 0.0498, "step": 302150 }, { "epoch": 4.450229009882034, "grad_norm": 1.789003610610962, "learning_rate": 2.45128520654287e-06, "loss": 0.0505, "step": 302175 }, { "epoch": 4.450597192972121, "grad_norm": 1.1204239130020142, "learning_rate": 2.4496488345758856e-06, "loss": 0.0531, "step": 302200 }, { "epoch": 4.450965376062208, "grad_norm": 1.5883513689041138, "learning_rate": 2.448012462608901e-06, "loss": 0.0558, "step": 302225 }, { "epoch": 4.451333559152295, "grad_norm": 1.1423672437667847, "learning_rate": 2.446376090641916e-06, "loss": 0.0487, "step": 302250 }, { "epoch": 4.451701742242382, "grad_norm": 1.250074028968811, "learning_rate": 2.4447397186749315e-06, "loss": 0.0483, "step": 302275 }, { "epoch": 4.452069925332469, "grad_norm": 1.2350640296936035, "learning_rate": 2.443103346707947e-06, "loss": 0.0537, "step": 302300 }, { "epoch": 4.452438108422556, "grad_norm": 1.8885859251022339, "learning_rate": 2.4414669747409625e-06, "loss": 0.0547, "step": 302325 }, { "epoch": 4.452806291512643, "grad_norm": 1.1005498170852661, "learning_rate": 2.439830602773978e-06, "loss": 0.0471, "step": 302350 }, { "epoch": 4.45317447460273, "grad_norm": 1.6805378198623657, "learning_rate": 2.4381942308069935e-06, "loss": 0.053, "step": 302375 }, { "epoch": 4.453542657692817, "grad_norm": 0.9430249929428101, "learning_rate": 2.436557858840009e-06, "loss": 0.0566, "step": 302400 }, { "epoch": 4.453910840782904, "grad_norm": 1.1696069240570068, "learning_rate": 2.434921486873024e-06, "loss": 0.0552, "step": 302425 }, { "epoch": 4.454279023872991, "grad_norm": 1.346459984779358, "learning_rate": 2.4332851149060394e-06, "loss": 0.0534, "step": 302450 }, { "epoch": 4.454647206963078, "grad_norm": 1.5569554567337036, "learning_rate": 2.431648742939055e-06, "loss": 0.052, "step": 302475 }, { "epoch": 4.455015390053165, "grad_norm": 1.1602433919906616, "learning_rate": 2.4300123709720705e-06, "loss": 0.057, "step": 302500 }, { "epoch": 4.455383573143252, "grad_norm": 1.3797434568405151, "learning_rate": 2.4283759990050858e-06, "loss": 0.0517, "step": 302525 }, { "epoch": 4.455751756233339, "grad_norm": 1.4027955532073975, "learning_rate": 2.4267396270381015e-06, "loss": 0.0515, "step": 302550 }, { "epoch": 4.456119939323427, "grad_norm": 1.0705606937408447, "learning_rate": 2.4251032550711172e-06, "loss": 0.0472, "step": 302575 }, { "epoch": 4.456488122413514, "grad_norm": 1.7451097965240479, "learning_rate": 2.4234668831041325e-06, "loss": 0.0505, "step": 302600 }, { "epoch": 4.456856305503601, "grad_norm": 1.7931379079818726, "learning_rate": 2.421830511137148e-06, "loss": 0.0539, "step": 302625 }, { "epoch": 4.457224488593688, "grad_norm": 1.5439224243164062, "learning_rate": 2.420194139170163e-06, "loss": 0.0518, "step": 302650 }, { "epoch": 4.457592671683775, "grad_norm": 1.378536581993103, "learning_rate": 2.418557767203179e-06, "loss": 0.0538, "step": 302675 }, { "epoch": 4.457960854773862, "grad_norm": 1.214421033859253, "learning_rate": 2.416921395236194e-06, "loss": 0.0491, "step": 302700 }, { "epoch": 4.458329037863949, "grad_norm": 1.3873339891433716, "learning_rate": 2.4152850232692095e-06, "loss": 0.0486, "step": 302725 }, { "epoch": 4.458697220954036, "grad_norm": 1.8396759033203125, "learning_rate": 2.4136486513022252e-06, "loss": 0.0533, "step": 302750 }, { "epoch": 4.459065404044123, "grad_norm": 0.9953267574310303, "learning_rate": 2.4120122793352405e-06, "loss": 0.0522, "step": 302775 }, { "epoch": 4.45943358713421, "grad_norm": 1.5243364572525024, "learning_rate": 2.410375907368256e-06, "loss": 0.0586, "step": 302800 }, { "epoch": 4.459801770224297, "grad_norm": 1.5203975439071655, "learning_rate": 2.408739535401271e-06, "loss": 0.0483, "step": 302825 }, { "epoch": 4.460169953314384, "grad_norm": 0.6166027784347534, "learning_rate": 2.407103163434287e-06, "loss": 0.0553, "step": 302850 }, { "epoch": 4.460538136404471, "grad_norm": 1.088309407234192, "learning_rate": 2.405466791467302e-06, "loss": 0.051, "step": 302875 }, { "epoch": 4.460906319494558, "grad_norm": 1.5143396854400635, "learning_rate": 2.4038304195003175e-06, "loss": 0.0521, "step": 302900 }, { "epoch": 4.461274502584645, "grad_norm": 1.5423846244812012, "learning_rate": 2.402194047533333e-06, "loss": 0.0538, "step": 302925 }, { "epoch": 4.461642685674732, "grad_norm": 0.7741517424583435, "learning_rate": 2.4005576755663485e-06, "loss": 0.0539, "step": 302950 }, { "epoch": 4.4620108687648194, "grad_norm": 1.6776727437973022, "learning_rate": 2.398921303599364e-06, "loss": 0.052, "step": 302975 }, { "epoch": 4.4623790518549065, "grad_norm": 2.0980570316314697, "learning_rate": 2.397284931632379e-06, "loss": 0.0594, "step": 303000 }, { "epoch": 4.4627472349449935, "grad_norm": 1.3102998733520508, "learning_rate": 2.395648559665395e-06, "loss": 0.0484, "step": 303025 }, { "epoch": 4.4631154180350805, "grad_norm": 1.2197163105010986, "learning_rate": 2.3940121876984106e-06, "loss": 0.0497, "step": 303050 }, { "epoch": 4.4634836011251675, "grad_norm": 1.6866099834442139, "learning_rate": 2.392375815731426e-06, "loss": 0.0556, "step": 303075 }, { "epoch": 4.4638517842152545, "grad_norm": 0.8207196593284607, "learning_rate": 2.390739443764441e-06, "loss": 0.0486, "step": 303100 }, { "epoch": 4.4642199673053415, "grad_norm": 1.1538947820663452, "learning_rate": 2.389103071797457e-06, "loss": 0.057, "step": 303125 }, { "epoch": 4.464588150395429, "grad_norm": 1.5906556844711304, "learning_rate": 2.387466699830472e-06, "loss": 0.0492, "step": 303150 }, { "epoch": 4.464956333485516, "grad_norm": 1.5449646711349487, "learning_rate": 2.3858303278634875e-06, "loss": 0.0587, "step": 303175 }, { "epoch": 4.465324516575603, "grad_norm": 1.6102849245071411, "learning_rate": 2.384193955896503e-06, "loss": 0.0543, "step": 303200 }, { "epoch": 4.46569269966569, "grad_norm": 0.7648277878761292, "learning_rate": 2.3825575839295185e-06, "loss": 0.0532, "step": 303225 }, { "epoch": 4.466060882755777, "grad_norm": 1.6596821546554565, "learning_rate": 2.380921211962534e-06, "loss": 0.047, "step": 303250 }, { "epoch": 4.466429065845864, "grad_norm": 1.2146966457366943, "learning_rate": 2.379284839995549e-06, "loss": 0.0461, "step": 303275 }, { "epoch": 4.466797248935951, "grad_norm": 1.5407129526138306, "learning_rate": 2.377648468028565e-06, "loss": 0.054, "step": 303300 }, { "epoch": 4.467165432026038, "grad_norm": 2.0960118770599365, "learning_rate": 2.37601209606158e-06, "loss": 0.053, "step": 303325 }, { "epoch": 4.467533615116125, "grad_norm": 1.0898778438568115, "learning_rate": 2.3743757240945955e-06, "loss": 0.0467, "step": 303350 }, { "epoch": 4.467901798206212, "grad_norm": 1.2167497873306274, "learning_rate": 2.372739352127611e-06, "loss": 0.0476, "step": 303375 }, { "epoch": 4.468269981296299, "grad_norm": 1.5868618488311768, "learning_rate": 2.3711029801606265e-06, "loss": 0.0472, "step": 303400 }, { "epoch": 4.468638164386386, "grad_norm": 1.0690587759017944, "learning_rate": 2.369466608193642e-06, "loss": 0.0556, "step": 303425 }, { "epoch": 4.469006347476473, "grad_norm": 1.8434339761734009, "learning_rate": 2.367830236226657e-06, "loss": 0.049, "step": 303450 }, { "epoch": 4.46937453056656, "grad_norm": 1.3922070264816284, "learning_rate": 2.366193864259673e-06, "loss": 0.055, "step": 303475 }, { "epoch": 4.469742713656647, "grad_norm": 1.1121543645858765, "learning_rate": 2.364557492292688e-06, "loss": 0.0454, "step": 303500 }, { "epoch": 4.470110896746734, "grad_norm": 1.2854058742523193, "learning_rate": 2.3629211203257035e-06, "loss": 0.0515, "step": 303525 }, { "epoch": 4.470479079836821, "grad_norm": 1.4660992622375488, "learning_rate": 2.361284748358719e-06, "loss": 0.0518, "step": 303550 }, { "epoch": 4.470847262926909, "grad_norm": 1.7707901000976562, "learning_rate": 2.3596483763917345e-06, "loss": 0.0505, "step": 303575 }, { "epoch": 4.471215446016996, "grad_norm": 1.2327923774719238, "learning_rate": 2.3580120044247502e-06, "loss": 0.0513, "step": 303600 }, { "epoch": 4.471583629107083, "grad_norm": 1.302258014678955, "learning_rate": 2.3563756324577655e-06, "loss": 0.0477, "step": 303625 }, { "epoch": 4.47195181219717, "grad_norm": 1.5161261558532715, "learning_rate": 2.354739260490781e-06, "loss": 0.0499, "step": 303650 }, { "epoch": 4.472319995287257, "grad_norm": 1.1093387603759766, "learning_rate": 2.3531028885237966e-06, "loss": 0.06, "step": 303675 }, { "epoch": 4.472688178377344, "grad_norm": 1.2082892656326294, "learning_rate": 2.351466516556812e-06, "loss": 0.0603, "step": 303700 }, { "epoch": 4.473056361467431, "grad_norm": 1.7611470222473145, "learning_rate": 2.349830144589827e-06, "loss": 0.0485, "step": 303725 }, { "epoch": 4.473424544557518, "grad_norm": 1.4070907831192017, "learning_rate": 2.3481937726228425e-06, "loss": 0.0555, "step": 303750 }, { "epoch": 4.473792727647605, "grad_norm": 2.345460891723633, "learning_rate": 2.346557400655858e-06, "loss": 0.049, "step": 303775 }, { "epoch": 4.474160910737692, "grad_norm": 1.7008612155914307, "learning_rate": 2.3449210286888735e-06, "loss": 0.0489, "step": 303800 }, { "epoch": 4.474529093827779, "grad_norm": 1.2351136207580566, "learning_rate": 2.343284656721889e-06, "loss": 0.0508, "step": 303825 }, { "epoch": 4.474897276917866, "grad_norm": 0.689289927482605, "learning_rate": 2.3416482847549045e-06, "loss": 0.0478, "step": 303850 }, { "epoch": 4.475265460007953, "grad_norm": 1.561794400215149, "learning_rate": 2.34001191278792e-06, "loss": 0.0507, "step": 303875 }, { "epoch": 4.47563364309804, "grad_norm": 0.8364143967628479, "learning_rate": 2.338375540820935e-06, "loss": 0.0543, "step": 303900 }, { "epoch": 4.476001826188127, "grad_norm": 0.9369179010391235, "learning_rate": 2.3367391688539505e-06, "loss": 0.0493, "step": 303925 }, { "epoch": 4.476370009278214, "grad_norm": 1.6872276067733765, "learning_rate": 2.335102796886966e-06, "loss": 0.0546, "step": 303950 }, { "epoch": 4.476738192368301, "grad_norm": 0.6905471086502075, "learning_rate": 2.3334664249199815e-06, "loss": 0.05, "step": 303975 }, { "epoch": 4.477106375458388, "grad_norm": 1.1974610090255737, "learning_rate": 2.331830052952997e-06, "loss": 0.0481, "step": 304000 }, { "epoch": 4.477474558548475, "grad_norm": 1.2794733047485352, "learning_rate": 2.3301936809860125e-06, "loss": 0.0508, "step": 304025 }, { "epoch": 4.477842741638562, "grad_norm": 1.5989645719528198, "learning_rate": 2.3285573090190282e-06, "loss": 0.0579, "step": 304050 }, { "epoch": 4.478210924728649, "grad_norm": 0.7437596321105957, "learning_rate": 2.3269209370520435e-06, "loss": 0.0554, "step": 304075 }, { "epoch": 4.478579107818736, "grad_norm": 1.1852704286575317, "learning_rate": 2.325284565085059e-06, "loss": 0.0537, "step": 304100 }, { "epoch": 4.478947290908823, "grad_norm": 1.0112836360931396, "learning_rate": 2.323648193118074e-06, "loss": 0.0504, "step": 304125 }, { "epoch": 4.47931547399891, "grad_norm": 1.5848444700241089, "learning_rate": 2.32201182115109e-06, "loss": 0.0473, "step": 304150 }, { "epoch": 4.479683657088997, "grad_norm": 1.5396461486816406, "learning_rate": 2.320375449184105e-06, "loss": 0.051, "step": 304175 }, { "epoch": 4.480051840179084, "grad_norm": 2.1263575553894043, "learning_rate": 2.3187390772171205e-06, "loss": 0.0487, "step": 304200 }, { "epoch": 4.480420023269171, "grad_norm": 1.0708191394805908, "learning_rate": 2.3171027052501362e-06, "loss": 0.0519, "step": 304225 }, { "epoch": 4.480788206359258, "grad_norm": 1.460453748703003, "learning_rate": 2.3154663332831515e-06, "loss": 0.053, "step": 304250 }, { "epoch": 4.481156389449345, "grad_norm": 1.161363124847412, "learning_rate": 2.313829961316167e-06, "loss": 0.0533, "step": 304275 }, { "epoch": 4.481524572539432, "grad_norm": 1.493945598602295, "learning_rate": 2.312193589349182e-06, "loss": 0.051, "step": 304300 }, { "epoch": 4.481892755629519, "grad_norm": 1.3790481090545654, "learning_rate": 2.310557217382198e-06, "loss": 0.048, "step": 304325 }, { "epoch": 4.482260938719606, "grad_norm": 1.0603439807891846, "learning_rate": 2.308920845415213e-06, "loss": 0.0528, "step": 304350 }, { "epoch": 4.482629121809693, "grad_norm": 1.2882375717163086, "learning_rate": 2.3072844734482285e-06, "loss": 0.0511, "step": 304375 }, { "epoch": 4.48299730489978, "grad_norm": 1.3489183187484741, "learning_rate": 2.305648101481244e-06, "loss": 0.0514, "step": 304400 }, { "epoch": 4.483365487989867, "grad_norm": 1.2654738426208496, "learning_rate": 2.3040117295142595e-06, "loss": 0.0519, "step": 304425 }, { "epoch": 4.483733671079954, "grad_norm": 1.0368967056274414, "learning_rate": 2.302375357547275e-06, "loss": 0.0489, "step": 304450 }, { "epoch": 4.484101854170042, "grad_norm": 1.6323338747024536, "learning_rate": 2.30073898558029e-06, "loss": 0.0515, "step": 304475 }, { "epoch": 4.484470037260129, "grad_norm": 1.1667799949645996, "learning_rate": 2.299102613613306e-06, "loss": 0.0566, "step": 304500 }, { "epoch": 4.484838220350216, "grad_norm": 1.8040019273757935, "learning_rate": 2.2974662416463216e-06, "loss": 0.05, "step": 304525 }, { "epoch": 4.485206403440303, "grad_norm": 1.075221300125122, "learning_rate": 2.295829869679337e-06, "loss": 0.0483, "step": 304550 }, { "epoch": 4.48557458653039, "grad_norm": 1.8594220876693726, "learning_rate": 2.294193497712352e-06, "loss": 0.0586, "step": 304575 }, { "epoch": 4.485942769620477, "grad_norm": 1.4127616882324219, "learning_rate": 2.292557125745368e-06, "loss": 0.053, "step": 304600 }, { "epoch": 4.486310952710564, "grad_norm": 2.123760938644409, "learning_rate": 2.290920753778383e-06, "loss": 0.0539, "step": 304625 }, { "epoch": 4.486679135800651, "grad_norm": 1.1200772523880005, "learning_rate": 2.2892843818113985e-06, "loss": 0.0477, "step": 304650 }, { "epoch": 4.487047318890738, "grad_norm": 1.318161964416504, "learning_rate": 2.287648009844414e-06, "loss": 0.0481, "step": 304675 }, { "epoch": 4.487415501980825, "grad_norm": 1.3724302053451538, "learning_rate": 2.2860116378774295e-06, "loss": 0.0608, "step": 304700 }, { "epoch": 4.487783685070912, "grad_norm": 1.4727050065994263, "learning_rate": 2.284375265910445e-06, "loss": 0.0519, "step": 304725 }, { "epoch": 4.488151868160999, "grad_norm": 1.287550687789917, "learning_rate": 2.28273889394346e-06, "loss": 0.0552, "step": 304750 }, { "epoch": 4.488520051251086, "grad_norm": 1.0559489727020264, "learning_rate": 2.281102521976476e-06, "loss": 0.0527, "step": 304775 }, { "epoch": 4.488888234341173, "grad_norm": 1.2488083839416504, "learning_rate": 2.279466150009491e-06, "loss": 0.0494, "step": 304800 }, { "epoch": 4.48925641743126, "grad_norm": 1.230230450630188, "learning_rate": 2.2778297780425065e-06, "loss": 0.0475, "step": 304825 }, { "epoch": 4.489624600521347, "grad_norm": 1.1732892990112305, "learning_rate": 2.276193406075522e-06, "loss": 0.0541, "step": 304850 }, { "epoch": 4.489992783611434, "grad_norm": 1.4207347631454468, "learning_rate": 2.2745570341085375e-06, "loss": 0.0487, "step": 304875 }, { "epoch": 4.490360966701521, "grad_norm": 1.2135663032531738, "learning_rate": 2.2729861170202323e-06, "loss": 0.0477, "step": 304900 }, { "epoch": 4.490729149791608, "grad_norm": 1.8241201639175415, "learning_rate": 2.2713497450532476e-06, "loss": 0.0471, "step": 304925 }, { "epoch": 4.491097332881695, "grad_norm": 1.2652854919433594, "learning_rate": 2.269713373086263e-06, "loss": 0.0578, "step": 304950 }, { "epoch": 4.4914655159717825, "grad_norm": 1.6332687139511108, "learning_rate": 2.2680770011192786e-06, "loss": 0.0456, "step": 304975 }, { "epoch": 4.4918336990618695, "grad_norm": 1.4398852586746216, "learning_rate": 2.2664406291522943e-06, "loss": 0.049, "step": 305000 }, { "epoch": 4.4922018821519565, "grad_norm": 1.6836400032043457, "learning_rate": 2.2648042571853096e-06, "loss": 0.0598, "step": 305025 }, { "epoch": 4.4925700652420435, "grad_norm": 1.1527715921401978, "learning_rate": 2.263167885218325e-06, "loss": 0.051, "step": 305050 }, { "epoch": 4.4929382483321305, "grad_norm": 1.7300193309783936, "learning_rate": 2.2615315132513407e-06, "loss": 0.0555, "step": 305075 }, { "epoch": 4.4933064314222175, "grad_norm": 1.165737271308899, "learning_rate": 2.259895141284356e-06, "loss": 0.0448, "step": 305100 }, { "epoch": 4.4936746145123045, "grad_norm": 1.4664829969406128, "learning_rate": 2.2582587693173713e-06, "loss": 0.0471, "step": 305125 }, { "epoch": 4.494042797602392, "grad_norm": 1.0854312181472778, "learning_rate": 2.2566223973503866e-06, "loss": 0.0429, "step": 305150 }, { "epoch": 4.494410980692479, "grad_norm": 1.5705701112747192, "learning_rate": 2.2549860253834023e-06, "loss": 0.0557, "step": 305175 }, { "epoch": 4.494779163782566, "grad_norm": 1.1281174421310425, "learning_rate": 2.2533496534164176e-06, "loss": 0.0488, "step": 305200 }, { "epoch": 4.495147346872653, "grad_norm": 1.310794472694397, "learning_rate": 2.251713281449433e-06, "loss": 0.0556, "step": 305225 }, { "epoch": 4.49551552996274, "grad_norm": 1.3496712446212769, "learning_rate": 2.2500769094824486e-06, "loss": 0.0484, "step": 305250 }, { "epoch": 4.495883713052827, "grad_norm": 1.17291259765625, "learning_rate": 2.248440537515464e-06, "loss": 0.0482, "step": 305275 }, { "epoch": 4.496251896142914, "grad_norm": 1.2383538484573364, "learning_rate": 2.2468041655484792e-06, "loss": 0.0461, "step": 305300 }, { "epoch": 4.496620079233001, "grad_norm": 1.3425499200820923, "learning_rate": 2.2451677935814945e-06, "loss": 0.052, "step": 305325 }, { "epoch": 4.496988262323088, "grad_norm": 1.3372963666915894, "learning_rate": 2.2435314216145103e-06, "loss": 0.0505, "step": 305350 }, { "epoch": 4.497356445413175, "grad_norm": 1.6998307704925537, "learning_rate": 2.2418950496475256e-06, "loss": 0.0523, "step": 305375 }, { "epoch": 4.497724628503262, "grad_norm": 1.1803414821624756, "learning_rate": 2.240258677680541e-06, "loss": 0.0473, "step": 305400 }, { "epoch": 4.498092811593349, "grad_norm": 1.2189404964447021, "learning_rate": 2.2386223057135566e-06, "loss": 0.0495, "step": 305425 }, { "epoch": 4.498460994683436, "grad_norm": 1.2284362316131592, "learning_rate": 2.236985933746572e-06, "loss": 0.0466, "step": 305450 }, { "epoch": 4.498829177773523, "grad_norm": 1.1205028295516968, "learning_rate": 2.2353495617795876e-06, "loss": 0.0505, "step": 305475 }, { "epoch": 4.499197360863611, "grad_norm": 1.914658784866333, "learning_rate": 2.233713189812603e-06, "loss": 0.0508, "step": 305500 }, { "epoch": 4.499565543953698, "grad_norm": 1.8325726985931396, "learning_rate": 2.2320768178456183e-06, "loss": 0.0575, "step": 305525 }, { "epoch": 4.499933727043785, "grad_norm": 1.3237731456756592, "learning_rate": 2.230440445878634e-06, "loss": 0.051, "step": 305550 }, { "epoch": 4.500301910133872, "grad_norm": 1.2312746047973633, "learning_rate": 2.2288040739116493e-06, "loss": 0.0554, "step": 305575 }, { "epoch": 4.500670093223959, "grad_norm": 1.255330204963684, "learning_rate": 2.2271677019446646e-06, "loss": 0.0512, "step": 305600 }, { "epoch": 4.501038276314046, "grad_norm": 1.2739551067352295, "learning_rate": 2.2255313299776803e-06, "loss": 0.0505, "step": 305625 }, { "epoch": 4.501406459404133, "grad_norm": 1.4140033721923828, "learning_rate": 2.2238949580106956e-06, "loss": 0.0506, "step": 305650 }, { "epoch": 4.50177464249422, "grad_norm": 1.2576498985290527, "learning_rate": 2.222258586043711e-06, "loss": 0.0467, "step": 305675 }, { "epoch": 4.502142825584307, "grad_norm": 1.3852546215057373, "learning_rate": 2.2206222140767262e-06, "loss": 0.0532, "step": 305700 }, { "epoch": 4.502511008674394, "grad_norm": 1.5580438375473022, "learning_rate": 2.218985842109742e-06, "loss": 0.0503, "step": 305725 }, { "epoch": 4.502879191764481, "grad_norm": 1.479324460029602, "learning_rate": 2.2173494701427573e-06, "loss": 0.053, "step": 305750 }, { "epoch": 4.503247374854568, "grad_norm": 1.2652708292007446, "learning_rate": 2.2157130981757726e-06, "loss": 0.0473, "step": 305775 }, { "epoch": 4.503615557944655, "grad_norm": 1.5438512563705444, "learning_rate": 2.2140767262087883e-06, "loss": 0.052, "step": 305800 }, { "epoch": 4.503983741034742, "grad_norm": 0.9680161476135254, "learning_rate": 2.2124403542418036e-06, "loss": 0.0478, "step": 305825 }, { "epoch": 4.504351924124829, "grad_norm": 1.074666976928711, "learning_rate": 2.210803982274819e-06, "loss": 0.0504, "step": 305850 }, { "epoch": 4.504720107214916, "grad_norm": 1.2749511003494263, "learning_rate": 2.209167610307834e-06, "loss": 0.0451, "step": 305875 }, { "epoch": 4.505088290305003, "grad_norm": 1.2340952157974243, "learning_rate": 2.20753123834085e-06, "loss": 0.0473, "step": 305900 }, { "epoch": 4.50545647339509, "grad_norm": 1.9128999710083008, "learning_rate": 2.2058948663738652e-06, "loss": 0.0535, "step": 305925 }, { "epoch": 4.505824656485177, "grad_norm": 1.1371002197265625, "learning_rate": 2.2042584944068805e-06, "loss": 0.0529, "step": 305950 }, { "epoch": 4.506192839575264, "grad_norm": 1.4185383319854736, "learning_rate": 2.2026221224398963e-06, "loss": 0.057, "step": 305975 }, { "epoch": 4.506561022665351, "grad_norm": 0.9934126734733582, "learning_rate": 2.200985750472912e-06, "loss": 0.0504, "step": 306000 }, { "epoch": 4.506929205755438, "grad_norm": 1.484820008277893, "learning_rate": 2.1993493785059273e-06, "loss": 0.047, "step": 306025 }, { "epoch": 4.507297388845525, "grad_norm": 0.9192208647727966, "learning_rate": 2.1977130065389426e-06, "loss": 0.048, "step": 306050 }, { "epoch": 4.507665571935612, "grad_norm": 1.314201831817627, "learning_rate": 2.196076634571958e-06, "loss": 0.0495, "step": 306075 }, { "epoch": 4.508033755025699, "grad_norm": 1.4820038080215454, "learning_rate": 2.1944402626049736e-06, "loss": 0.0489, "step": 306100 }, { "epoch": 4.508401938115786, "grad_norm": 1.5399411916732788, "learning_rate": 2.192803890637989e-06, "loss": 0.051, "step": 306125 }, { "epoch": 4.508770121205873, "grad_norm": 1.285320520401001, "learning_rate": 2.1911675186710042e-06, "loss": 0.052, "step": 306150 }, { "epoch": 4.50913830429596, "grad_norm": 1.5088833570480347, "learning_rate": 2.18953114670402e-06, "loss": 0.0555, "step": 306175 }, { "epoch": 4.509506487386047, "grad_norm": 1.1098202466964722, "learning_rate": 2.1878947747370353e-06, "loss": 0.0445, "step": 306200 }, { "epoch": 4.509874670476134, "grad_norm": 1.4007564783096313, "learning_rate": 2.1862584027700506e-06, "loss": 0.0514, "step": 306225 }, { "epoch": 4.510242853566221, "grad_norm": 1.8120425939559937, "learning_rate": 2.184622030803066e-06, "loss": 0.0543, "step": 306250 }, { "epoch": 4.510611036656308, "grad_norm": 1.083579421043396, "learning_rate": 2.1829856588360816e-06, "loss": 0.049, "step": 306275 }, { "epoch": 4.510979219746395, "grad_norm": 1.1269021034240723, "learning_rate": 2.181349286869097e-06, "loss": 0.0505, "step": 306300 }, { "epoch": 4.511347402836482, "grad_norm": 1.0835684537887573, "learning_rate": 2.1797129149021122e-06, "loss": 0.0485, "step": 306325 }, { "epoch": 4.511715585926569, "grad_norm": 1.3766403198242188, "learning_rate": 2.178076542935128e-06, "loss": 0.0505, "step": 306350 }, { "epoch": 4.512083769016657, "grad_norm": 1.0139336585998535, "learning_rate": 2.1764401709681433e-06, "loss": 0.0537, "step": 306375 }, { "epoch": 4.512451952106744, "grad_norm": 1.4512341022491455, "learning_rate": 2.1748037990011586e-06, "loss": 0.0545, "step": 306400 }, { "epoch": 4.512820135196831, "grad_norm": 1.534245491027832, "learning_rate": 2.173167427034174e-06, "loss": 0.049, "step": 306425 }, { "epoch": 4.513188318286918, "grad_norm": 1.7229063510894775, "learning_rate": 2.1715310550671896e-06, "loss": 0.0528, "step": 306450 }, { "epoch": 4.513556501377005, "grad_norm": 1.3052908182144165, "learning_rate": 2.1698946831002053e-06, "loss": 0.0502, "step": 306475 }, { "epoch": 4.513924684467092, "grad_norm": 1.6839303970336914, "learning_rate": 2.1682583111332206e-06, "loss": 0.0529, "step": 306500 }, { "epoch": 4.514292867557179, "grad_norm": 1.0376553535461426, "learning_rate": 2.166621939166236e-06, "loss": 0.0503, "step": 306525 }, { "epoch": 4.514661050647266, "grad_norm": 1.3496531248092651, "learning_rate": 2.1649855671992517e-06, "loss": 0.0544, "step": 306550 }, { "epoch": 4.515029233737353, "grad_norm": 1.4430419206619263, "learning_rate": 2.163349195232267e-06, "loss": 0.0483, "step": 306575 }, { "epoch": 4.51539741682744, "grad_norm": 1.4553323984146118, "learning_rate": 2.1617128232652823e-06, "loss": 0.0554, "step": 306600 }, { "epoch": 4.515765599917527, "grad_norm": 0.8176844716072083, "learning_rate": 2.1600764512982976e-06, "loss": 0.0571, "step": 306625 }, { "epoch": 4.516133783007614, "grad_norm": 1.2480665445327759, "learning_rate": 2.1584400793313133e-06, "loss": 0.0517, "step": 306650 }, { "epoch": 4.516501966097701, "grad_norm": 1.301805853843689, "learning_rate": 2.1568037073643286e-06, "loss": 0.0549, "step": 306675 }, { "epoch": 4.516870149187788, "grad_norm": 1.3430217504501343, "learning_rate": 2.155167335397344e-06, "loss": 0.0488, "step": 306700 }, { "epoch": 4.517238332277875, "grad_norm": 0.9315279126167297, "learning_rate": 2.1535309634303596e-06, "loss": 0.0505, "step": 306725 }, { "epoch": 4.517606515367962, "grad_norm": 1.353087306022644, "learning_rate": 2.1519600463420544e-06, "loss": 0.0482, "step": 306750 }, { "epoch": 4.517974698458049, "grad_norm": 1.4864869117736816, "learning_rate": 2.1503236743750697e-06, "loss": 0.0525, "step": 306775 }, { "epoch": 4.518342881548136, "grad_norm": 1.2863086462020874, "learning_rate": 2.148687302408085e-06, "loss": 0.0533, "step": 306800 }, { "epoch": 4.518711064638223, "grad_norm": 0.984040379524231, "learning_rate": 2.1470509304411007e-06, "loss": 0.0466, "step": 306825 }, { "epoch": 4.51907924772831, "grad_norm": 1.4597172737121582, "learning_rate": 2.145414558474116e-06, "loss": 0.0493, "step": 306850 }, { "epoch": 4.519447430818397, "grad_norm": 1.468724012374878, "learning_rate": 2.1437781865071313e-06, "loss": 0.0567, "step": 306875 }, { "epoch": 4.519815613908484, "grad_norm": 0.874403178691864, "learning_rate": 2.1421418145401466e-06, "loss": 0.0518, "step": 306900 }, { "epoch": 4.520183796998571, "grad_norm": 0.9855050444602966, "learning_rate": 2.1405054425731623e-06, "loss": 0.0489, "step": 306925 }, { "epoch": 4.520551980088658, "grad_norm": 1.2628121376037598, "learning_rate": 2.138869070606178e-06, "loss": 0.0534, "step": 306950 }, { "epoch": 4.5209201631787455, "grad_norm": 1.2815665006637573, "learning_rate": 2.1372326986391934e-06, "loss": 0.053, "step": 306975 }, { "epoch": 4.5212883462688325, "grad_norm": 1.0003927946090698, "learning_rate": 2.1355963266722087e-06, "loss": 0.0505, "step": 307000 }, { "epoch": 4.5216565293589195, "grad_norm": 1.924319863319397, "learning_rate": 2.1339599547052244e-06, "loss": 0.0541, "step": 307025 }, { "epoch": 4.5220247124490065, "grad_norm": 1.4959354400634766, "learning_rate": 2.1323235827382397e-06, "loss": 0.0501, "step": 307050 }, { "epoch": 4.5223928955390935, "grad_norm": 0.8307550549507141, "learning_rate": 2.130687210771255e-06, "loss": 0.0583, "step": 307075 }, { "epoch": 4.5227610786291805, "grad_norm": 1.1998952627182007, "learning_rate": 2.1290508388042703e-06, "loss": 0.0502, "step": 307100 }, { "epoch": 4.5231292617192675, "grad_norm": 1.3657691478729248, "learning_rate": 2.127414466837286e-06, "loss": 0.0495, "step": 307125 }, { "epoch": 4.523497444809355, "grad_norm": 0.9390334486961365, "learning_rate": 2.1257780948703014e-06, "loss": 0.0466, "step": 307150 }, { "epoch": 4.523865627899442, "grad_norm": 1.3445976972579956, "learning_rate": 2.1241417229033167e-06, "loss": 0.0578, "step": 307175 }, { "epoch": 4.524233810989529, "grad_norm": 1.182094931602478, "learning_rate": 2.1225053509363324e-06, "loss": 0.0465, "step": 307200 }, { "epoch": 4.524601994079616, "grad_norm": 0.7793055772781372, "learning_rate": 2.1208689789693477e-06, "loss": 0.0459, "step": 307225 }, { "epoch": 4.524970177169703, "grad_norm": 1.6466139554977417, "learning_rate": 2.119232607002363e-06, "loss": 0.0528, "step": 307250 }, { "epoch": 4.52533836025979, "grad_norm": 1.24486243724823, "learning_rate": 2.1175962350353783e-06, "loss": 0.0516, "step": 307275 }, { "epoch": 4.525706543349877, "grad_norm": 1.2348495721817017, "learning_rate": 2.115959863068394e-06, "loss": 0.0505, "step": 307300 }, { "epoch": 4.526074726439964, "grad_norm": 0.8380063772201538, "learning_rate": 2.1143234911014093e-06, "loss": 0.053, "step": 307325 }, { "epoch": 4.526442909530051, "grad_norm": 1.3594605922698975, "learning_rate": 2.1126871191344246e-06, "loss": 0.0558, "step": 307350 }, { "epoch": 4.526811092620138, "grad_norm": 1.736283540725708, "learning_rate": 2.1110507471674404e-06, "loss": 0.0519, "step": 307375 }, { "epoch": 4.527179275710225, "grad_norm": 1.2260088920593262, "learning_rate": 2.1094143752004557e-06, "loss": 0.0574, "step": 307400 }, { "epoch": 4.527547458800312, "grad_norm": 1.6989027261734009, "learning_rate": 2.1077780032334714e-06, "loss": 0.0501, "step": 307425 }, { "epoch": 4.5279156418904, "grad_norm": 1.137641191482544, "learning_rate": 2.1061416312664867e-06, "loss": 0.0511, "step": 307450 }, { "epoch": 4.528283824980487, "grad_norm": 1.1760953664779663, "learning_rate": 2.104505259299502e-06, "loss": 0.0488, "step": 307475 }, { "epoch": 4.528652008070574, "grad_norm": 1.7314229011535645, "learning_rate": 2.1028688873325177e-06, "loss": 0.0505, "step": 307500 }, { "epoch": 4.529020191160661, "grad_norm": 1.4109761714935303, "learning_rate": 2.101232515365533e-06, "loss": 0.0491, "step": 307525 }, { "epoch": 4.529388374250748, "grad_norm": 0.888072669506073, "learning_rate": 2.0995961433985483e-06, "loss": 0.0465, "step": 307550 }, { "epoch": 4.529756557340835, "grad_norm": 1.5533137321472168, "learning_rate": 2.097959771431564e-06, "loss": 0.0523, "step": 307575 }, { "epoch": 4.530124740430922, "grad_norm": 1.5203653573989868, "learning_rate": 2.0963233994645794e-06, "loss": 0.0531, "step": 307600 }, { "epoch": 4.530492923521009, "grad_norm": 1.265055775642395, "learning_rate": 2.0946870274975947e-06, "loss": 0.0533, "step": 307625 }, { "epoch": 4.530861106611096, "grad_norm": 1.1532315015792847, "learning_rate": 2.09305065553061e-06, "loss": 0.0437, "step": 307650 }, { "epoch": 4.531229289701183, "grad_norm": 1.476191520690918, "learning_rate": 2.0914142835636257e-06, "loss": 0.048, "step": 307675 }, { "epoch": 4.53159747279127, "grad_norm": 1.0593379735946655, "learning_rate": 2.089777911596641e-06, "loss": 0.0525, "step": 307700 }, { "epoch": 4.531965655881357, "grad_norm": 1.4708608388900757, "learning_rate": 2.0881415396296563e-06, "loss": 0.051, "step": 307725 }, { "epoch": 4.532333838971444, "grad_norm": 1.4360496997833252, "learning_rate": 2.086505167662672e-06, "loss": 0.0493, "step": 307750 }, { "epoch": 4.532702022061531, "grad_norm": 0.9608443975448608, "learning_rate": 2.0848687956956874e-06, "loss": 0.0536, "step": 307775 }, { "epoch": 4.533070205151618, "grad_norm": 1.5330548286437988, "learning_rate": 2.0832324237287027e-06, "loss": 0.0506, "step": 307800 }, { "epoch": 4.533438388241705, "grad_norm": 0.862598180770874, "learning_rate": 2.081596051761718e-06, "loss": 0.047, "step": 307825 }, { "epoch": 4.533806571331792, "grad_norm": 1.1573472023010254, "learning_rate": 2.0799596797947337e-06, "loss": 0.045, "step": 307850 }, { "epoch": 4.534174754421879, "grad_norm": 1.48971688747406, "learning_rate": 2.078323307827749e-06, "loss": 0.0482, "step": 307875 }, { "epoch": 4.534542937511966, "grad_norm": 1.1666826009750366, "learning_rate": 2.0766869358607643e-06, "loss": 0.0452, "step": 307900 }, { "epoch": 4.534911120602053, "grad_norm": 1.4414671659469604, "learning_rate": 2.07505056389378e-06, "loss": 0.0543, "step": 307925 }, { "epoch": 4.53527930369214, "grad_norm": 1.4044976234436035, "learning_rate": 2.0734141919267958e-06, "loss": 0.0586, "step": 307950 }, { "epoch": 4.535647486782227, "grad_norm": 1.4503514766693115, "learning_rate": 2.071777819959811e-06, "loss": 0.0492, "step": 307975 }, { "epoch": 4.536015669872314, "grad_norm": 1.3328460454940796, "learning_rate": 2.0701414479928264e-06, "loss": 0.0436, "step": 308000 }, { "epoch": 4.536383852962401, "grad_norm": 1.34400475025177, "learning_rate": 2.0685050760258417e-06, "loss": 0.056, "step": 308025 }, { "epoch": 4.536752036052488, "grad_norm": 1.5616716146469116, "learning_rate": 2.0668687040588574e-06, "loss": 0.0573, "step": 308050 }, { "epoch": 4.537120219142575, "grad_norm": 0.8876332640647888, "learning_rate": 2.0652323320918727e-06, "loss": 0.0512, "step": 308075 }, { "epoch": 4.537488402232662, "grad_norm": 0.9860138893127441, "learning_rate": 2.063595960124888e-06, "loss": 0.0555, "step": 308100 }, { "epoch": 4.537856585322749, "grad_norm": 1.431259036064148, "learning_rate": 2.0619595881579037e-06, "loss": 0.05, "step": 308125 }, { "epoch": 4.538224768412836, "grad_norm": 1.095428228378296, "learning_rate": 2.060323216190919e-06, "loss": 0.0485, "step": 308150 }, { "epoch": 4.538592951502923, "grad_norm": 1.5755724906921387, "learning_rate": 2.0586868442239343e-06, "loss": 0.0533, "step": 308175 }, { "epoch": 4.53896113459301, "grad_norm": 2.0550174713134766, "learning_rate": 2.0570504722569496e-06, "loss": 0.052, "step": 308200 }, { "epoch": 4.539329317683097, "grad_norm": 1.2741310596466064, "learning_rate": 2.0554141002899654e-06, "loss": 0.0529, "step": 308225 }, { "epoch": 4.539697500773184, "grad_norm": 0.894555926322937, "learning_rate": 2.0537777283229807e-06, "loss": 0.0492, "step": 308250 }, { "epoch": 4.540065683863271, "grad_norm": 1.6967883110046387, "learning_rate": 2.052141356355996e-06, "loss": 0.0483, "step": 308275 }, { "epoch": 4.540433866953359, "grad_norm": 1.0774853229522705, "learning_rate": 2.0505049843890117e-06, "loss": 0.0483, "step": 308300 }, { "epoch": 4.540802050043446, "grad_norm": 1.6280452013015747, "learning_rate": 2.048868612422027e-06, "loss": 0.056, "step": 308325 }, { "epoch": 4.541170233133533, "grad_norm": 1.453758955001831, "learning_rate": 2.0472322404550423e-06, "loss": 0.0504, "step": 308350 }, { "epoch": 4.54153841622362, "grad_norm": 1.55021333694458, "learning_rate": 2.0455958684880576e-06, "loss": 0.0469, "step": 308375 }, { "epoch": 4.541906599313707, "grad_norm": 1.4157837629318237, "learning_rate": 2.0439594965210734e-06, "loss": 0.0521, "step": 308400 }, { "epoch": 4.542274782403794, "grad_norm": 0.7594125270843506, "learning_rate": 2.042323124554089e-06, "loss": 0.0511, "step": 308425 }, { "epoch": 4.542642965493881, "grad_norm": 1.248920202255249, "learning_rate": 2.0406867525871044e-06, "loss": 0.0458, "step": 308450 }, { "epoch": 4.543011148583968, "grad_norm": 1.3545526266098022, "learning_rate": 2.0390503806201197e-06, "loss": 0.0497, "step": 308475 }, { "epoch": 4.543379331674055, "grad_norm": 0.7493758201599121, "learning_rate": 2.0374140086531354e-06, "loss": 0.0539, "step": 308500 }, { "epoch": 4.543747514764142, "grad_norm": 1.0456050634384155, "learning_rate": 2.0357776366861507e-06, "loss": 0.054, "step": 308525 }, { "epoch": 4.544115697854229, "grad_norm": 1.1004092693328857, "learning_rate": 2.034141264719166e-06, "loss": 0.0511, "step": 308550 }, { "epoch": 4.544483880944316, "grad_norm": 1.1395137310028076, "learning_rate": 2.0325048927521813e-06, "loss": 0.0467, "step": 308575 }, { "epoch": 4.544852064034403, "grad_norm": 1.5351648330688477, "learning_rate": 2.030868520785197e-06, "loss": 0.0546, "step": 308600 }, { "epoch": 4.54522024712449, "grad_norm": 1.2298682928085327, "learning_rate": 2.0292321488182124e-06, "loss": 0.0527, "step": 308625 }, { "epoch": 4.545588430214577, "grad_norm": 1.3381637334823608, "learning_rate": 2.0275957768512277e-06, "loss": 0.053, "step": 308650 }, { "epoch": 4.545956613304664, "grad_norm": 0.8341094255447388, "learning_rate": 2.0259594048842434e-06, "loss": 0.0503, "step": 308675 }, { "epoch": 4.546324796394751, "grad_norm": 1.2205511331558228, "learning_rate": 2.0243230329172587e-06, "loss": 0.0514, "step": 308700 }, { "epoch": 4.546692979484838, "grad_norm": 0.9575676918029785, "learning_rate": 2.022686660950274e-06, "loss": 0.05, "step": 308725 }, { "epoch": 4.547061162574925, "grad_norm": 1.0823307037353516, "learning_rate": 2.0210502889832893e-06, "loss": 0.0616, "step": 308750 }, { "epoch": 4.547429345665012, "grad_norm": 1.2561061382293701, "learning_rate": 2.019413917016305e-06, "loss": 0.0602, "step": 308775 }, { "epoch": 4.547797528755099, "grad_norm": 1.1245801448822021, "learning_rate": 2.0177775450493203e-06, "loss": 0.0505, "step": 308800 }, { "epoch": 4.548165711845186, "grad_norm": 1.0400805473327637, "learning_rate": 2.0161411730823356e-06, "loss": 0.0462, "step": 308825 }, { "epoch": 4.548533894935273, "grad_norm": 1.3343368768692017, "learning_rate": 2.0145048011153514e-06, "loss": 0.0594, "step": 308850 }, { "epoch": 4.54890207802536, "grad_norm": 1.1459853649139404, "learning_rate": 2.0128684291483667e-06, "loss": 0.0561, "step": 308875 }, { "epoch": 4.549270261115447, "grad_norm": 1.415501594543457, "learning_rate": 2.0112320571813824e-06, "loss": 0.052, "step": 308900 }, { "epoch": 4.549638444205534, "grad_norm": 1.3437832593917847, "learning_rate": 2.0095956852143977e-06, "loss": 0.0535, "step": 308925 }, { "epoch": 4.550006627295621, "grad_norm": 1.5965673923492432, "learning_rate": 2.007959313247413e-06, "loss": 0.0482, "step": 308950 }, { "epoch": 4.5503748103857085, "grad_norm": 1.0317606925964355, "learning_rate": 2.0063229412804287e-06, "loss": 0.043, "step": 308975 }, { "epoch": 4.5507429934757955, "grad_norm": 1.7104601860046387, "learning_rate": 2.004686569313444e-06, "loss": 0.0512, "step": 309000 }, { "epoch": 4.5511111765658825, "grad_norm": 1.38189697265625, "learning_rate": 2.0030501973464594e-06, "loss": 0.0443, "step": 309025 }, { "epoch": 4.5514793596559695, "grad_norm": 1.3474329710006714, "learning_rate": 2.001413825379475e-06, "loss": 0.0525, "step": 309050 }, { "epoch": 4.5518475427460565, "grad_norm": 1.881171703338623, "learning_rate": 1.9997774534124904e-06, "loss": 0.0546, "step": 309075 }, { "epoch": 4.5522157258361435, "grad_norm": 1.063180923461914, "learning_rate": 1.9981410814455057e-06, "loss": 0.0549, "step": 309100 }, { "epoch": 4.5525839089262305, "grad_norm": 1.3530104160308838, "learning_rate": 1.996504709478521e-06, "loss": 0.0572, "step": 309125 }, { "epoch": 4.552952092016318, "grad_norm": 1.5687769651412964, "learning_rate": 1.9948683375115367e-06, "loss": 0.05, "step": 309150 }, { "epoch": 4.553320275106405, "grad_norm": 1.1460498571395874, "learning_rate": 1.993231965544552e-06, "loss": 0.0505, "step": 309175 }, { "epoch": 4.553688458196492, "grad_norm": 1.2409263849258423, "learning_rate": 1.9915955935775673e-06, "loss": 0.0444, "step": 309200 }, { "epoch": 4.554056641286579, "grad_norm": 1.58135986328125, "learning_rate": 1.989959221610583e-06, "loss": 0.0545, "step": 309225 }, { "epoch": 4.554424824376666, "grad_norm": 0.8890162706375122, "learning_rate": 1.9883228496435984e-06, "loss": 0.0518, "step": 309250 }, { "epoch": 4.554793007466753, "grad_norm": 1.0495911836624146, "learning_rate": 1.9866864776766137e-06, "loss": 0.05, "step": 309275 }, { "epoch": 4.55516119055684, "grad_norm": 1.4100531339645386, "learning_rate": 1.985050105709629e-06, "loss": 0.0538, "step": 309300 }, { "epoch": 4.555529373646927, "grad_norm": 1.3735814094543457, "learning_rate": 1.9834137337426447e-06, "loss": 0.0507, "step": 309325 }, { "epoch": 4.555897556737014, "grad_norm": 1.9199820756912231, "learning_rate": 1.98177736177566e-06, "loss": 0.0556, "step": 309350 }, { "epoch": 4.556265739827102, "grad_norm": 1.5116608142852783, "learning_rate": 1.9801409898086753e-06, "loss": 0.0593, "step": 309375 }, { "epoch": 4.556633922917189, "grad_norm": 1.0083881616592407, "learning_rate": 1.978504617841691e-06, "loss": 0.0515, "step": 309400 }, { "epoch": 4.557002106007276, "grad_norm": 1.572373628616333, "learning_rate": 1.9768682458747063e-06, "loss": 0.0549, "step": 309425 }, { "epoch": 4.557370289097363, "grad_norm": 1.614080548286438, "learning_rate": 1.975231873907722e-06, "loss": 0.0516, "step": 309450 }, { "epoch": 4.55773847218745, "grad_norm": 1.8171714544296265, "learning_rate": 1.9735955019407374e-06, "loss": 0.0494, "step": 309475 }, { "epoch": 4.558106655277537, "grad_norm": 1.7256240844726562, "learning_rate": 1.9719591299737527e-06, "loss": 0.0553, "step": 309500 }, { "epoch": 4.558474838367624, "grad_norm": 1.19894278049469, "learning_rate": 1.9703227580067684e-06, "loss": 0.0491, "step": 309525 }, { "epoch": 4.558843021457711, "grad_norm": 1.234932541847229, "learning_rate": 1.9686863860397837e-06, "loss": 0.0449, "step": 309550 }, { "epoch": 4.559211204547798, "grad_norm": 1.6571643352508545, "learning_rate": 1.967050014072799e-06, "loss": 0.0533, "step": 309575 }, { "epoch": 4.559579387637885, "grad_norm": 1.6973224878311157, "learning_rate": 1.9654136421058147e-06, "loss": 0.0542, "step": 309600 }, { "epoch": 4.559947570727972, "grad_norm": 0.9314064383506775, "learning_rate": 1.96377727013883e-06, "loss": 0.0488, "step": 309625 }, { "epoch": 4.560315753818059, "grad_norm": 1.58635413646698, "learning_rate": 1.9621408981718454e-06, "loss": 0.047, "step": 309650 }, { "epoch": 4.560683936908146, "grad_norm": 1.5457500219345093, "learning_rate": 1.9605045262048607e-06, "loss": 0.0546, "step": 309675 }, { "epoch": 4.561052119998233, "grad_norm": 1.4691550731658936, "learning_rate": 1.9588681542378764e-06, "loss": 0.0492, "step": 309700 }, { "epoch": 4.56142030308832, "grad_norm": 1.3959052562713623, "learning_rate": 1.9572317822708917e-06, "loss": 0.054, "step": 309725 }, { "epoch": 4.561788486178407, "grad_norm": 1.1163315773010254, "learning_rate": 1.955595410303907e-06, "loss": 0.0497, "step": 309750 }, { "epoch": 4.562156669268494, "grad_norm": 1.162321925163269, "learning_rate": 1.9539590383369227e-06, "loss": 0.0563, "step": 309775 }, { "epoch": 4.562524852358581, "grad_norm": 0.9975666403770447, "learning_rate": 1.952322666369938e-06, "loss": 0.0542, "step": 309800 }, { "epoch": 4.562893035448668, "grad_norm": 1.577072024345398, "learning_rate": 1.9507517492816328e-06, "loss": 0.0551, "step": 309825 }, { "epoch": 4.563261218538755, "grad_norm": 1.5585591793060303, "learning_rate": 1.949115377314648e-06, "loss": 0.0524, "step": 309850 }, { "epoch": 4.563629401628842, "grad_norm": 1.5018388032913208, "learning_rate": 1.9474790053476638e-06, "loss": 0.0476, "step": 309875 }, { "epoch": 4.563997584718929, "grad_norm": 1.6291329860687256, "learning_rate": 1.9458426333806795e-06, "loss": 0.0526, "step": 309900 }, { "epoch": 4.564365767809016, "grad_norm": 1.3654611110687256, "learning_rate": 1.944206261413695e-06, "loss": 0.0521, "step": 309925 }, { "epoch": 4.564733950899103, "grad_norm": 1.7379506826400757, "learning_rate": 1.94256988944671e-06, "loss": 0.0492, "step": 309950 }, { "epoch": 4.56510213398919, "grad_norm": 2.03979754447937, "learning_rate": 1.9409335174797254e-06, "loss": 0.0497, "step": 309975 }, { "epoch": 4.565470317079277, "grad_norm": 1.3213950395584106, "learning_rate": 1.939297145512741e-06, "loss": 0.0525, "step": 310000 }, { "epoch": 4.565838500169364, "grad_norm": 1.017162561416626, "learning_rate": 1.9376607735457565e-06, "loss": 0.0571, "step": 310025 }, { "epoch": 4.566206683259451, "grad_norm": 1.3367708921432495, "learning_rate": 1.9360244015787718e-06, "loss": 0.0522, "step": 310050 }, { "epoch": 4.566574866349538, "grad_norm": 1.4543776512145996, "learning_rate": 1.9343880296117875e-06, "loss": 0.0469, "step": 310075 }, { "epoch": 4.566943049439625, "grad_norm": 1.4850057363510132, "learning_rate": 1.932751657644803e-06, "loss": 0.0544, "step": 310100 }, { "epoch": 4.567311232529712, "grad_norm": 1.5761196613311768, "learning_rate": 1.931115285677818e-06, "loss": 0.0495, "step": 310125 }, { "epoch": 4.567679415619799, "grad_norm": 1.6672794818878174, "learning_rate": 1.9294789137108334e-06, "loss": 0.0535, "step": 310150 }, { "epoch": 4.568047598709886, "grad_norm": 1.3041654825210571, "learning_rate": 1.927842541743849e-06, "loss": 0.051, "step": 310175 }, { "epoch": 4.568415781799974, "grad_norm": 1.3045835494995117, "learning_rate": 1.9262061697768644e-06, "loss": 0.0539, "step": 310200 }, { "epoch": 4.568783964890061, "grad_norm": 1.269605040550232, "learning_rate": 1.9245697978098797e-06, "loss": 0.0495, "step": 310225 }, { "epoch": 4.569152147980148, "grad_norm": 1.4230904579162598, "learning_rate": 1.9229334258428955e-06, "loss": 0.0528, "step": 310250 }, { "epoch": 4.569520331070235, "grad_norm": 1.7751332521438599, "learning_rate": 1.9212970538759108e-06, "loss": 0.0526, "step": 310275 }, { "epoch": 4.569888514160322, "grad_norm": 1.475242257118225, "learning_rate": 1.919660681908926e-06, "loss": 0.0535, "step": 310300 }, { "epoch": 4.570256697250409, "grad_norm": 1.2149518728256226, "learning_rate": 1.9180243099419414e-06, "loss": 0.0488, "step": 310325 }, { "epoch": 4.570624880340496, "grad_norm": 1.1104406118392944, "learning_rate": 1.916387937974957e-06, "loss": 0.0475, "step": 310350 }, { "epoch": 4.570993063430583, "grad_norm": 1.0464363098144531, "learning_rate": 1.914751566007973e-06, "loss": 0.0554, "step": 310375 }, { "epoch": 4.57136124652067, "grad_norm": 1.5187437534332275, "learning_rate": 1.913115194040988e-06, "loss": 0.0512, "step": 310400 }, { "epoch": 4.571729429610757, "grad_norm": 1.0319486856460571, "learning_rate": 1.9114788220740034e-06, "loss": 0.0488, "step": 310425 }, { "epoch": 4.572097612700844, "grad_norm": 1.4134297370910645, "learning_rate": 1.909842450107019e-06, "loss": 0.0534, "step": 310450 }, { "epoch": 4.572465795790931, "grad_norm": 1.3182697296142578, "learning_rate": 1.9082060781400345e-06, "loss": 0.0514, "step": 310475 }, { "epoch": 4.572833978881018, "grad_norm": 1.3037372827529907, "learning_rate": 1.9065697061730498e-06, "loss": 0.0478, "step": 310500 }, { "epoch": 4.573202161971105, "grad_norm": 1.3757737874984741, "learning_rate": 1.9049333342060653e-06, "loss": 0.0555, "step": 310525 }, { "epoch": 4.573570345061192, "grad_norm": 1.1345593929290771, "learning_rate": 1.9032969622390806e-06, "loss": 0.0484, "step": 310550 }, { "epoch": 4.573938528151279, "grad_norm": 1.1313515901565552, "learning_rate": 1.9016605902720961e-06, "loss": 0.0552, "step": 310575 }, { "epoch": 4.574306711241366, "grad_norm": 1.2449558973312378, "learning_rate": 1.9000242183051114e-06, "loss": 0.0585, "step": 310600 }, { "epoch": 4.574674894331453, "grad_norm": 1.5524110794067383, "learning_rate": 1.898387846338127e-06, "loss": 0.048, "step": 310625 }, { "epoch": 4.57504307742154, "grad_norm": 1.2277108430862427, "learning_rate": 1.8967514743711425e-06, "loss": 0.0526, "step": 310650 }, { "epoch": 4.575411260511627, "grad_norm": 1.7272179126739502, "learning_rate": 1.8951151024041578e-06, "loss": 0.0497, "step": 310675 }, { "epoch": 4.575779443601714, "grad_norm": 1.518348217010498, "learning_rate": 1.8934787304371733e-06, "loss": 0.0507, "step": 310700 }, { "epoch": 4.576147626691801, "grad_norm": 1.2473044395446777, "learning_rate": 1.8918423584701886e-06, "loss": 0.0571, "step": 310725 }, { "epoch": 4.576515809781888, "grad_norm": 1.1358023881912231, "learning_rate": 1.890205986503204e-06, "loss": 0.0548, "step": 310750 }, { "epoch": 4.576883992871975, "grad_norm": 1.5578036308288574, "learning_rate": 1.8885696145362196e-06, "loss": 0.058, "step": 310775 }, { "epoch": 4.577252175962062, "grad_norm": 1.411180853843689, "learning_rate": 1.886933242569235e-06, "loss": 0.0502, "step": 310800 }, { "epoch": 4.577620359052149, "grad_norm": 1.2316758632659912, "learning_rate": 1.8852968706022504e-06, "loss": 0.0485, "step": 310825 }, { "epoch": 4.577988542142236, "grad_norm": 1.708455204963684, "learning_rate": 1.8836604986352662e-06, "loss": 0.0511, "step": 310850 }, { "epoch": 4.578356725232323, "grad_norm": 1.1811246871948242, "learning_rate": 1.8820241266682815e-06, "loss": 0.0533, "step": 310875 }, { "epoch": 4.57872490832241, "grad_norm": 1.598293423652649, "learning_rate": 1.880387754701297e-06, "loss": 0.0493, "step": 310900 }, { "epoch": 4.579093091412497, "grad_norm": 1.7547820806503296, "learning_rate": 1.8787513827343123e-06, "loss": 0.0554, "step": 310925 }, { "epoch": 4.5794612745025844, "grad_norm": 1.325798749923706, "learning_rate": 1.8771150107673278e-06, "loss": 0.0487, "step": 310950 }, { "epoch": 4.5798294575926715, "grad_norm": 1.6396585702896118, "learning_rate": 1.8754786388003431e-06, "loss": 0.0553, "step": 310975 }, { "epoch": 4.5801976406827585, "grad_norm": 1.2691404819488525, "learning_rate": 1.8738422668333586e-06, "loss": 0.0538, "step": 311000 }, { "epoch": 4.5805658237728455, "grad_norm": 1.3830153942108154, "learning_rate": 1.8722058948663741e-06, "loss": 0.0483, "step": 311025 }, { "epoch": 4.5809340068629325, "grad_norm": 1.3440278768539429, "learning_rate": 1.8705695228993894e-06, "loss": 0.05, "step": 311050 }, { "epoch": 4.5813021899530195, "grad_norm": 1.1792877912521362, "learning_rate": 1.868933150932405e-06, "loss": 0.0523, "step": 311075 }, { "epoch": 4.5816703730431065, "grad_norm": 1.055132508277893, "learning_rate": 1.8672967789654203e-06, "loss": 0.0522, "step": 311100 }, { "epoch": 4.5820385561331936, "grad_norm": 1.2314695119857788, "learning_rate": 1.8656604069984358e-06, "loss": 0.0533, "step": 311125 }, { "epoch": 4.582406739223281, "grad_norm": 1.379610538482666, "learning_rate": 1.864024035031451e-06, "loss": 0.0503, "step": 311150 }, { "epoch": 4.582774922313368, "grad_norm": 1.4142709970474243, "learning_rate": 1.8623876630644666e-06, "loss": 0.0496, "step": 311175 }, { "epoch": 4.583143105403455, "grad_norm": 1.2724547386169434, "learning_rate": 1.8607512910974821e-06, "loss": 0.0533, "step": 311200 }, { "epoch": 4.583511288493542, "grad_norm": 1.617893934249878, "learning_rate": 1.8591149191304974e-06, "loss": 0.0495, "step": 311225 }, { "epoch": 4.583879471583629, "grad_norm": 0.8769577741622925, "learning_rate": 1.857478547163513e-06, "loss": 0.0472, "step": 311250 }, { "epoch": 4.5842476546737165, "grad_norm": 1.9812270402908325, "learning_rate": 1.8558421751965282e-06, "loss": 0.0519, "step": 311275 }, { "epoch": 4.5846158377638035, "grad_norm": 0.9911406636238098, "learning_rate": 1.8542058032295438e-06, "loss": 0.0534, "step": 311300 }, { "epoch": 4.584984020853891, "grad_norm": 1.3202862739562988, "learning_rate": 1.8525694312625593e-06, "loss": 0.0483, "step": 311325 }, { "epoch": 4.585352203943978, "grad_norm": 1.2891674041748047, "learning_rate": 1.8509330592955748e-06, "loss": 0.052, "step": 311350 }, { "epoch": 4.585720387034065, "grad_norm": 1.5891659259796143, "learning_rate": 1.8492966873285903e-06, "loss": 0.0552, "step": 311375 }, { "epoch": 4.586088570124152, "grad_norm": 1.1767886877059937, "learning_rate": 1.8476603153616058e-06, "loss": 0.0535, "step": 311400 }, { "epoch": 4.586456753214239, "grad_norm": 1.64137601852417, "learning_rate": 1.8460239433946211e-06, "loss": 0.0527, "step": 311425 }, { "epoch": 4.586824936304326, "grad_norm": 1.0645405054092407, "learning_rate": 1.8443875714276366e-06, "loss": 0.0512, "step": 311450 }, { "epoch": 4.587193119394413, "grad_norm": 1.5446661710739136, "learning_rate": 1.842751199460652e-06, "loss": 0.055, "step": 311475 }, { "epoch": 4.5875613024845, "grad_norm": 1.27260160446167, "learning_rate": 1.8411148274936675e-06, "loss": 0.0554, "step": 311500 }, { "epoch": 4.587929485574587, "grad_norm": 1.2854636907577515, "learning_rate": 1.8394784555266828e-06, "loss": 0.0534, "step": 311525 }, { "epoch": 4.588297668664674, "grad_norm": 1.3868088722229004, "learning_rate": 1.8378420835596983e-06, "loss": 0.0565, "step": 311550 }, { "epoch": 4.588665851754761, "grad_norm": 1.0023466348648071, "learning_rate": 1.8362057115927138e-06, "loss": 0.0583, "step": 311575 }, { "epoch": 4.589034034844848, "grad_norm": 1.0893226861953735, "learning_rate": 1.8345693396257291e-06, "loss": 0.0545, "step": 311600 }, { "epoch": 4.589402217934935, "grad_norm": 1.3062037229537964, "learning_rate": 1.8329329676587446e-06, "loss": 0.0513, "step": 311625 }, { "epoch": 4.589770401025022, "grad_norm": 1.3451515436172485, "learning_rate": 1.83129659569176e-06, "loss": 0.0526, "step": 311650 }, { "epoch": 4.590138584115109, "grad_norm": 0.977739155292511, "learning_rate": 1.8296602237247754e-06, "loss": 0.0501, "step": 311675 }, { "epoch": 4.590506767205196, "grad_norm": 1.2526726722717285, "learning_rate": 1.8280238517577907e-06, "loss": 0.0558, "step": 311700 }, { "epoch": 4.590874950295283, "grad_norm": 1.677848219871521, "learning_rate": 1.8263874797908063e-06, "loss": 0.0519, "step": 311725 }, { "epoch": 4.59124313338537, "grad_norm": 1.2457284927368164, "learning_rate": 1.8247511078238218e-06, "loss": 0.0477, "step": 311750 }, { "epoch": 4.591611316475457, "grad_norm": 0.8873688578605652, "learning_rate": 1.823114735856837e-06, "loss": 0.0543, "step": 311775 }, { "epoch": 4.591979499565544, "grad_norm": 1.4722398519515991, "learning_rate": 1.8214783638898526e-06, "loss": 0.0448, "step": 311800 }, { "epoch": 4.592347682655631, "grad_norm": 1.2841516733169556, "learning_rate": 1.8198419919228683e-06, "loss": 0.0536, "step": 311825 }, { "epoch": 4.592715865745718, "grad_norm": 1.401716709136963, "learning_rate": 1.8182056199558836e-06, "loss": 0.0564, "step": 311850 }, { "epoch": 4.593084048835805, "grad_norm": 1.0068591833114624, "learning_rate": 1.8165692479888991e-06, "loss": 0.0482, "step": 311875 }, { "epoch": 4.593452231925892, "grad_norm": 0.9521609544754028, "learning_rate": 1.8149328760219145e-06, "loss": 0.0469, "step": 311900 }, { "epoch": 4.593820415015979, "grad_norm": 0.9109975099563599, "learning_rate": 1.81329650405493e-06, "loss": 0.0498, "step": 311925 }, { "epoch": 4.594188598106066, "grad_norm": 1.2523980140686035, "learning_rate": 1.8116601320879455e-06, "loss": 0.0509, "step": 311950 }, { "epoch": 4.594556781196153, "grad_norm": 1.1682549715042114, "learning_rate": 1.8100237601209608e-06, "loss": 0.0434, "step": 311975 }, { "epoch": 4.59492496428624, "grad_norm": 1.3259446620941162, "learning_rate": 1.8083873881539763e-06, "loss": 0.0521, "step": 312000 }, { "epoch": 4.595293147376327, "grad_norm": 1.3220573663711548, "learning_rate": 1.8067510161869916e-06, "loss": 0.0517, "step": 312025 }, { "epoch": 4.595661330466414, "grad_norm": 1.246146321296692, "learning_rate": 1.8051146442200071e-06, "loss": 0.0489, "step": 312050 }, { "epoch": 4.596029513556501, "grad_norm": 1.0258642435073853, "learning_rate": 1.8034782722530224e-06, "loss": 0.0493, "step": 312075 }, { "epoch": 4.596397696646588, "grad_norm": 1.2516448497772217, "learning_rate": 1.801841900286038e-06, "loss": 0.0521, "step": 312100 }, { "epoch": 4.596765879736676, "grad_norm": 1.1295586824417114, "learning_rate": 1.8002055283190535e-06, "loss": 0.0525, "step": 312125 }, { "epoch": 4.597134062826763, "grad_norm": 1.355372667312622, "learning_rate": 1.7985691563520688e-06, "loss": 0.0479, "step": 312150 }, { "epoch": 4.59750224591685, "grad_norm": 1.616727590560913, "learning_rate": 1.7969327843850843e-06, "loss": 0.0465, "step": 312175 }, { "epoch": 4.597870429006937, "grad_norm": 1.3392568826675415, "learning_rate": 1.7952964124180996e-06, "loss": 0.0487, "step": 312200 }, { "epoch": 4.598238612097024, "grad_norm": 1.1653858423233032, "learning_rate": 1.793660040451115e-06, "loss": 0.0513, "step": 312225 }, { "epoch": 4.598606795187111, "grad_norm": 1.2011961936950684, "learning_rate": 1.7920236684841304e-06, "loss": 0.0513, "step": 312250 }, { "epoch": 4.598974978277198, "grad_norm": 1.2303119897842407, "learning_rate": 1.790387296517146e-06, "loss": 0.0435, "step": 312275 }, { "epoch": 4.599343161367285, "grad_norm": 1.6332257986068726, "learning_rate": 1.7887509245501614e-06, "loss": 0.0516, "step": 312300 }, { "epoch": 4.599711344457372, "grad_norm": 1.1437263488769531, "learning_rate": 1.7871145525831772e-06, "loss": 0.0535, "step": 312325 }, { "epoch": 4.600079527547459, "grad_norm": 0.8941243886947632, "learning_rate": 1.7854781806161925e-06, "loss": 0.0565, "step": 312350 }, { "epoch": 4.600447710637546, "grad_norm": 1.5741775035858154, "learning_rate": 1.783841808649208e-06, "loss": 0.049, "step": 312375 }, { "epoch": 4.600815893727633, "grad_norm": 1.402544617652893, "learning_rate": 1.7822054366822233e-06, "loss": 0.0568, "step": 312400 }, { "epoch": 4.60118407681772, "grad_norm": 1.5375876426696777, "learning_rate": 1.7805690647152388e-06, "loss": 0.0555, "step": 312425 }, { "epoch": 4.601552259907807, "grad_norm": 1.2158266305923462, "learning_rate": 1.7789326927482541e-06, "loss": 0.0485, "step": 312450 }, { "epoch": 4.601920442997894, "grad_norm": 0.9526785612106323, "learning_rate": 1.7772963207812696e-06, "loss": 0.0507, "step": 312475 }, { "epoch": 4.602288626087981, "grad_norm": 1.5378708839416504, "learning_rate": 1.7756599488142851e-06, "loss": 0.0482, "step": 312500 }, { "epoch": 4.602656809178068, "grad_norm": 0.9887898564338684, "learning_rate": 1.7740235768473005e-06, "loss": 0.0508, "step": 312525 }, { "epoch": 4.603024992268155, "grad_norm": 1.1639207601547241, "learning_rate": 1.772387204880316e-06, "loss": 0.0481, "step": 312550 }, { "epoch": 4.603393175358242, "grad_norm": 0.9504382014274597, "learning_rate": 1.7707508329133313e-06, "loss": 0.0481, "step": 312575 }, { "epoch": 4.603761358448329, "grad_norm": 1.2448655366897583, "learning_rate": 1.7691144609463468e-06, "loss": 0.0515, "step": 312600 }, { "epoch": 4.604129541538416, "grad_norm": 0.9778062105178833, "learning_rate": 1.767478088979362e-06, "loss": 0.0518, "step": 312625 }, { "epoch": 4.604497724628503, "grad_norm": 1.0024763345718384, "learning_rate": 1.7658417170123776e-06, "loss": 0.0449, "step": 312650 }, { "epoch": 4.60486590771859, "grad_norm": 1.8824326992034912, "learning_rate": 1.7642053450453931e-06, "loss": 0.0512, "step": 312675 }, { "epoch": 4.605234090808677, "grad_norm": 1.2657194137573242, "learning_rate": 1.7625689730784084e-06, "loss": 0.0521, "step": 312700 }, { "epoch": 4.605602273898764, "grad_norm": 1.0155348777770996, "learning_rate": 1.760932601111424e-06, "loss": 0.0544, "step": 312725 }, { "epoch": 4.605970456988851, "grad_norm": 1.4752349853515625, "learning_rate": 1.7592962291444393e-06, "loss": 0.0491, "step": 312750 }, { "epoch": 4.606338640078938, "grad_norm": 1.147755742073059, "learning_rate": 1.7576598571774548e-06, "loss": 0.0462, "step": 312775 }, { "epoch": 4.606706823169025, "grad_norm": 1.1272672414779663, "learning_rate": 1.75602348521047e-06, "loss": 0.0554, "step": 312800 }, { "epoch": 4.607075006259112, "grad_norm": 2.004840135574341, "learning_rate": 1.7543871132434858e-06, "loss": 0.0479, "step": 312825 }, { "epoch": 4.607443189349199, "grad_norm": 1.5390398502349854, "learning_rate": 1.7528161961551807e-06, "loss": 0.0546, "step": 312850 }, { "epoch": 4.607811372439286, "grad_norm": 1.48603355884552, "learning_rate": 1.751179824188196e-06, "loss": 0.049, "step": 312875 }, { "epoch": 4.608179555529373, "grad_norm": 1.4582126140594482, "learning_rate": 1.7495434522212116e-06, "loss": 0.0512, "step": 312900 }, { "epoch": 4.60854773861946, "grad_norm": 1.2588406801223755, "learning_rate": 1.7479070802542269e-06, "loss": 0.0484, "step": 312925 }, { "epoch": 4.6089159217095474, "grad_norm": 1.276369333267212, "learning_rate": 1.7462707082872424e-06, "loss": 0.0519, "step": 312950 }, { "epoch": 4.6092841047996345, "grad_norm": 1.3421517610549927, "learning_rate": 1.744634336320258e-06, "loss": 0.0573, "step": 312975 }, { "epoch": 4.6096522878897215, "grad_norm": 1.6167590618133545, "learning_rate": 1.7429979643532732e-06, "loss": 0.0493, "step": 313000 }, { "epoch": 4.6100204709798085, "grad_norm": 0.8688099384307861, "learning_rate": 1.7413615923862887e-06, "loss": 0.0486, "step": 313025 }, { "epoch": 4.6103886540698955, "grad_norm": 1.4828499555587769, "learning_rate": 1.739725220419304e-06, "loss": 0.0527, "step": 313050 }, { "epoch": 4.6107568371599825, "grad_norm": 0.9489651322364807, "learning_rate": 1.7380888484523195e-06, "loss": 0.0532, "step": 313075 }, { "epoch": 4.6111250202500695, "grad_norm": 1.2675871849060059, "learning_rate": 1.7364524764853348e-06, "loss": 0.0525, "step": 313100 }, { "epoch": 4.6114932033401566, "grad_norm": 1.261709213256836, "learning_rate": 1.7348161045183504e-06, "loss": 0.0494, "step": 313125 }, { "epoch": 4.611861386430244, "grad_norm": 1.3355411291122437, "learning_rate": 1.7331797325513659e-06, "loss": 0.0531, "step": 313150 }, { "epoch": 4.612229569520331, "grad_norm": 1.144920825958252, "learning_rate": 1.7315433605843812e-06, "loss": 0.0439, "step": 313175 }, { "epoch": 4.6125977526104185, "grad_norm": 1.3352934122085571, "learning_rate": 1.7299069886173967e-06, "loss": 0.0472, "step": 313200 }, { "epoch": 4.6129659357005055, "grad_norm": 0.980284571647644, "learning_rate": 1.728270616650412e-06, "loss": 0.0514, "step": 313225 }, { "epoch": 4.6133341187905925, "grad_norm": 1.496730089187622, "learning_rate": 1.7266342446834275e-06, "loss": 0.0524, "step": 313250 }, { "epoch": 4.6137023018806795, "grad_norm": 0.9213072657585144, "learning_rate": 1.7249978727164428e-06, "loss": 0.051, "step": 313275 }, { "epoch": 4.6140704849707666, "grad_norm": 1.0719350576400757, "learning_rate": 1.7233615007494585e-06, "loss": 0.0571, "step": 313300 }, { "epoch": 4.614438668060854, "grad_norm": 1.549264907836914, "learning_rate": 1.721725128782474e-06, "loss": 0.0554, "step": 313325 }, { "epoch": 4.614806851150941, "grad_norm": 0.8943632245063782, "learning_rate": 1.7200887568154896e-06, "loss": 0.0451, "step": 313350 }, { "epoch": 4.615175034241028, "grad_norm": 1.2267400026321411, "learning_rate": 1.7184523848485049e-06, "loss": 0.0551, "step": 313375 }, { "epoch": 4.615543217331115, "grad_norm": 1.7456096410751343, "learning_rate": 1.7168160128815204e-06, "loss": 0.0568, "step": 313400 }, { "epoch": 4.615911400421202, "grad_norm": 1.3656628131866455, "learning_rate": 1.7151796409145357e-06, "loss": 0.0497, "step": 313425 }, { "epoch": 4.616279583511289, "grad_norm": 1.5298798084259033, "learning_rate": 1.7135432689475512e-06, "loss": 0.0543, "step": 313450 }, { "epoch": 4.616647766601376, "grad_norm": 1.545066475868225, "learning_rate": 1.7119068969805665e-06, "loss": 0.0554, "step": 313475 }, { "epoch": 4.617015949691463, "grad_norm": 1.1340826749801636, "learning_rate": 1.710270525013582e-06, "loss": 0.0482, "step": 313500 }, { "epoch": 4.61738413278155, "grad_norm": 1.4046839475631714, "learning_rate": 1.7086341530465976e-06, "loss": 0.0491, "step": 313525 }, { "epoch": 4.617752315871637, "grad_norm": 1.1509557962417603, "learning_rate": 1.7069977810796129e-06, "loss": 0.048, "step": 313550 }, { "epoch": 4.618120498961724, "grad_norm": 0.8133782744407654, "learning_rate": 1.7053614091126284e-06, "loss": 0.0542, "step": 313575 }, { "epoch": 4.618488682051811, "grad_norm": 1.0698652267456055, "learning_rate": 1.7037250371456437e-06, "loss": 0.0503, "step": 313600 }, { "epoch": 4.618856865141898, "grad_norm": 1.2086094617843628, "learning_rate": 1.7020886651786592e-06, "loss": 0.0515, "step": 313625 }, { "epoch": 4.619225048231985, "grad_norm": 1.5048376321792603, "learning_rate": 1.7004522932116745e-06, "loss": 0.052, "step": 313650 }, { "epoch": 4.619593231322072, "grad_norm": 1.3776469230651855, "learning_rate": 1.69881592124469e-06, "loss": 0.0527, "step": 313675 }, { "epoch": 4.619961414412159, "grad_norm": 1.3432800769805908, "learning_rate": 1.6971795492777055e-06, "loss": 0.0582, "step": 313700 }, { "epoch": 4.620329597502246, "grad_norm": 1.5966800451278687, "learning_rate": 1.6955431773107208e-06, "loss": 0.0598, "step": 313725 }, { "epoch": 4.620697780592333, "grad_norm": 1.4164814949035645, "learning_rate": 1.6939068053437364e-06, "loss": 0.0519, "step": 313750 }, { "epoch": 4.62106596368242, "grad_norm": 1.2153713703155518, "learning_rate": 1.692270433376752e-06, "loss": 0.0453, "step": 313775 }, { "epoch": 4.621434146772507, "grad_norm": 1.4762452840805054, "learning_rate": 1.6906340614097674e-06, "loss": 0.0541, "step": 313800 }, { "epoch": 4.621802329862594, "grad_norm": 1.0811917781829834, "learning_rate": 1.688997689442783e-06, "loss": 0.0554, "step": 313825 }, { "epoch": 4.622170512952681, "grad_norm": 1.6722486019134521, "learning_rate": 1.6873613174757982e-06, "loss": 0.0451, "step": 313850 }, { "epoch": 4.622538696042768, "grad_norm": 0.5524638295173645, "learning_rate": 1.6857249455088137e-06, "loss": 0.0422, "step": 313875 }, { "epoch": 4.622906879132855, "grad_norm": 1.4932174682617188, "learning_rate": 1.6840885735418292e-06, "loss": 0.0512, "step": 313900 }, { "epoch": 4.623275062222942, "grad_norm": 1.0760420560836792, "learning_rate": 1.6824522015748445e-06, "loss": 0.0502, "step": 313925 }, { "epoch": 4.623643245313029, "grad_norm": 1.1159383058547974, "learning_rate": 1.68081582960786e-06, "loss": 0.0524, "step": 313950 }, { "epoch": 4.624011428403116, "grad_norm": 0.7388298511505127, "learning_rate": 1.6791794576408754e-06, "loss": 0.0565, "step": 313975 }, { "epoch": 4.624379611493203, "grad_norm": 1.4073553085327148, "learning_rate": 1.6775430856738909e-06, "loss": 0.0542, "step": 314000 }, { "epoch": 4.62474779458329, "grad_norm": 1.2523738145828247, "learning_rate": 1.6759067137069062e-06, "loss": 0.0544, "step": 314025 }, { "epoch": 4.625115977673378, "grad_norm": 1.7144370079040527, "learning_rate": 1.6742703417399217e-06, "loss": 0.0478, "step": 314050 }, { "epoch": 4.625484160763465, "grad_norm": 1.1651575565338135, "learning_rate": 1.6726339697729372e-06, "loss": 0.0561, "step": 314075 }, { "epoch": 4.625852343853552, "grad_norm": 1.1400476694107056, "learning_rate": 1.6709975978059525e-06, "loss": 0.0538, "step": 314100 }, { "epoch": 4.626220526943639, "grad_norm": 1.1046559810638428, "learning_rate": 1.669361225838968e-06, "loss": 0.0488, "step": 314125 }, { "epoch": 4.626588710033726, "grad_norm": 1.2454677820205688, "learning_rate": 1.6677248538719833e-06, "loss": 0.0517, "step": 314150 }, { "epoch": 4.626956893123813, "grad_norm": 1.4426357746124268, "learning_rate": 1.6660884819049989e-06, "loss": 0.0493, "step": 314175 }, { "epoch": 4.6273250762139, "grad_norm": 1.4759565591812134, "learning_rate": 1.6644521099380142e-06, "loss": 0.0509, "step": 314200 }, { "epoch": 4.627693259303987, "grad_norm": 0.9264955520629883, "learning_rate": 1.6628157379710297e-06, "loss": 0.0499, "step": 314225 }, { "epoch": 4.628061442394074, "grad_norm": 1.1288747787475586, "learning_rate": 1.6611793660040452e-06, "loss": 0.0493, "step": 314250 }, { "epoch": 4.628429625484161, "grad_norm": 1.6480368375778198, "learning_rate": 1.6595429940370607e-06, "loss": 0.0529, "step": 314275 }, { "epoch": 4.628797808574248, "grad_norm": 1.027058482170105, "learning_rate": 1.6579066220700762e-06, "loss": 0.048, "step": 314300 }, { "epoch": 4.629165991664335, "grad_norm": 1.4424378871917725, "learning_rate": 1.6562702501030917e-06, "loss": 0.0495, "step": 314325 }, { "epoch": 4.629534174754422, "grad_norm": 1.127670168876648, "learning_rate": 1.654633878136107e-06, "loss": 0.0495, "step": 314350 }, { "epoch": 4.629902357844509, "grad_norm": 0.9387210607528687, "learning_rate": 1.6529975061691226e-06, "loss": 0.0547, "step": 314375 }, { "epoch": 4.630270540934596, "grad_norm": 0.9497400522232056, "learning_rate": 1.6513611342021379e-06, "loss": 0.0506, "step": 314400 }, { "epoch": 4.630638724024683, "grad_norm": 1.5737370252609253, "learning_rate": 1.6497247622351534e-06, "loss": 0.0545, "step": 314425 }, { "epoch": 4.63100690711477, "grad_norm": 1.650532841682434, "learning_rate": 1.648088390268169e-06, "loss": 0.0512, "step": 314450 }, { "epoch": 4.631375090204857, "grad_norm": 1.9460455179214478, "learning_rate": 1.6464520183011842e-06, "loss": 0.0569, "step": 314475 }, { "epoch": 4.631743273294944, "grad_norm": 1.724442958831787, "learning_rate": 1.6448156463341997e-06, "loss": 0.0583, "step": 314500 }, { "epoch": 4.632111456385031, "grad_norm": 1.5884807109832764, "learning_rate": 1.643179274367215e-06, "loss": 0.0538, "step": 314525 }, { "epoch": 4.632479639475118, "grad_norm": 1.2943904399871826, "learning_rate": 1.6415429024002305e-06, "loss": 0.0453, "step": 314550 }, { "epoch": 4.632847822565205, "grad_norm": 0.8792909979820251, "learning_rate": 1.6399065304332458e-06, "loss": 0.0489, "step": 314575 }, { "epoch": 4.633216005655292, "grad_norm": 1.2160710096359253, "learning_rate": 1.6382701584662614e-06, "loss": 0.0534, "step": 314600 }, { "epoch": 4.633584188745379, "grad_norm": 1.3838019371032715, "learning_rate": 1.6366337864992769e-06, "loss": 0.0543, "step": 314625 }, { "epoch": 4.633952371835466, "grad_norm": 1.0022517442703247, "learning_rate": 1.6349974145322922e-06, "loss": 0.049, "step": 314650 }, { "epoch": 4.634320554925553, "grad_norm": 1.6174120903015137, "learning_rate": 1.6333610425653077e-06, "loss": 0.0526, "step": 314675 }, { "epoch": 4.63468873801564, "grad_norm": 1.340504765510559, "learning_rate": 1.631724670598323e-06, "loss": 0.0486, "step": 314700 }, { "epoch": 4.635056921105727, "grad_norm": 1.519906997680664, "learning_rate": 1.6300882986313385e-06, "loss": 0.0466, "step": 314725 }, { "epoch": 4.635425104195814, "grad_norm": 1.1377347707748413, "learning_rate": 1.6284519266643543e-06, "loss": 0.048, "step": 314750 }, { "epoch": 4.635793287285901, "grad_norm": 1.998067021369934, "learning_rate": 1.6268155546973696e-06, "loss": 0.0531, "step": 314775 }, { "epoch": 4.636161470375988, "grad_norm": 1.3252503871917725, "learning_rate": 1.625179182730385e-06, "loss": 0.0562, "step": 314800 }, { "epoch": 4.636529653466075, "grad_norm": 1.2372859716415405, "learning_rate": 1.6235428107634004e-06, "loss": 0.0478, "step": 314825 }, { "epoch": 4.636897836556162, "grad_norm": 0.8300296068191528, "learning_rate": 1.6219064387964159e-06, "loss": 0.05, "step": 314850 }, { "epoch": 4.637266019646249, "grad_norm": 1.3158036470413208, "learning_rate": 1.6202700668294314e-06, "loss": 0.046, "step": 314875 }, { "epoch": 4.637634202736336, "grad_norm": 1.2119743824005127, "learning_rate": 1.6186336948624467e-06, "loss": 0.0445, "step": 314900 }, { "epoch": 4.638002385826423, "grad_norm": 1.3168491125106812, "learning_rate": 1.6169973228954622e-06, "loss": 0.0557, "step": 314925 }, { "epoch": 4.6383705689165105, "grad_norm": 1.3508614301681519, "learning_rate": 1.6153609509284775e-06, "loss": 0.0538, "step": 314950 }, { "epoch": 4.6387387520065975, "grad_norm": 1.364932656288147, "learning_rate": 1.613724578961493e-06, "loss": 0.0449, "step": 314975 }, { "epoch": 4.6391069350966845, "grad_norm": 1.0636990070343018, "learning_rate": 1.6120882069945086e-06, "loss": 0.0598, "step": 315000 }, { "epoch": 4.6394751181867715, "grad_norm": 1.0130901336669922, "learning_rate": 1.6104518350275239e-06, "loss": 0.0501, "step": 315025 }, { "epoch": 4.6398433012768585, "grad_norm": 2.022745370864868, "learning_rate": 1.6088154630605394e-06, "loss": 0.0548, "step": 315050 }, { "epoch": 4.6402114843669455, "grad_norm": 1.4349260330200195, "learning_rate": 1.6071790910935547e-06, "loss": 0.0506, "step": 315075 }, { "epoch": 4.6405796674570325, "grad_norm": 1.2244259119033813, "learning_rate": 1.6055427191265702e-06, "loss": 0.0527, "step": 315100 }, { "epoch": 4.6409478505471204, "grad_norm": 1.2823513746261597, "learning_rate": 1.6039063471595855e-06, "loss": 0.0487, "step": 315125 }, { "epoch": 4.6413160336372075, "grad_norm": 1.5059019327163696, "learning_rate": 1.602269975192601e-06, "loss": 0.053, "step": 315150 }, { "epoch": 4.6416842167272945, "grad_norm": 1.4514081478118896, "learning_rate": 1.6006336032256165e-06, "loss": 0.0491, "step": 315175 }, { "epoch": 4.6420523998173815, "grad_norm": 1.50724458694458, "learning_rate": 1.5989972312586318e-06, "loss": 0.0556, "step": 315200 }, { "epoch": 4.6424205829074685, "grad_norm": 1.20079505443573, "learning_rate": 1.5973608592916474e-06, "loss": 0.0496, "step": 315225 }, { "epoch": 4.6427887659975555, "grad_norm": 1.6044684648513794, "learning_rate": 1.595724487324663e-06, "loss": 0.0528, "step": 315250 }, { "epoch": 4.6431569490876425, "grad_norm": 1.4655390977859497, "learning_rate": 1.5940881153576784e-06, "loss": 0.0519, "step": 315275 }, { "epoch": 4.6435251321777296, "grad_norm": 0.4617000222206116, "learning_rate": 1.592451743390694e-06, "loss": 0.0485, "step": 315300 }, { "epoch": 4.643893315267817, "grad_norm": 1.4401443004608154, "learning_rate": 1.5908153714237092e-06, "loss": 0.0491, "step": 315325 }, { "epoch": 4.644261498357904, "grad_norm": 0.8690106272697449, "learning_rate": 1.5891789994567247e-06, "loss": 0.0531, "step": 315350 }, { "epoch": 4.644629681447991, "grad_norm": 1.3379560708999634, "learning_rate": 1.58754262748974e-06, "loss": 0.0464, "step": 315375 }, { "epoch": 4.644997864538078, "grad_norm": 1.6646497249603271, "learning_rate": 1.5859062555227556e-06, "loss": 0.0479, "step": 315400 }, { "epoch": 4.645366047628165, "grad_norm": 0.9758337736129761, "learning_rate": 1.584269883555771e-06, "loss": 0.0481, "step": 315425 }, { "epoch": 4.645734230718252, "grad_norm": 1.05086350440979, "learning_rate": 1.5826335115887864e-06, "loss": 0.0477, "step": 315450 }, { "epoch": 4.646102413808339, "grad_norm": 1.4862937927246094, "learning_rate": 1.5809971396218019e-06, "loss": 0.0494, "step": 315475 }, { "epoch": 4.646470596898426, "grad_norm": 1.3824751377105713, "learning_rate": 1.5793607676548172e-06, "loss": 0.0513, "step": 315500 }, { "epoch": 4.646838779988513, "grad_norm": 1.7633616924285889, "learning_rate": 1.5777243956878327e-06, "loss": 0.0518, "step": 315525 }, { "epoch": 4.6472069630786, "grad_norm": 0.899743914604187, "learning_rate": 1.5760880237208482e-06, "loss": 0.0507, "step": 315550 }, { "epoch": 4.647575146168687, "grad_norm": 1.464272379875183, "learning_rate": 1.5744516517538635e-06, "loss": 0.0499, "step": 315575 }, { "epoch": 4.647943329258774, "grad_norm": 1.549464464187622, "learning_rate": 1.572815279786879e-06, "loss": 0.0537, "step": 315600 }, { "epoch": 4.648311512348861, "grad_norm": 1.1711963415145874, "learning_rate": 1.5711789078198944e-06, "loss": 0.0499, "step": 315625 }, { "epoch": 4.648679695438948, "grad_norm": 1.3006376028060913, "learning_rate": 1.5695425358529099e-06, "loss": 0.0516, "step": 315650 }, { "epoch": 4.649047878529035, "grad_norm": 1.8834633827209473, "learning_rate": 1.5679061638859252e-06, "loss": 0.0478, "step": 315675 }, { "epoch": 4.649416061619122, "grad_norm": 1.5423249006271362, "learning_rate": 1.5662697919189407e-06, "loss": 0.0514, "step": 315700 }, { "epoch": 4.649784244709209, "grad_norm": 0.895173966884613, "learning_rate": 1.5646334199519562e-06, "loss": 0.0579, "step": 315725 }, { "epoch": 4.650152427799296, "grad_norm": 1.0859516859054565, "learning_rate": 1.5629970479849717e-06, "loss": 0.0537, "step": 315750 }, { "epoch": 4.650520610889383, "grad_norm": 1.404670000076294, "learning_rate": 1.5613606760179872e-06, "loss": 0.0464, "step": 315775 }, { "epoch": 4.65088879397947, "grad_norm": 0.9491305351257324, "learning_rate": 1.5597243040510028e-06, "loss": 0.0511, "step": 315800 }, { "epoch": 4.651256977069557, "grad_norm": 1.3225444555282593, "learning_rate": 1.558087932084018e-06, "loss": 0.0449, "step": 315825 }, { "epoch": 4.651625160159644, "grad_norm": 1.6994314193725586, "learning_rate": 1.5564515601170336e-06, "loss": 0.0527, "step": 315850 }, { "epoch": 4.651993343249731, "grad_norm": 1.5581889152526855, "learning_rate": 1.5548151881500489e-06, "loss": 0.0568, "step": 315875 }, { "epoch": 4.652361526339818, "grad_norm": 1.135936975479126, "learning_rate": 1.5531788161830644e-06, "loss": 0.0518, "step": 315900 }, { "epoch": 4.652729709429905, "grad_norm": 1.3783987760543823, "learning_rate": 1.5515424442160797e-06, "loss": 0.0534, "step": 315925 }, { "epoch": 4.653097892519993, "grad_norm": 0.7552604079246521, "learning_rate": 1.5499060722490952e-06, "loss": 0.0506, "step": 315950 }, { "epoch": 4.65346607561008, "grad_norm": 1.5362344980239868, "learning_rate": 1.5482697002821107e-06, "loss": 0.0498, "step": 315975 }, { "epoch": 4.653834258700167, "grad_norm": 1.3423012495040894, "learning_rate": 1.546633328315126e-06, "loss": 0.0474, "step": 316000 }, { "epoch": 4.654202441790254, "grad_norm": 1.125746250152588, "learning_rate": 1.5449969563481416e-06, "loss": 0.0546, "step": 316025 }, { "epoch": 4.654570624880341, "grad_norm": 1.1130011081695557, "learning_rate": 1.5433605843811569e-06, "loss": 0.0448, "step": 316050 }, { "epoch": 4.654938807970428, "grad_norm": 0.882328987121582, "learning_rate": 1.5417242124141724e-06, "loss": 0.0573, "step": 316075 }, { "epoch": 4.655306991060515, "grad_norm": 1.3008991479873657, "learning_rate": 1.5400878404471879e-06, "loss": 0.0526, "step": 316100 }, { "epoch": 4.655675174150602, "grad_norm": 1.0555094480514526, "learning_rate": 1.5384514684802032e-06, "loss": 0.0525, "step": 316125 }, { "epoch": 4.656043357240689, "grad_norm": 1.6777547597885132, "learning_rate": 1.5368150965132187e-06, "loss": 0.0473, "step": 316150 }, { "epoch": 4.656411540330776, "grad_norm": 1.5286930799484253, "learning_rate": 1.535178724546234e-06, "loss": 0.0504, "step": 316175 }, { "epoch": 4.656779723420863, "grad_norm": 1.203751802444458, "learning_rate": 1.5335423525792495e-06, "loss": 0.0443, "step": 316200 }, { "epoch": 4.65714790651095, "grad_norm": 1.884735345840454, "learning_rate": 1.5319059806122653e-06, "loss": 0.0484, "step": 316225 }, { "epoch": 4.657516089601037, "grad_norm": 1.0852811336517334, "learning_rate": 1.5302696086452806e-06, "loss": 0.0533, "step": 316250 }, { "epoch": 4.657884272691124, "grad_norm": 0.999225914478302, "learning_rate": 1.528633236678296e-06, "loss": 0.0483, "step": 316275 }, { "epoch": 4.658252455781211, "grad_norm": 1.2926151752471924, "learning_rate": 1.5269968647113114e-06, "loss": 0.0528, "step": 316300 }, { "epoch": 4.658620638871298, "grad_norm": 1.5772346258163452, "learning_rate": 1.525360492744327e-06, "loss": 0.0542, "step": 316325 }, { "epoch": 4.658988821961385, "grad_norm": 1.1165692806243896, "learning_rate": 1.5237241207773424e-06, "loss": 0.052, "step": 316350 }, { "epoch": 4.659357005051472, "grad_norm": 1.0065970420837402, "learning_rate": 1.5220877488103577e-06, "loss": 0.0504, "step": 316375 }, { "epoch": 4.659725188141559, "grad_norm": 1.510586142539978, "learning_rate": 1.5204513768433732e-06, "loss": 0.0536, "step": 316400 }, { "epoch": 4.660093371231646, "grad_norm": 1.4851386547088623, "learning_rate": 1.5188150048763885e-06, "loss": 0.0471, "step": 316425 }, { "epoch": 4.660461554321733, "grad_norm": 1.5317485332489014, "learning_rate": 1.517178632909404e-06, "loss": 0.05, "step": 316450 }, { "epoch": 4.66082973741182, "grad_norm": 1.4782650470733643, "learning_rate": 1.5155422609424194e-06, "loss": 0.0565, "step": 316475 }, { "epoch": 4.661197920501907, "grad_norm": 1.536999225616455, "learning_rate": 1.5139058889754349e-06, "loss": 0.0485, "step": 316500 }, { "epoch": 4.661566103591994, "grad_norm": 1.1295926570892334, "learning_rate": 1.5122695170084504e-06, "loss": 0.0534, "step": 316525 }, { "epoch": 4.661934286682081, "grad_norm": 1.2453328371047974, "learning_rate": 1.5106331450414657e-06, "loss": 0.0514, "step": 316550 }, { "epoch": 4.662302469772168, "grad_norm": 1.397569179534912, "learning_rate": 1.5089967730744812e-06, "loss": 0.0512, "step": 316575 }, { "epoch": 4.662670652862255, "grad_norm": 1.7768785953521729, "learning_rate": 1.5073604011074965e-06, "loss": 0.0559, "step": 316600 }, { "epoch": 4.663038835952342, "grad_norm": 1.3191170692443848, "learning_rate": 1.505724029140512e-06, "loss": 0.0566, "step": 316625 }, { "epoch": 4.663407019042429, "grad_norm": 1.1507437229156494, "learning_rate": 1.5040876571735276e-06, "loss": 0.0528, "step": 316650 }, { "epoch": 4.663775202132516, "grad_norm": 1.3382512331008911, "learning_rate": 1.5024512852065429e-06, "loss": 0.0525, "step": 316675 }, { "epoch": 4.664143385222603, "grad_norm": 1.2620623111724854, "learning_rate": 1.5008149132395584e-06, "loss": 0.0503, "step": 316700 }, { "epoch": 4.66451156831269, "grad_norm": 1.5619930028915405, "learning_rate": 1.499178541272574e-06, "loss": 0.0506, "step": 316725 }, { "epoch": 4.664879751402777, "grad_norm": 1.284537672996521, "learning_rate": 1.4975421693055894e-06, "loss": 0.0528, "step": 316750 }, { "epoch": 4.665247934492864, "grad_norm": 1.6497796773910522, "learning_rate": 1.495905797338605e-06, "loss": 0.0492, "step": 316775 }, { "epoch": 4.665616117582951, "grad_norm": 1.3899180889129639, "learning_rate": 1.4942694253716202e-06, "loss": 0.05, "step": 316800 }, { "epoch": 4.665984300673038, "grad_norm": 1.125265121459961, "learning_rate": 1.4926330534046357e-06, "loss": 0.0456, "step": 316825 }, { "epoch": 4.666352483763125, "grad_norm": 1.2792105674743652, "learning_rate": 1.4910621363163305e-06, "loss": 0.0563, "step": 316850 }, { "epoch": 4.666720666853212, "grad_norm": 1.3837476968765259, "learning_rate": 1.489425764349346e-06, "loss": 0.051, "step": 316875 }, { "epoch": 4.667088849943299, "grad_norm": 1.2632555961608887, "learning_rate": 1.4877893923823613e-06, "loss": 0.0503, "step": 316900 }, { "epoch": 4.667457033033386, "grad_norm": 1.1787450313568115, "learning_rate": 1.4861530204153768e-06, "loss": 0.0572, "step": 316925 }, { "epoch": 4.6678252161234735, "grad_norm": 0.8252682089805603, "learning_rate": 1.4845166484483923e-06, "loss": 0.0487, "step": 316950 }, { "epoch": 4.6681933992135605, "grad_norm": 1.583823561668396, "learning_rate": 1.4828802764814076e-06, "loss": 0.0521, "step": 316975 }, { "epoch": 4.6685615823036475, "grad_norm": 0.885945200920105, "learning_rate": 1.4812439045144231e-06, "loss": 0.0522, "step": 317000 }, { "epoch": 4.6689297653937345, "grad_norm": 1.4736963510513306, "learning_rate": 1.4796075325474384e-06, "loss": 0.0496, "step": 317025 }, { "epoch": 4.669297948483822, "grad_norm": 1.4008076190948486, "learning_rate": 1.477971160580454e-06, "loss": 0.054, "step": 317050 }, { "epoch": 4.669666131573909, "grad_norm": 1.1805485486984253, "learning_rate": 1.4763347886134693e-06, "loss": 0.0479, "step": 317075 }, { "epoch": 4.670034314663996, "grad_norm": 1.7553777694702148, "learning_rate": 1.4746984166464848e-06, "loss": 0.0585, "step": 317100 }, { "epoch": 4.6704024977540834, "grad_norm": 1.246410608291626, "learning_rate": 1.4730620446795003e-06, "loss": 0.0483, "step": 317125 }, { "epoch": 4.6707706808441705, "grad_norm": 1.301881194114685, "learning_rate": 1.4714256727125156e-06, "loss": 0.0514, "step": 317150 }, { "epoch": 4.6711388639342575, "grad_norm": 1.1803393363952637, "learning_rate": 1.4697893007455311e-06, "loss": 0.0517, "step": 317175 }, { "epoch": 4.6715070470243445, "grad_norm": 1.532606601715088, "learning_rate": 1.4681529287785468e-06, "loss": 0.055, "step": 317200 }, { "epoch": 4.6718752301144315, "grad_norm": 1.0925289392471313, "learning_rate": 1.4665165568115622e-06, "loss": 0.0449, "step": 317225 }, { "epoch": 4.6722434132045185, "grad_norm": 1.2983659505844116, "learning_rate": 1.464945639723257e-06, "loss": 0.0494, "step": 317250 }, { "epoch": 4.6726115962946055, "grad_norm": 1.8341485261917114, "learning_rate": 1.4633092677562724e-06, "loss": 0.052, "step": 317275 }, { "epoch": 4.6729797793846926, "grad_norm": 1.4778763055801392, "learning_rate": 1.461672895789288e-06, "loss": 0.0533, "step": 317300 }, { "epoch": 4.67334796247478, "grad_norm": 1.5188133716583252, "learning_rate": 1.4600365238223032e-06, "loss": 0.0488, "step": 317325 }, { "epoch": 4.673716145564867, "grad_norm": 1.1263964176177979, "learning_rate": 1.4584001518553187e-06, "loss": 0.0454, "step": 317350 }, { "epoch": 4.674084328654954, "grad_norm": 1.165518879890442, "learning_rate": 1.456763779888334e-06, "loss": 0.0471, "step": 317375 }, { "epoch": 4.674452511745041, "grad_norm": 1.2447923421859741, "learning_rate": 1.4551274079213496e-06, "loss": 0.0537, "step": 317400 }, { "epoch": 4.674820694835128, "grad_norm": 1.7116609811782837, "learning_rate": 1.453491035954365e-06, "loss": 0.0519, "step": 317425 }, { "epoch": 4.675188877925215, "grad_norm": 1.2317157983779907, "learning_rate": 1.4518546639873804e-06, "loss": 0.0485, "step": 317450 }, { "epoch": 4.675557061015302, "grad_norm": 1.0174763202667236, "learning_rate": 1.4502182920203959e-06, "loss": 0.0489, "step": 317475 }, { "epoch": 4.675925244105389, "grad_norm": 1.6003159284591675, "learning_rate": 1.4485819200534112e-06, "loss": 0.05, "step": 317500 }, { "epoch": 4.676293427195476, "grad_norm": 1.542261004447937, "learning_rate": 1.4469455480864267e-06, "loss": 0.0538, "step": 317525 }, { "epoch": 4.676661610285563, "grad_norm": 1.2530255317687988, "learning_rate": 1.445309176119442e-06, "loss": 0.0564, "step": 317550 }, { "epoch": 4.67702979337565, "grad_norm": 1.2042635679244995, "learning_rate": 1.4436728041524575e-06, "loss": 0.0502, "step": 317575 }, { "epoch": 4.677397976465737, "grad_norm": 1.1547775268554688, "learning_rate": 1.442036432185473e-06, "loss": 0.0545, "step": 317600 }, { "epoch": 4.677766159555824, "grad_norm": 1.435349941253662, "learning_rate": 1.4404000602184884e-06, "loss": 0.0541, "step": 317625 }, { "epoch": 4.678134342645911, "grad_norm": 1.601083517074585, "learning_rate": 1.4387636882515039e-06, "loss": 0.0539, "step": 317650 }, { "epoch": 4.678502525735998, "grad_norm": 1.3502408266067505, "learning_rate": 1.4371273162845196e-06, "loss": 0.0572, "step": 317675 }, { "epoch": 4.678870708826085, "grad_norm": 1.0823687314987183, "learning_rate": 1.435490944317535e-06, "loss": 0.0443, "step": 317700 }, { "epoch": 4.679238891916172, "grad_norm": 1.1347333192825317, "learning_rate": 1.4338545723505504e-06, "loss": 0.0489, "step": 317725 }, { "epoch": 4.679607075006259, "grad_norm": 1.3563753366470337, "learning_rate": 1.4322182003835657e-06, "loss": 0.0511, "step": 317750 }, { "epoch": 4.679975258096346, "grad_norm": 1.3053771257400513, "learning_rate": 1.4305818284165812e-06, "loss": 0.0493, "step": 317775 }, { "epoch": 4.680343441186433, "grad_norm": 1.4208966493606567, "learning_rate": 1.4289454564495968e-06, "loss": 0.0562, "step": 317800 }, { "epoch": 4.68071162427652, "grad_norm": 1.2151519060134888, "learning_rate": 1.427309084482612e-06, "loss": 0.05, "step": 317825 }, { "epoch": 4.681079807366607, "grad_norm": 1.4731098413467407, "learning_rate": 1.4256727125156276e-06, "loss": 0.0488, "step": 317850 }, { "epoch": 4.681447990456695, "grad_norm": 1.3843494653701782, "learning_rate": 1.4240363405486429e-06, "loss": 0.0521, "step": 317875 }, { "epoch": 4.681816173546782, "grad_norm": 1.7827988862991333, "learning_rate": 1.4223999685816584e-06, "loss": 0.0505, "step": 317900 }, { "epoch": 4.682184356636869, "grad_norm": 1.2990777492523193, "learning_rate": 1.4207635966146737e-06, "loss": 0.0536, "step": 317925 }, { "epoch": 4.682552539726956, "grad_norm": 1.0244513750076294, "learning_rate": 1.4191272246476892e-06, "loss": 0.0533, "step": 317950 }, { "epoch": 4.682920722817043, "grad_norm": 1.6147840023040771, "learning_rate": 1.4174908526807047e-06, "loss": 0.0499, "step": 317975 }, { "epoch": 4.68328890590713, "grad_norm": 1.1264728307724, "learning_rate": 1.41585448071372e-06, "loss": 0.055, "step": 318000 }, { "epoch": 4.683657088997217, "grad_norm": 1.2167012691497803, "learning_rate": 1.4142181087467356e-06, "loss": 0.0556, "step": 318025 }, { "epoch": 4.684025272087304, "grad_norm": 1.3691784143447876, "learning_rate": 1.4125817367797509e-06, "loss": 0.0452, "step": 318050 }, { "epoch": 4.684393455177391, "grad_norm": 1.1392791271209717, "learning_rate": 1.4109453648127664e-06, "loss": 0.0532, "step": 318075 }, { "epoch": 4.684761638267478, "grad_norm": 1.8969905376434326, "learning_rate": 1.4093089928457817e-06, "loss": 0.0508, "step": 318100 }, { "epoch": 4.685129821357565, "grad_norm": 1.0668405294418335, "learning_rate": 1.4076726208787972e-06, "loss": 0.052, "step": 318125 }, { "epoch": 4.685498004447652, "grad_norm": 1.4562944173812866, "learning_rate": 1.406036248911813e-06, "loss": 0.0512, "step": 318150 }, { "epoch": 4.685866187537739, "grad_norm": 1.69785737991333, "learning_rate": 1.4043998769448282e-06, "loss": 0.0484, "step": 318175 }, { "epoch": 4.686234370627826, "grad_norm": 0.809941828250885, "learning_rate": 1.4027635049778437e-06, "loss": 0.0472, "step": 318200 }, { "epoch": 4.686602553717913, "grad_norm": 1.2320263385772705, "learning_rate": 1.4011271330108593e-06, "loss": 0.0493, "step": 318225 }, { "epoch": 4.686970736808, "grad_norm": 1.3246911764144897, "learning_rate": 1.3994907610438746e-06, "loss": 0.0551, "step": 318250 }, { "epoch": 4.687338919898087, "grad_norm": 1.4349480867385864, "learning_rate": 1.39785438907689e-06, "loss": 0.0475, "step": 318275 }, { "epoch": 4.687707102988174, "grad_norm": 0.7472424507141113, "learning_rate": 1.3962180171099054e-06, "loss": 0.0539, "step": 318300 }, { "epoch": 4.688075286078261, "grad_norm": 1.3324025869369507, "learning_rate": 1.394581645142921e-06, "loss": 0.049, "step": 318325 }, { "epoch": 4.688443469168348, "grad_norm": 1.3278992176055908, "learning_rate": 1.3929452731759362e-06, "loss": 0.0494, "step": 318350 }, { "epoch": 4.688811652258435, "grad_norm": 1.0949945449829102, "learning_rate": 1.3913089012089517e-06, "loss": 0.0513, "step": 318375 }, { "epoch": 4.689179835348522, "grad_norm": 0.8925934433937073, "learning_rate": 1.3896725292419672e-06, "loss": 0.0485, "step": 318400 }, { "epoch": 4.689548018438609, "grad_norm": 1.1080822944641113, "learning_rate": 1.3880361572749825e-06, "loss": 0.0493, "step": 318425 }, { "epoch": 4.689916201528696, "grad_norm": 1.1725772619247437, "learning_rate": 1.386399785307998e-06, "loss": 0.0478, "step": 318450 }, { "epoch": 4.690284384618783, "grad_norm": 1.3580622673034668, "learning_rate": 1.3847634133410134e-06, "loss": 0.0561, "step": 318475 }, { "epoch": 4.69065256770887, "grad_norm": 1.0833568572998047, "learning_rate": 1.3831270413740289e-06, "loss": 0.0476, "step": 318500 }, { "epoch": 4.691020750798957, "grad_norm": 1.0186476707458496, "learning_rate": 1.3814906694070444e-06, "loss": 0.0576, "step": 318525 }, { "epoch": 4.691388933889044, "grad_norm": 1.232149600982666, "learning_rate": 1.3798542974400597e-06, "loss": 0.0512, "step": 318550 }, { "epoch": 4.691757116979131, "grad_norm": 1.1767240762710571, "learning_rate": 1.3782179254730752e-06, "loss": 0.0464, "step": 318575 }, { "epoch": 4.692125300069218, "grad_norm": 1.4793529510498047, "learning_rate": 1.3765815535060905e-06, "loss": 0.0555, "step": 318600 }, { "epoch": 4.692493483159305, "grad_norm": 1.2782994508743286, "learning_rate": 1.374945181539106e-06, "loss": 0.0506, "step": 318625 }, { "epoch": 4.692861666249392, "grad_norm": 1.3629090785980225, "learning_rate": 1.3733088095721218e-06, "loss": 0.0581, "step": 318650 }, { "epoch": 4.693229849339479, "grad_norm": 0.9741042256355286, "learning_rate": 1.371672437605137e-06, "loss": 0.0594, "step": 318675 }, { "epoch": 4.693598032429566, "grad_norm": 1.7201316356658936, "learning_rate": 1.3700360656381526e-06, "loss": 0.0553, "step": 318700 }, { "epoch": 4.693966215519653, "grad_norm": 1.2854005098342896, "learning_rate": 1.3683996936711679e-06, "loss": 0.048, "step": 318725 }, { "epoch": 4.69433439860974, "grad_norm": 1.5507843494415283, "learning_rate": 1.3667633217041834e-06, "loss": 0.053, "step": 318750 }, { "epoch": 4.694702581699827, "grad_norm": 1.2158867120742798, "learning_rate": 1.365126949737199e-06, "loss": 0.0502, "step": 318775 }, { "epoch": 4.695070764789914, "grad_norm": 1.0017958879470825, "learning_rate": 1.3634905777702142e-06, "loss": 0.0484, "step": 318800 }, { "epoch": 4.695438947880001, "grad_norm": 1.4825832843780518, "learning_rate": 1.3618542058032297e-06, "loss": 0.0525, "step": 318825 }, { "epoch": 4.695807130970088, "grad_norm": 1.5330793857574463, "learning_rate": 1.360217833836245e-06, "loss": 0.0543, "step": 318850 }, { "epoch": 4.696175314060175, "grad_norm": 1.3737893104553223, "learning_rate": 1.3585814618692606e-06, "loss": 0.057, "step": 318875 }, { "epoch": 4.696543497150262, "grad_norm": 1.6639982461929321, "learning_rate": 1.3569450899022759e-06, "loss": 0.0535, "step": 318900 }, { "epoch": 4.696911680240349, "grad_norm": 1.3726798295974731, "learning_rate": 1.3553087179352914e-06, "loss": 0.047, "step": 318925 }, { "epoch": 4.697279863330437, "grad_norm": 1.4801117181777954, "learning_rate": 1.353672345968307e-06, "loss": 0.0511, "step": 318950 }, { "epoch": 4.697648046420524, "grad_norm": 1.135445475578308, "learning_rate": 1.3520359740013222e-06, "loss": 0.0519, "step": 318975 }, { "epoch": 4.698016229510611, "grad_norm": 1.2365450859069824, "learning_rate": 1.3503996020343377e-06, "loss": 0.0494, "step": 319000 }, { "epoch": 4.698384412600698, "grad_norm": 1.4642046689987183, "learning_rate": 1.348763230067353e-06, "loss": 0.0475, "step": 319025 }, { "epoch": 4.698752595690785, "grad_norm": 1.3627007007598877, "learning_rate": 1.3471268581003685e-06, "loss": 0.0505, "step": 319050 }, { "epoch": 4.699120778780872, "grad_norm": 1.0912110805511475, "learning_rate": 1.345490486133384e-06, "loss": 0.0557, "step": 319075 }, { "epoch": 4.699488961870959, "grad_norm": 1.7698466777801514, "learning_rate": 1.3438541141663994e-06, "loss": 0.0489, "step": 319100 }, { "epoch": 4.6998571449610465, "grad_norm": 0.9112261533737183, "learning_rate": 1.3422177421994149e-06, "loss": 0.0552, "step": 319125 }, { "epoch": 4.7002253280511335, "grad_norm": 1.3635870218276978, "learning_rate": 1.3405813702324306e-06, "loss": 0.0512, "step": 319150 }, { "epoch": 4.7005935111412205, "grad_norm": 0.8168050050735474, "learning_rate": 1.338944998265446e-06, "loss": 0.0521, "step": 319175 }, { "epoch": 4.7009616942313075, "grad_norm": 1.0441139936447144, "learning_rate": 1.3373086262984614e-06, "loss": 0.0506, "step": 319200 }, { "epoch": 4.7013298773213945, "grad_norm": 1.6629910469055176, "learning_rate": 1.3356722543314767e-06, "loss": 0.0503, "step": 319225 }, { "epoch": 4.7016980604114815, "grad_norm": 1.2574551105499268, "learning_rate": 1.3340358823644922e-06, "loss": 0.0515, "step": 319250 }, { "epoch": 4.7020662435015685, "grad_norm": 1.3636797666549683, "learning_rate": 1.3323995103975075e-06, "loss": 0.0486, "step": 319275 }, { "epoch": 4.702434426591656, "grad_norm": 0.7224562764167786, "learning_rate": 1.330763138430523e-06, "loss": 0.0509, "step": 319300 }, { "epoch": 4.702802609681743, "grad_norm": 1.9357801675796509, "learning_rate": 1.3291267664635386e-06, "loss": 0.057, "step": 319325 }, { "epoch": 4.70317079277183, "grad_norm": 0.9637460112571716, "learning_rate": 1.3274903944965539e-06, "loss": 0.0489, "step": 319350 }, { "epoch": 4.703538975861917, "grad_norm": 1.1193232536315918, "learning_rate": 1.3258540225295694e-06, "loss": 0.0565, "step": 319375 }, { "epoch": 4.703907158952004, "grad_norm": 1.5464826822280884, "learning_rate": 1.3242831054412641e-06, "loss": 0.0537, "step": 319400 }, { "epoch": 4.704275342042091, "grad_norm": 1.3487155437469482, "learning_rate": 1.3226467334742796e-06, "loss": 0.0501, "step": 319425 }, { "epoch": 4.704643525132178, "grad_norm": 1.546373724937439, "learning_rate": 1.321010361507295e-06, "loss": 0.0525, "step": 319450 }, { "epoch": 4.705011708222265, "grad_norm": 1.136923909187317, "learning_rate": 1.3193739895403105e-06, "loss": 0.0534, "step": 319475 }, { "epoch": 4.705379891312352, "grad_norm": 1.3697749376296997, "learning_rate": 1.3177376175733258e-06, "loss": 0.0518, "step": 319500 }, { "epoch": 4.705748074402439, "grad_norm": 1.372707724571228, "learning_rate": 1.3161012456063413e-06, "loss": 0.0472, "step": 319525 }, { "epoch": 4.706116257492526, "grad_norm": 1.7428264617919922, "learning_rate": 1.3144648736393568e-06, "loss": 0.0524, "step": 319550 }, { "epoch": 4.706484440582613, "grad_norm": 1.4106968641281128, "learning_rate": 1.312828501672372e-06, "loss": 0.0515, "step": 319575 }, { "epoch": 4.7068526236727, "grad_norm": 1.0190820693969727, "learning_rate": 1.3111921297053876e-06, "loss": 0.054, "step": 319600 }, { "epoch": 4.707220806762787, "grad_norm": 1.2995719909667969, "learning_rate": 1.3095557577384033e-06, "loss": 0.046, "step": 319625 }, { "epoch": 4.707588989852874, "grad_norm": 1.1809227466583252, "learning_rate": 1.3079193857714187e-06, "loss": 0.0492, "step": 319650 }, { "epoch": 4.707957172942961, "grad_norm": 1.3847019672393799, "learning_rate": 1.3062830138044342e-06, "loss": 0.048, "step": 319675 }, { "epoch": 4.708325356033048, "grad_norm": 1.6168261766433716, "learning_rate": 1.3046466418374495e-06, "loss": 0.0538, "step": 319700 }, { "epoch": 4.708693539123135, "grad_norm": 1.3727226257324219, "learning_rate": 1.303010269870465e-06, "loss": 0.0529, "step": 319725 }, { "epoch": 4.709061722213222, "grad_norm": 1.4479005336761475, "learning_rate": 1.3013738979034803e-06, "loss": 0.0504, "step": 319750 }, { "epoch": 4.709429905303309, "grad_norm": 1.310695767402649, "learning_rate": 1.2997375259364958e-06, "loss": 0.0475, "step": 319775 }, { "epoch": 4.709798088393397, "grad_norm": 0.8655557632446289, "learning_rate": 1.2981011539695113e-06, "loss": 0.0493, "step": 319800 }, { "epoch": 4.710166271483484, "grad_norm": 1.3890773057937622, "learning_rate": 1.2964647820025266e-06, "loss": 0.0424, "step": 319825 }, { "epoch": 4.710534454573571, "grad_norm": 1.1416980028152466, "learning_rate": 1.2948284100355421e-06, "loss": 0.0532, "step": 319850 }, { "epoch": 4.710902637663658, "grad_norm": 1.0013505220413208, "learning_rate": 1.2931920380685575e-06, "loss": 0.0466, "step": 319875 }, { "epoch": 4.711270820753745, "grad_norm": 1.165756106376648, "learning_rate": 1.291555666101573e-06, "loss": 0.0556, "step": 319900 }, { "epoch": 4.711639003843832, "grad_norm": 1.3401901721954346, "learning_rate": 1.2899192941345885e-06, "loss": 0.0453, "step": 319925 }, { "epoch": 4.712007186933919, "grad_norm": 1.4125206470489502, "learning_rate": 1.2882829221676038e-06, "loss": 0.0433, "step": 319950 }, { "epoch": 4.712375370024006, "grad_norm": 1.076216459274292, "learning_rate": 1.2866465502006193e-06, "loss": 0.0511, "step": 319975 }, { "epoch": 4.712743553114093, "grad_norm": 1.5663036108016968, "learning_rate": 1.2850101782336346e-06, "loss": 0.0518, "step": 320000 }, { "epoch": 4.71311173620418, "grad_norm": 1.4842246770858765, "learning_rate": 1.2833738062666501e-06, "loss": 0.0592, "step": 320025 }, { "epoch": 4.713479919294267, "grad_norm": 1.2105618715286255, "learning_rate": 1.2817374342996654e-06, "loss": 0.05, "step": 320050 }, { "epoch": 4.713848102384354, "grad_norm": 1.1992719173431396, "learning_rate": 1.280101062332681e-06, "loss": 0.0516, "step": 320075 }, { "epoch": 4.714216285474441, "grad_norm": 1.4981083869934082, "learning_rate": 1.2784646903656967e-06, "loss": 0.0522, "step": 320100 }, { "epoch": 4.714584468564528, "grad_norm": 1.3090482950210571, "learning_rate": 1.276828318398712e-06, "loss": 0.0539, "step": 320125 }, { "epoch": 4.714952651654615, "grad_norm": 1.3454090356826782, "learning_rate": 1.2751919464317275e-06, "loss": 0.0549, "step": 320150 }, { "epoch": 4.715320834744702, "grad_norm": 1.3377399444580078, "learning_rate": 1.273555574464743e-06, "loss": 0.0504, "step": 320175 }, { "epoch": 4.715689017834789, "grad_norm": 1.4805724620819092, "learning_rate": 1.2719192024977583e-06, "loss": 0.0515, "step": 320200 }, { "epoch": 4.716057200924876, "grad_norm": 1.2056387662887573, "learning_rate": 1.2702828305307738e-06, "loss": 0.0479, "step": 320225 }, { "epoch": 4.716425384014963, "grad_norm": 1.34861159324646, "learning_rate": 1.2686464585637891e-06, "loss": 0.0482, "step": 320250 }, { "epoch": 4.71679356710505, "grad_norm": 0.8846838474273682, "learning_rate": 1.2670100865968047e-06, "loss": 0.0492, "step": 320275 }, { "epoch": 4.717161750195137, "grad_norm": 1.5410267114639282, "learning_rate": 1.26537371462982e-06, "loss": 0.0478, "step": 320300 }, { "epoch": 4.717529933285224, "grad_norm": 1.4455264806747437, "learning_rate": 1.2637373426628355e-06, "loss": 0.0547, "step": 320325 }, { "epoch": 4.717898116375311, "grad_norm": 1.443855881690979, "learning_rate": 1.262100970695851e-06, "loss": 0.0474, "step": 320350 }, { "epoch": 4.718266299465398, "grad_norm": 1.4226480722427368, "learning_rate": 1.2604645987288663e-06, "loss": 0.0525, "step": 320375 }, { "epoch": 4.718634482555485, "grad_norm": 1.2856069803237915, "learning_rate": 1.2588282267618818e-06, "loss": 0.0495, "step": 320400 }, { "epoch": 4.719002665645572, "grad_norm": 1.7483409643173218, "learning_rate": 1.2571918547948971e-06, "loss": 0.0494, "step": 320425 }, { "epoch": 4.719370848735659, "grad_norm": 1.3221842050552368, "learning_rate": 1.2555554828279126e-06, "loss": 0.0481, "step": 320450 }, { "epoch": 4.719739031825746, "grad_norm": 1.5567339658737183, "learning_rate": 1.2539191108609281e-06, "loss": 0.0499, "step": 320475 }, { "epoch": 4.720107214915833, "grad_norm": 1.3565034866333008, "learning_rate": 1.2522827388939435e-06, "loss": 0.0521, "step": 320500 }, { "epoch": 4.72047539800592, "grad_norm": 0.942406415939331, "learning_rate": 1.250646366926959e-06, "loss": 0.0531, "step": 320525 }, { "epoch": 4.720843581096007, "grad_norm": 1.432540774345398, "learning_rate": 1.2490099949599745e-06, "loss": 0.0516, "step": 320550 }, { "epoch": 4.721211764186094, "grad_norm": 0.9517788887023926, "learning_rate": 1.2473736229929898e-06, "loss": 0.0521, "step": 320575 }, { "epoch": 4.721579947276181, "grad_norm": 1.0094650983810425, "learning_rate": 1.2457372510260053e-06, "loss": 0.0555, "step": 320600 }, { "epoch": 4.721948130366268, "grad_norm": 1.7948014736175537, "learning_rate": 1.2441008790590208e-06, "loss": 0.0538, "step": 320625 }, { "epoch": 4.722316313456355, "grad_norm": 1.6364896297454834, "learning_rate": 1.2424645070920361e-06, "loss": 0.0466, "step": 320650 }, { "epoch": 4.722684496546442, "grad_norm": 1.1485779285430908, "learning_rate": 1.2408281351250516e-06, "loss": 0.0502, "step": 320675 }, { "epoch": 4.723052679636529, "grad_norm": 1.5522489547729492, "learning_rate": 1.239191763158067e-06, "loss": 0.0534, "step": 320700 }, { "epoch": 4.723420862726616, "grad_norm": 1.1088215112686157, "learning_rate": 1.2375553911910827e-06, "loss": 0.049, "step": 320725 }, { "epoch": 4.723789045816703, "grad_norm": 1.452671766281128, "learning_rate": 1.235919019224098e-06, "loss": 0.0522, "step": 320750 }, { "epoch": 4.72415722890679, "grad_norm": 1.465901494026184, "learning_rate": 1.2342826472571135e-06, "loss": 0.0476, "step": 320775 }, { "epoch": 4.724525411996877, "grad_norm": 0.9885482788085938, "learning_rate": 1.2326462752901288e-06, "loss": 0.0492, "step": 320800 }, { "epoch": 4.724893595086964, "grad_norm": 1.0932338237762451, "learning_rate": 1.2310099033231443e-06, "loss": 0.0538, "step": 320825 }, { "epoch": 4.725261778177051, "grad_norm": 1.2269691228866577, "learning_rate": 1.2293735313561596e-06, "loss": 0.0466, "step": 320850 }, { "epoch": 4.725629961267139, "grad_norm": 1.564968228340149, "learning_rate": 1.2277371593891751e-06, "loss": 0.0529, "step": 320875 }, { "epoch": 4.725998144357226, "grad_norm": 1.5487475395202637, "learning_rate": 1.2261007874221907e-06, "loss": 0.0554, "step": 320900 }, { "epoch": 4.726366327447313, "grad_norm": 1.544245958328247, "learning_rate": 1.224464415455206e-06, "loss": 0.0594, "step": 320925 }, { "epoch": 4.7267345105374, "grad_norm": 1.680324673652649, "learning_rate": 1.2228280434882215e-06, "loss": 0.0536, "step": 320950 }, { "epoch": 4.727102693627487, "grad_norm": 1.075310468673706, "learning_rate": 1.221191671521237e-06, "loss": 0.0481, "step": 320975 }, { "epoch": 4.727470876717574, "grad_norm": 0.7573911547660828, "learning_rate": 1.2195552995542525e-06, "loss": 0.056, "step": 321000 }, { "epoch": 4.727839059807661, "grad_norm": 1.0710138082504272, "learning_rate": 1.2179189275872678e-06, "loss": 0.0482, "step": 321025 }, { "epoch": 4.728207242897748, "grad_norm": 0.8604738116264343, "learning_rate": 1.2162825556202833e-06, "loss": 0.0522, "step": 321050 }, { "epoch": 4.728575425987835, "grad_norm": 1.9007833003997803, "learning_rate": 1.2146461836532986e-06, "loss": 0.0536, "step": 321075 }, { "epoch": 4.728943609077922, "grad_norm": 1.7964595556259155, "learning_rate": 1.2130098116863141e-06, "loss": 0.0481, "step": 321100 }, { "epoch": 4.7293117921680095, "grad_norm": 0.9984670877456665, "learning_rate": 1.2113734397193295e-06, "loss": 0.0434, "step": 321125 }, { "epoch": 4.7296799752580965, "grad_norm": 0.8985347747802734, "learning_rate": 1.209737067752345e-06, "loss": 0.0581, "step": 321150 }, { "epoch": 4.7300481583481835, "grad_norm": 1.394180178642273, "learning_rate": 1.2081006957853605e-06, "loss": 0.0499, "step": 321175 }, { "epoch": 4.7304163414382705, "grad_norm": 0.9982278347015381, "learning_rate": 1.2064643238183758e-06, "loss": 0.0503, "step": 321200 }, { "epoch": 4.7307845245283575, "grad_norm": 1.2349838018417358, "learning_rate": 1.2048279518513913e-06, "loss": 0.0513, "step": 321225 }, { "epoch": 4.7311527076184445, "grad_norm": 1.577845573425293, "learning_rate": 1.2031915798844068e-06, "loss": 0.056, "step": 321250 }, { "epoch": 4.7315208907085315, "grad_norm": 0.9992119073867798, "learning_rate": 1.2015552079174223e-06, "loss": 0.0532, "step": 321275 }, { "epoch": 4.731889073798619, "grad_norm": 1.7987030744552612, "learning_rate": 1.1999188359504376e-06, "loss": 0.0503, "step": 321300 }, { "epoch": 4.732257256888706, "grad_norm": 1.6102665662765503, "learning_rate": 1.1982824639834532e-06, "loss": 0.0535, "step": 321325 }, { "epoch": 4.732625439978793, "grad_norm": 1.149827480316162, "learning_rate": 1.1966460920164685e-06, "loss": 0.0433, "step": 321350 }, { "epoch": 4.73299362306888, "grad_norm": 1.6134099960327148, "learning_rate": 1.195009720049484e-06, "loss": 0.0543, "step": 321375 }, { "epoch": 4.733361806158967, "grad_norm": 1.3289073705673218, "learning_rate": 1.1933733480824993e-06, "loss": 0.0524, "step": 321400 }, { "epoch": 4.733729989249054, "grad_norm": 1.0444259643554688, "learning_rate": 1.1917369761155148e-06, "loss": 0.0496, "step": 321425 }, { "epoch": 4.734098172339141, "grad_norm": 1.2919409275054932, "learning_rate": 1.1901006041485303e-06, "loss": 0.0489, "step": 321450 }, { "epoch": 4.734466355429228, "grad_norm": 1.063719391822815, "learning_rate": 1.1884642321815458e-06, "loss": 0.0524, "step": 321475 }, { "epoch": 4.734834538519315, "grad_norm": 1.7920410633087158, "learning_rate": 1.1868278602145611e-06, "loss": 0.048, "step": 321500 }, { "epoch": 4.735202721609402, "grad_norm": 1.9228638410568237, "learning_rate": 1.1851914882475767e-06, "loss": 0.0467, "step": 321525 }, { "epoch": 4.735570904699489, "grad_norm": 1.3136377334594727, "learning_rate": 1.1835551162805922e-06, "loss": 0.0473, "step": 321550 }, { "epoch": 4.735939087789576, "grad_norm": 1.263770341873169, "learning_rate": 1.1819187443136075e-06, "loss": 0.0514, "step": 321575 }, { "epoch": 4.736307270879663, "grad_norm": 1.6247344017028809, "learning_rate": 1.180282372346623e-06, "loss": 0.0545, "step": 321600 }, { "epoch": 4.73667545396975, "grad_norm": 1.4460808038711548, "learning_rate": 1.1786460003796383e-06, "loss": 0.0504, "step": 321625 }, { "epoch": 4.737043637059837, "grad_norm": 0.8617902398109436, "learning_rate": 1.1770096284126538e-06, "loss": 0.0551, "step": 321650 }, { "epoch": 4.737411820149924, "grad_norm": 1.2764588594436646, "learning_rate": 1.1753732564456691e-06, "loss": 0.0548, "step": 321675 }, { "epoch": 4.737780003240011, "grad_norm": 1.4454516172409058, "learning_rate": 1.1737368844786848e-06, "loss": 0.0488, "step": 321700 }, { "epoch": 4.738148186330099, "grad_norm": 1.0546947717666626, "learning_rate": 1.1721005125117001e-06, "loss": 0.0508, "step": 321725 }, { "epoch": 4.738516369420186, "grad_norm": 1.3404879570007324, "learning_rate": 1.1704641405447157e-06, "loss": 0.0571, "step": 321750 }, { "epoch": 4.738884552510273, "grad_norm": 1.3486171960830688, "learning_rate": 1.168827768577731e-06, "loss": 0.0529, "step": 321775 }, { "epoch": 4.73925273560036, "grad_norm": 1.0811330080032349, "learning_rate": 1.1671913966107465e-06, "loss": 0.0574, "step": 321800 }, { "epoch": 4.739620918690447, "grad_norm": 1.2224199771881104, "learning_rate": 1.165555024643762e-06, "loss": 0.0469, "step": 321825 }, { "epoch": 4.739989101780534, "grad_norm": 1.5925763845443726, "learning_rate": 1.1639186526767773e-06, "loss": 0.0521, "step": 321850 }, { "epoch": 4.740357284870621, "grad_norm": 1.9667174816131592, "learning_rate": 1.1622822807097928e-06, "loss": 0.0629, "step": 321875 }, { "epoch": 4.740725467960708, "grad_norm": 1.2191787958145142, "learning_rate": 1.1606459087428081e-06, "loss": 0.0547, "step": 321900 }, { "epoch": 4.741093651050795, "grad_norm": 1.41875422000885, "learning_rate": 1.1590095367758236e-06, "loss": 0.0511, "step": 321925 }, { "epoch": 4.741461834140882, "grad_norm": 1.2733855247497559, "learning_rate": 1.1573731648088392e-06, "loss": 0.0516, "step": 321950 }, { "epoch": 4.741830017230969, "grad_norm": 1.1774779558181763, "learning_rate": 1.1557367928418547e-06, "loss": 0.0479, "step": 321975 }, { "epoch": 4.742198200321056, "grad_norm": 1.20047128200531, "learning_rate": 1.15410042087487e-06, "loss": 0.0528, "step": 322000 }, { "epoch": 4.742566383411143, "grad_norm": 1.8065284490585327, "learning_rate": 1.1524640489078855e-06, "loss": 0.0515, "step": 322025 }, { "epoch": 4.74293456650123, "grad_norm": 1.3913235664367676, "learning_rate": 1.1508276769409008e-06, "loss": 0.0512, "step": 322050 }, { "epoch": 4.743302749591317, "grad_norm": 1.7780894041061401, "learning_rate": 1.1491913049739163e-06, "loss": 0.049, "step": 322075 }, { "epoch": 4.743670932681404, "grad_norm": 0.8062098026275635, "learning_rate": 1.1475549330069318e-06, "loss": 0.0557, "step": 322100 }, { "epoch": 4.744039115771491, "grad_norm": 1.3578803539276123, "learning_rate": 1.1459185610399471e-06, "loss": 0.0547, "step": 322125 }, { "epoch": 4.744407298861578, "grad_norm": 1.3337706327438354, "learning_rate": 1.1442821890729627e-06, "loss": 0.0597, "step": 322150 }, { "epoch": 4.744775481951665, "grad_norm": 1.9644591808319092, "learning_rate": 1.142645817105978e-06, "loss": 0.0532, "step": 322175 }, { "epoch": 4.745143665041752, "grad_norm": 1.4632443189620972, "learning_rate": 1.1410094451389937e-06, "loss": 0.0543, "step": 322200 }, { "epoch": 4.745511848131839, "grad_norm": 1.484179139137268, "learning_rate": 1.139373073172009e-06, "loss": 0.0524, "step": 322225 }, { "epoch": 4.745880031221926, "grad_norm": 1.3495906591415405, "learning_rate": 1.1377367012050245e-06, "loss": 0.054, "step": 322250 }, { "epoch": 4.746248214312013, "grad_norm": 0.9905472993850708, "learning_rate": 1.1361003292380398e-06, "loss": 0.0486, "step": 322275 }, { "epoch": 4.7466163974021, "grad_norm": 1.2342287302017212, "learning_rate": 1.1344639572710553e-06, "loss": 0.0504, "step": 322300 }, { "epoch": 4.746984580492187, "grad_norm": 0.6113099455833435, "learning_rate": 1.1328275853040706e-06, "loss": 0.0415, "step": 322325 }, { "epoch": 4.747352763582274, "grad_norm": 1.0404375791549683, "learning_rate": 1.1311912133370861e-06, "loss": 0.0504, "step": 322350 }, { "epoch": 4.747720946672361, "grad_norm": 1.4372038841247559, "learning_rate": 1.1295548413701017e-06, "loss": 0.0535, "step": 322375 }, { "epoch": 4.748089129762448, "grad_norm": 1.7727999687194824, "learning_rate": 1.127918469403117e-06, "loss": 0.0537, "step": 322400 }, { "epoch": 4.748457312852535, "grad_norm": 1.2488081455230713, "learning_rate": 1.1262820974361325e-06, "loss": 0.0501, "step": 322425 }, { "epoch": 4.748825495942622, "grad_norm": 0.9674785137176514, "learning_rate": 1.124645725469148e-06, "loss": 0.0542, "step": 322450 }, { "epoch": 4.749193679032709, "grad_norm": 1.5368746519088745, "learning_rate": 1.1230093535021635e-06, "loss": 0.053, "step": 322475 }, { "epoch": 4.749561862122796, "grad_norm": 1.7688888311386108, "learning_rate": 1.1213729815351788e-06, "loss": 0.0516, "step": 322500 }, { "epoch": 4.749930045212883, "grad_norm": 1.4937430620193481, "learning_rate": 1.1197366095681943e-06, "loss": 0.0507, "step": 322525 }, { "epoch": 4.75029822830297, "grad_norm": 1.249537706375122, "learning_rate": 1.1181002376012096e-06, "loss": 0.0505, "step": 322550 }, { "epoch": 4.750666411393057, "grad_norm": 1.3840093612670898, "learning_rate": 1.1164638656342252e-06, "loss": 0.0462, "step": 322575 }, { "epoch": 4.751034594483144, "grad_norm": 1.5492513179779053, "learning_rate": 1.1148274936672405e-06, "loss": 0.058, "step": 322600 }, { "epoch": 4.751402777573231, "grad_norm": 1.401488184928894, "learning_rate": 1.113191121700256e-06, "loss": 0.0512, "step": 322625 }, { "epoch": 4.751770960663318, "grad_norm": 1.3748246431350708, "learning_rate": 1.1115547497332715e-06, "loss": 0.061, "step": 322650 }, { "epoch": 4.752139143753405, "grad_norm": 1.383940577507019, "learning_rate": 1.109918377766287e-06, "loss": 0.0501, "step": 322675 }, { "epoch": 4.752507326843492, "grad_norm": 0.9253655672073364, "learning_rate": 1.1082820057993023e-06, "loss": 0.0528, "step": 322700 }, { "epoch": 4.752875509933579, "grad_norm": 0.9685051441192627, "learning_rate": 1.1066456338323178e-06, "loss": 0.0503, "step": 322725 }, { "epoch": 4.753243693023666, "grad_norm": 0.9926207661628723, "learning_rate": 1.1050092618653333e-06, "loss": 0.0534, "step": 322750 }, { "epoch": 4.753611876113753, "grad_norm": 1.3221259117126465, "learning_rate": 1.1033728898983486e-06, "loss": 0.0512, "step": 322775 }, { "epoch": 4.753980059203841, "grad_norm": 0.907096803188324, "learning_rate": 1.1017365179313642e-06, "loss": 0.0488, "step": 322800 }, { "epoch": 4.754348242293928, "grad_norm": 1.1337298154830933, "learning_rate": 1.1001001459643795e-06, "loss": 0.0536, "step": 322825 }, { "epoch": 4.754716425384015, "grad_norm": 1.404736042022705, "learning_rate": 1.098463773997395e-06, "loss": 0.0527, "step": 322850 }, { "epoch": 4.755084608474102, "grad_norm": 1.2021695375442505, "learning_rate": 1.0968274020304103e-06, "loss": 0.05, "step": 322875 }, { "epoch": 4.755452791564189, "grad_norm": 0.8419737815856934, "learning_rate": 1.0951910300634258e-06, "loss": 0.0549, "step": 322900 }, { "epoch": 4.755820974654276, "grad_norm": 1.5595836639404297, "learning_rate": 1.0935546580964413e-06, "loss": 0.05, "step": 322925 }, { "epoch": 4.756189157744363, "grad_norm": 1.1508982181549072, "learning_rate": 1.0919182861294568e-06, "loss": 0.0494, "step": 322950 }, { "epoch": 4.75655734083445, "grad_norm": 1.384660005569458, "learning_rate": 1.0902819141624721e-06, "loss": 0.0548, "step": 322975 }, { "epoch": 4.756925523924537, "grad_norm": 1.290677785873413, "learning_rate": 1.0886455421954877e-06, "loss": 0.0418, "step": 323000 }, { "epoch": 4.757293707014624, "grad_norm": 1.8103145360946655, "learning_rate": 1.0870091702285032e-06, "loss": 0.0547, "step": 323025 }, { "epoch": 4.757661890104711, "grad_norm": 1.6526089906692505, "learning_rate": 1.0853727982615185e-06, "loss": 0.0546, "step": 323050 }, { "epoch": 4.758030073194798, "grad_norm": 0.9802822470664978, "learning_rate": 1.083736426294534e-06, "loss": 0.0565, "step": 323075 }, { "epoch": 4.758398256284885, "grad_norm": 1.385122299194336, "learning_rate": 1.0821000543275493e-06, "loss": 0.0484, "step": 323100 }, { "epoch": 4.7587664393749725, "grad_norm": 0.9054996371269226, "learning_rate": 1.0804636823605648e-06, "loss": 0.0463, "step": 323125 }, { "epoch": 4.7591346224650595, "grad_norm": 1.4178301095962524, "learning_rate": 1.0788273103935801e-06, "loss": 0.0507, "step": 323150 }, { "epoch": 4.7595028055551465, "grad_norm": 0.997786819934845, "learning_rate": 1.0771909384265958e-06, "loss": 0.0566, "step": 323175 }, { "epoch": 4.7598709886452335, "grad_norm": 1.315610408782959, "learning_rate": 1.0755545664596112e-06, "loss": 0.0538, "step": 323200 }, { "epoch": 4.7602391717353205, "grad_norm": 1.4628230333328247, "learning_rate": 1.0739181944926267e-06, "loss": 0.0502, "step": 323225 }, { "epoch": 4.7606073548254075, "grad_norm": 1.667850136756897, "learning_rate": 1.072281822525642e-06, "loss": 0.0556, "step": 323250 }, { "epoch": 4.7609755379154945, "grad_norm": 0.65080726146698, "learning_rate": 1.0706454505586575e-06, "loss": 0.0532, "step": 323275 }, { "epoch": 4.761343721005582, "grad_norm": 0.9832432866096497, "learning_rate": 1.069009078591673e-06, "loss": 0.0473, "step": 323300 }, { "epoch": 4.761711904095669, "grad_norm": 1.011878252029419, "learning_rate": 1.0673727066246883e-06, "loss": 0.0556, "step": 323325 }, { "epoch": 4.762080087185756, "grad_norm": 0.9791728258132935, "learning_rate": 1.0657363346577038e-06, "loss": 0.0468, "step": 323350 }, { "epoch": 4.762448270275843, "grad_norm": 1.6162090301513672, "learning_rate": 1.0640999626907191e-06, "loss": 0.0514, "step": 323375 }, { "epoch": 4.76281645336593, "grad_norm": 1.2402819395065308, "learning_rate": 1.062529045602414e-06, "loss": 0.0546, "step": 323400 }, { "epoch": 4.763184636456017, "grad_norm": 1.3816293478012085, "learning_rate": 1.0608926736354296e-06, "loss": 0.0492, "step": 323425 }, { "epoch": 4.763552819546104, "grad_norm": 1.1788517236709595, "learning_rate": 1.0592563016684449e-06, "loss": 0.051, "step": 323450 }, { "epoch": 4.763921002636191, "grad_norm": 1.0061683654785156, "learning_rate": 1.0576199297014604e-06, "loss": 0.0455, "step": 323475 }, { "epoch": 4.764289185726278, "grad_norm": 1.5762227773666382, "learning_rate": 1.055983557734476e-06, "loss": 0.0527, "step": 323500 }, { "epoch": 4.764657368816365, "grad_norm": 1.380527377128601, "learning_rate": 1.0543471857674912e-06, "loss": 0.0486, "step": 323525 }, { "epoch": 4.765025551906452, "grad_norm": 1.0709291696548462, "learning_rate": 1.0527108138005067e-06, "loss": 0.054, "step": 323550 }, { "epoch": 4.765393734996539, "grad_norm": 1.018418312072754, "learning_rate": 1.051074441833522e-06, "loss": 0.0484, "step": 323575 }, { "epoch": 4.765761918086626, "grad_norm": 1.36138916015625, "learning_rate": 1.0494380698665376e-06, "loss": 0.0524, "step": 323600 }, { "epoch": 4.766130101176714, "grad_norm": 0.963590681552887, "learning_rate": 1.0478016978995529e-06, "loss": 0.0479, "step": 323625 }, { "epoch": 4.766498284266801, "grad_norm": 1.878432035446167, "learning_rate": 1.0461653259325686e-06, "loss": 0.0559, "step": 323650 }, { "epoch": 4.766866467356888, "grad_norm": 0.9522743225097656, "learning_rate": 1.044528953965584e-06, "loss": 0.0472, "step": 323675 }, { "epoch": 4.767234650446975, "grad_norm": 1.4975364208221436, "learning_rate": 1.0428925819985994e-06, "loss": 0.055, "step": 323700 }, { "epoch": 4.767602833537062, "grad_norm": 1.1804375648498535, "learning_rate": 1.0412562100316147e-06, "loss": 0.0511, "step": 323725 }, { "epoch": 4.767971016627149, "grad_norm": 1.4956169128417969, "learning_rate": 1.0396198380646302e-06, "loss": 0.0506, "step": 323750 }, { "epoch": 4.768339199717236, "grad_norm": 1.0784454345703125, "learning_rate": 1.0379834660976458e-06, "loss": 0.0491, "step": 323775 }, { "epoch": 4.768707382807323, "grad_norm": 1.3169667720794678, "learning_rate": 1.036347094130661e-06, "loss": 0.0509, "step": 323800 }, { "epoch": 4.76907556589741, "grad_norm": 1.6832619905471802, "learning_rate": 1.0347107221636766e-06, "loss": 0.0487, "step": 323825 }, { "epoch": 4.769443748987497, "grad_norm": 1.136932134628296, "learning_rate": 1.0330743501966919e-06, "loss": 0.0505, "step": 323850 }, { "epoch": 4.769811932077584, "grad_norm": 1.147109031677246, "learning_rate": 1.0314379782297074e-06, "loss": 0.0543, "step": 323875 }, { "epoch": 4.770180115167671, "grad_norm": 1.5984269380569458, "learning_rate": 1.029801606262723e-06, "loss": 0.0486, "step": 323900 }, { "epoch": 4.770548298257758, "grad_norm": 1.9397368431091309, "learning_rate": 1.0281652342957384e-06, "loss": 0.0519, "step": 323925 }, { "epoch": 4.770916481347845, "grad_norm": 1.4014109373092651, "learning_rate": 1.0265288623287537e-06, "loss": 0.0516, "step": 323950 }, { "epoch": 4.771284664437932, "grad_norm": 0.9060550928115845, "learning_rate": 1.0248924903617692e-06, "loss": 0.0489, "step": 323975 }, { "epoch": 4.771652847528019, "grad_norm": 0.9443454146385193, "learning_rate": 1.0232561183947846e-06, "loss": 0.0494, "step": 324000 }, { "epoch": 4.772021030618106, "grad_norm": 1.1277412176132202, "learning_rate": 1.0216197464278e-06, "loss": 0.0497, "step": 324025 }, { "epoch": 4.772389213708193, "grad_norm": 1.4183262586593628, "learning_rate": 1.0199833744608156e-06, "loss": 0.0546, "step": 324050 }, { "epoch": 4.77275739679828, "grad_norm": 1.7397887706756592, "learning_rate": 1.0183470024938309e-06, "loss": 0.0525, "step": 324075 }, { "epoch": 4.773125579888367, "grad_norm": 1.469543695449829, "learning_rate": 1.0167106305268464e-06, "loss": 0.0575, "step": 324100 }, { "epoch": 4.773493762978454, "grad_norm": 1.4543373584747314, "learning_rate": 1.0150742585598617e-06, "loss": 0.0514, "step": 324125 }, { "epoch": 4.773861946068541, "grad_norm": 1.2241088151931763, "learning_rate": 1.0134378865928774e-06, "loss": 0.0488, "step": 324150 }, { "epoch": 4.774230129158628, "grad_norm": 0.8647933006286621, "learning_rate": 1.0118015146258927e-06, "loss": 0.0518, "step": 324175 }, { "epoch": 4.774598312248715, "grad_norm": 0.7101517915725708, "learning_rate": 1.0101651426589083e-06, "loss": 0.0564, "step": 324200 }, { "epoch": 4.774966495338802, "grad_norm": 1.52452552318573, "learning_rate": 1.0085287706919236e-06, "loss": 0.0585, "step": 324225 }, { "epoch": 4.775334678428889, "grad_norm": 1.1051076650619507, "learning_rate": 1.006892398724939e-06, "loss": 0.0503, "step": 324250 }, { "epoch": 4.775702861518976, "grad_norm": 1.1222940683364868, "learning_rate": 1.0052560267579544e-06, "loss": 0.0514, "step": 324275 }, { "epoch": 4.776071044609063, "grad_norm": 1.3647911548614502, "learning_rate": 1.00361965479097e-06, "loss": 0.0459, "step": 324300 }, { "epoch": 4.77643922769915, "grad_norm": 0.4823213815689087, "learning_rate": 1.0019832828239854e-06, "loss": 0.0477, "step": 324325 }, { "epoch": 4.776807410789237, "grad_norm": 1.3930208683013916, "learning_rate": 1.0003469108570007e-06, "loss": 0.0523, "step": 324350 }, { "epoch": 4.777175593879324, "grad_norm": 1.2660657167434692, "learning_rate": 9.987759937686957e-07, "loss": 0.061, "step": 324375 }, { "epoch": 4.777543776969411, "grad_norm": 1.159363865852356, "learning_rate": 9.971396218017112e-07, "loss": 0.0516, "step": 324400 }, { "epoch": 4.777911960059498, "grad_norm": 1.0487005710601807, "learning_rate": 9.955032498347265e-07, "loss": 0.0479, "step": 324425 }, { "epoch": 4.778280143149585, "grad_norm": 1.370375394821167, "learning_rate": 9.93866877867742e-07, "loss": 0.0504, "step": 324450 }, { "epoch": 4.778648326239672, "grad_norm": 1.228419303894043, "learning_rate": 9.922305059007573e-07, "loss": 0.0498, "step": 324475 }, { "epoch": 4.779016509329759, "grad_norm": 0.745658814907074, "learning_rate": 9.905941339337728e-07, "loss": 0.0433, "step": 324500 }, { "epoch": 4.779384692419846, "grad_norm": 1.2757568359375, "learning_rate": 9.889577619667883e-07, "loss": 0.0562, "step": 324525 }, { "epoch": 4.779752875509933, "grad_norm": 1.2557812929153442, "learning_rate": 9.873213899998036e-07, "loss": 0.0588, "step": 324550 }, { "epoch": 4.78012105860002, "grad_norm": 1.9373170137405396, "learning_rate": 9.856850180328192e-07, "loss": 0.0595, "step": 324575 }, { "epoch": 4.780489241690107, "grad_norm": 1.338927984237671, "learning_rate": 9.840486460658345e-07, "loss": 0.0549, "step": 324600 }, { "epoch": 4.780857424780194, "grad_norm": 1.3716917037963867, "learning_rate": 9.824122740988502e-07, "loss": 0.0458, "step": 324625 }, { "epoch": 4.781225607870281, "grad_norm": 0.8940625190734863, "learning_rate": 9.807759021318655e-07, "loss": 0.0495, "step": 324650 }, { "epoch": 4.781593790960368, "grad_norm": 1.1801742315292358, "learning_rate": 9.79139530164881e-07, "loss": 0.0495, "step": 324675 }, { "epoch": 4.781961974050456, "grad_norm": 1.4311069250106812, "learning_rate": 9.775031581978963e-07, "loss": 0.0513, "step": 324700 }, { "epoch": 4.782330157140543, "grad_norm": 1.0667465925216675, "learning_rate": 9.758667862309118e-07, "loss": 0.0416, "step": 324725 }, { "epoch": 4.78269834023063, "grad_norm": 0.8358407616615295, "learning_rate": 9.742304142639271e-07, "loss": 0.0477, "step": 324750 }, { "epoch": 4.783066523320717, "grad_norm": 1.2532055377960205, "learning_rate": 9.725940422969426e-07, "loss": 0.0568, "step": 324775 }, { "epoch": 4.783434706410804, "grad_norm": 0.9507465958595276, "learning_rate": 9.709576703299582e-07, "loss": 0.0485, "step": 324800 }, { "epoch": 4.783802889500891, "grad_norm": 1.4061230421066284, "learning_rate": 9.693212983629735e-07, "loss": 0.0483, "step": 324825 }, { "epoch": 4.784171072590978, "grad_norm": 1.0594359636306763, "learning_rate": 9.67684926395989e-07, "loss": 0.0517, "step": 324850 }, { "epoch": 4.784539255681065, "grad_norm": 1.1768832206726074, "learning_rate": 9.660485544290045e-07, "loss": 0.0472, "step": 324875 }, { "epoch": 4.784907438771152, "grad_norm": 1.840198040008545, "learning_rate": 9.6441218246202e-07, "loss": 0.0556, "step": 324900 }, { "epoch": 4.785275621861239, "grad_norm": 1.4787946939468384, "learning_rate": 9.627758104950353e-07, "loss": 0.0484, "step": 324925 }, { "epoch": 4.785643804951326, "grad_norm": 1.386216402053833, "learning_rate": 9.611394385280508e-07, "loss": 0.0448, "step": 324950 }, { "epoch": 4.786011988041413, "grad_norm": 1.3710432052612305, "learning_rate": 9.595030665610661e-07, "loss": 0.0531, "step": 324975 }, { "epoch": 4.7863801711315, "grad_norm": 1.7313579320907593, "learning_rate": 9.578666945940817e-07, "loss": 0.0501, "step": 325000 }, { "epoch": 4.786748354221587, "grad_norm": 1.5621999502182007, "learning_rate": 9.56230322627097e-07, "loss": 0.0455, "step": 325025 }, { "epoch": 4.787116537311674, "grad_norm": 0.8633673191070557, "learning_rate": 9.545939506601125e-07, "loss": 0.0492, "step": 325050 }, { "epoch": 4.787484720401761, "grad_norm": 1.5263844728469849, "learning_rate": 9.529575786931279e-07, "loss": 0.0504, "step": 325075 }, { "epoch": 4.7878529034918484, "grad_norm": 1.505133867263794, "learning_rate": 9.513212067261435e-07, "loss": 0.0513, "step": 325100 }, { "epoch": 4.7882210865819355, "grad_norm": 1.5316458940505981, "learning_rate": 9.496848347591589e-07, "loss": 0.0526, "step": 325125 }, { "epoch": 4.7885892696720225, "grad_norm": 0.865847647190094, "learning_rate": 9.480484627921743e-07, "loss": 0.0479, "step": 325150 }, { "epoch": 4.7889574527621095, "grad_norm": 1.2504726648330688, "learning_rate": 9.464120908251897e-07, "loss": 0.0482, "step": 325175 }, { "epoch": 4.7893256358521965, "grad_norm": 1.563200831413269, "learning_rate": 9.447757188582052e-07, "loss": 0.0521, "step": 325200 }, { "epoch": 4.7896938189422835, "grad_norm": 1.4421212673187256, "learning_rate": 9.431393468912206e-07, "loss": 0.056, "step": 325225 }, { "epoch": 4.7900620020323705, "grad_norm": 1.4201912879943848, "learning_rate": 9.415029749242361e-07, "loss": 0.0527, "step": 325250 }, { "epoch": 4.7904301851224576, "grad_norm": 1.103562831878662, "learning_rate": 9.398666029572515e-07, "loss": 0.051, "step": 325275 }, { "epoch": 4.790798368212545, "grad_norm": 0.8093121647834778, "learning_rate": 9.382302309902669e-07, "loss": 0.046, "step": 325300 }, { "epoch": 4.791166551302632, "grad_norm": 1.7056058645248413, "learning_rate": 9.365938590232823e-07, "loss": 0.0486, "step": 325325 }, { "epoch": 4.791534734392719, "grad_norm": 1.1183526515960693, "learning_rate": 9.349574870562979e-07, "loss": 0.0478, "step": 325350 }, { "epoch": 4.791902917482806, "grad_norm": 1.2861144542694092, "learning_rate": 9.333211150893133e-07, "loss": 0.0482, "step": 325375 }, { "epoch": 4.792271100572893, "grad_norm": 1.2314645051956177, "learning_rate": 9.316847431223288e-07, "loss": 0.0504, "step": 325400 }, { "epoch": 4.79263928366298, "grad_norm": 1.6391178369522095, "learning_rate": 9.300483711553442e-07, "loss": 0.0521, "step": 325425 }, { "epoch": 4.793007466753067, "grad_norm": 1.5589710474014282, "learning_rate": 9.284119991883596e-07, "loss": 0.0503, "step": 325450 }, { "epoch": 4.793375649843154, "grad_norm": 1.2693525552749634, "learning_rate": 9.26775627221375e-07, "loss": 0.0533, "step": 325475 }, { "epoch": 4.793743832933241, "grad_norm": 0.7547500729560852, "learning_rate": 9.251392552543904e-07, "loss": 0.0519, "step": 325500 }, { "epoch": 4.794112016023328, "grad_norm": 1.2113326787948608, "learning_rate": 9.235028832874059e-07, "loss": 0.0463, "step": 325525 }, { "epoch": 4.794480199113416, "grad_norm": 1.2797787189483643, "learning_rate": 9.218665113204213e-07, "loss": 0.0561, "step": 325550 }, { "epoch": 4.794848382203503, "grad_norm": 1.2187013626098633, "learning_rate": 9.202301393534367e-07, "loss": 0.049, "step": 325575 }, { "epoch": 4.79521656529359, "grad_norm": 1.6214213371276855, "learning_rate": 9.185937673864522e-07, "loss": 0.0517, "step": 325600 }, { "epoch": 4.795584748383677, "grad_norm": 1.0034376382827759, "learning_rate": 9.169573954194678e-07, "loss": 0.0518, "step": 325625 }, { "epoch": 4.795952931473764, "grad_norm": 1.2485432624816895, "learning_rate": 9.153210234524832e-07, "loss": 0.0531, "step": 325650 }, { "epoch": 4.796321114563851, "grad_norm": 1.3036500215530396, "learning_rate": 9.136846514854986e-07, "loss": 0.0526, "step": 325675 }, { "epoch": 4.796689297653938, "grad_norm": 1.1776338815689087, "learning_rate": 9.12048279518514e-07, "loss": 0.0612, "step": 325700 }, { "epoch": 4.797057480744025, "grad_norm": 1.1754858493804932, "learning_rate": 9.104119075515294e-07, "loss": 0.0493, "step": 325725 }, { "epoch": 4.797425663834112, "grad_norm": 0.8314536809921265, "learning_rate": 9.087755355845448e-07, "loss": 0.0503, "step": 325750 }, { "epoch": 4.797793846924199, "grad_norm": 1.3675329685211182, "learning_rate": 9.071391636175602e-07, "loss": 0.0468, "step": 325775 }, { "epoch": 4.798162030014286, "grad_norm": 1.031081199645996, "learning_rate": 9.055027916505757e-07, "loss": 0.0599, "step": 325800 }, { "epoch": 4.798530213104373, "grad_norm": 0.9318610429763794, "learning_rate": 9.038664196835912e-07, "loss": 0.0526, "step": 325825 }, { "epoch": 4.79889839619446, "grad_norm": 1.7119038105010986, "learning_rate": 9.022300477166067e-07, "loss": 0.0524, "step": 325850 }, { "epoch": 4.799266579284547, "grad_norm": 1.085234522819519, "learning_rate": 9.005936757496221e-07, "loss": 0.0512, "step": 325875 }, { "epoch": 4.799634762374634, "grad_norm": 1.5839431285858154, "learning_rate": 8.989573037826376e-07, "loss": 0.0502, "step": 325900 }, { "epoch": 4.800002945464721, "grad_norm": 0.8413432836532593, "learning_rate": 8.97320931815653e-07, "loss": 0.0509, "step": 325925 }, { "epoch": 4.800371128554808, "grad_norm": 1.1245535612106323, "learning_rate": 8.956845598486684e-07, "loss": 0.0579, "step": 325950 }, { "epoch": 4.800739311644895, "grad_norm": 1.8375391960144043, "learning_rate": 8.940481878816838e-07, "loss": 0.0581, "step": 325975 }, { "epoch": 4.801107494734982, "grad_norm": 1.3693115711212158, "learning_rate": 8.924118159146992e-07, "loss": 0.0499, "step": 326000 }, { "epoch": 4.801475677825069, "grad_norm": 1.5521268844604492, "learning_rate": 8.907754439477146e-07, "loss": 0.0571, "step": 326025 }, { "epoch": 4.801843860915156, "grad_norm": 1.5986840724945068, "learning_rate": 8.891390719807301e-07, "loss": 0.0602, "step": 326050 }, { "epoch": 4.802212044005243, "grad_norm": 0.8774245381355286, "learning_rate": 8.875027000137457e-07, "loss": 0.0482, "step": 326075 }, { "epoch": 4.80258022709533, "grad_norm": 1.327412724494934, "learning_rate": 8.858663280467611e-07, "loss": 0.0568, "step": 326100 }, { "epoch": 4.802948410185417, "grad_norm": 1.7069092988967896, "learning_rate": 8.842299560797765e-07, "loss": 0.0502, "step": 326125 }, { "epoch": 4.803316593275504, "grad_norm": 0.9696958661079407, "learning_rate": 8.825935841127919e-07, "loss": 0.0564, "step": 326150 }, { "epoch": 4.803684776365591, "grad_norm": 0.9806936979293823, "learning_rate": 8.809572121458074e-07, "loss": 0.0529, "step": 326175 }, { "epoch": 4.804052959455678, "grad_norm": 1.2925121784210205, "learning_rate": 8.793208401788228e-07, "loss": 0.0476, "step": 326200 }, { "epoch": 4.804421142545765, "grad_norm": 1.307326078414917, "learning_rate": 8.776844682118382e-07, "loss": 0.0436, "step": 326225 }, { "epoch": 4.804789325635852, "grad_norm": 1.6416270732879639, "learning_rate": 8.760480962448537e-07, "loss": 0.048, "step": 326250 }, { "epoch": 4.805157508725939, "grad_norm": 2.0258803367614746, "learning_rate": 8.744117242778691e-07, "loss": 0.0577, "step": 326275 }, { "epoch": 4.805525691816026, "grad_norm": 1.1298747062683105, "learning_rate": 8.727753523108845e-07, "loss": 0.0524, "step": 326300 }, { "epoch": 4.805893874906113, "grad_norm": 1.1773145198822021, "learning_rate": 8.711389803439001e-07, "loss": 0.0468, "step": 326325 }, { "epoch": 4.8062620579962, "grad_norm": 0.7392027378082275, "learning_rate": 8.695026083769155e-07, "loss": 0.0534, "step": 326350 }, { "epoch": 4.806630241086287, "grad_norm": 1.398547649383545, "learning_rate": 8.678662364099309e-07, "loss": 0.0499, "step": 326375 }, { "epoch": 4.806998424176374, "grad_norm": 1.8869794607162476, "learning_rate": 8.662298644429463e-07, "loss": 0.0477, "step": 326400 }, { "epoch": 4.807366607266461, "grad_norm": 1.2947700023651123, "learning_rate": 8.645934924759617e-07, "loss": 0.0534, "step": 326425 }, { "epoch": 4.807734790356548, "grad_norm": 1.1962100267410278, "learning_rate": 8.629571205089773e-07, "loss": 0.0543, "step": 326450 }, { "epoch": 4.808102973446635, "grad_norm": 1.1489663124084473, "learning_rate": 8.613207485419927e-07, "loss": 0.0518, "step": 326475 }, { "epoch": 4.808471156536722, "grad_norm": 1.3486053943634033, "learning_rate": 8.596843765750081e-07, "loss": 0.0504, "step": 326500 }, { "epoch": 4.808839339626809, "grad_norm": 1.1370948553085327, "learning_rate": 8.580480046080235e-07, "loss": 0.0493, "step": 326525 }, { "epoch": 4.809207522716896, "grad_norm": 1.1146185398101807, "learning_rate": 8.564116326410389e-07, "loss": 0.0541, "step": 326550 }, { "epoch": 4.809575705806983, "grad_norm": 1.0838857889175415, "learning_rate": 8.547752606740545e-07, "loss": 0.0535, "step": 326575 }, { "epoch": 4.80994388889707, "grad_norm": 1.1172491312026978, "learning_rate": 8.531388887070699e-07, "loss": 0.0506, "step": 326600 }, { "epoch": 4.810312071987158, "grad_norm": 1.3487467765808105, "learning_rate": 8.515025167400853e-07, "loss": 0.054, "step": 326625 }, { "epoch": 4.810680255077245, "grad_norm": 1.6122474670410156, "learning_rate": 8.498661447731007e-07, "loss": 0.0452, "step": 326650 }, { "epoch": 4.811048438167332, "grad_norm": 1.456758975982666, "learning_rate": 8.482297728061162e-07, "loss": 0.0491, "step": 326675 }, { "epoch": 4.811416621257419, "grad_norm": 1.0345221757888794, "learning_rate": 8.465934008391316e-07, "loss": 0.051, "step": 326700 }, { "epoch": 4.811784804347506, "grad_norm": 1.674077033996582, "learning_rate": 8.449570288721471e-07, "loss": 0.0493, "step": 326725 }, { "epoch": 4.812152987437593, "grad_norm": 1.4658232927322388, "learning_rate": 8.433206569051625e-07, "loss": 0.0517, "step": 326750 }, { "epoch": 4.81252117052768, "grad_norm": 1.1406266689300537, "learning_rate": 8.416842849381779e-07, "loss": 0.0439, "step": 326775 }, { "epoch": 4.812889353617767, "grad_norm": 1.344596266746521, "learning_rate": 8.400479129711933e-07, "loss": 0.0466, "step": 326800 }, { "epoch": 4.813257536707854, "grad_norm": 0.862582802772522, "learning_rate": 8.384115410042088e-07, "loss": 0.0495, "step": 326825 }, { "epoch": 4.813625719797941, "grad_norm": 1.445375680923462, "learning_rate": 8.367751690372243e-07, "loss": 0.047, "step": 326850 }, { "epoch": 4.813993902888028, "grad_norm": 1.269646406173706, "learning_rate": 8.351387970702398e-07, "loss": 0.0537, "step": 326875 }, { "epoch": 4.814362085978115, "grad_norm": 1.3017737865447998, "learning_rate": 8.335024251032552e-07, "loss": 0.0527, "step": 326900 }, { "epoch": 4.814730269068202, "grad_norm": 1.1769801378250122, "learning_rate": 8.318660531362706e-07, "loss": 0.0553, "step": 326925 }, { "epoch": 4.815098452158289, "grad_norm": 0.9682812690734863, "learning_rate": 8.30229681169286e-07, "loss": 0.0523, "step": 326950 }, { "epoch": 4.815466635248376, "grad_norm": 1.2495931386947632, "learning_rate": 8.285933092023014e-07, "loss": 0.0435, "step": 326975 }, { "epoch": 4.815834818338463, "grad_norm": 1.8156076669692993, "learning_rate": 8.269569372353169e-07, "loss": 0.0533, "step": 327000 }, { "epoch": 4.81620300142855, "grad_norm": 1.5517669916152954, "learning_rate": 8.253205652683323e-07, "loss": 0.0488, "step": 327025 }, { "epoch": 4.816571184518637, "grad_norm": 1.2019058465957642, "learning_rate": 8.236841933013477e-07, "loss": 0.0573, "step": 327050 }, { "epoch": 4.816939367608724, "grad_norm": 1.6249291896820068, "learning_rate": 8.220478213343633e-07, "loss": 0.0574, "step": 327075 }, { "epoch": 4.8173075506988114, "grad_norm": 1.2429358959197998, "learning_rate": 8.204114493673787e-07, "loss": 0.0438, "step": 327100 }, { "epoch": 4.8176757337888985, "grad_norm": 0.8272768259048462, "learning_rate": 8.187750774003942e-07, "loss": 0.05, "step": 327125 }, { "epoch": 4.8180439168789855, "grad_norm": 1.6176915168762207, "learning_rate": 8.171387054334096e-07, "loss": 0.0573, "step": 327150 }, { "epoch": 4.8184120999690725, "grad_norm": 1.1579937934875488, "learning_rate": 8.15502333466425e-07, "loss": 0.0533, "step": 327175 }, { "epoch": 4.8187802830591595, "grad_norm": 1.4144437313079834, "learning_rate": 8.138659614994404e-07, "loss": 0.0474, "step": 327200 }, { "epoch": 4.8191484661492465, "grad_norm": 1.2462953329086304, "learning_rate": 8.122950444111352e-07, "loss": 0.0532, "step": 327225 }, { "epoch": 4.8195166492393335, "grad_norm": 1.2043429613113403, "learning_rate": 8.106586724441507e-07, "loss": 0.0439, "step": 327250 }, { "epoch": 4.8198848323294206, "grad_norm": 1.1414886713027954, "learning_rate": 8.090223004771661e-07, "loss": 0.0516, "step": 327275 }, { "epoch": 4.820253015419508, "grad_norm": 1.1177997589111328, "learning_rate": 8.073859285101816e-07, "loss": 0.0515, "step": 327300 }, { "epoch": 4.820621198509595, "grad_norm": 1.1892731189727783, "learning_rate": 8.057495565431971e-07, "loss": 0.0459, "step": 327325 }, { "epoch": 4.820989381599682, "grad_norm": 1.4891889095306396, "learning_rate": 8.041131845762125e-07, "loss": 0.0532, "step": 327350 }, { "epoch": 4.821357564689769, "grad_norm": 1.2619518041610718, "learning_rate": 8.024768126092279e-07, "loss": 0.0503, "step": 327375 }, { "epoch": 4.821725747779856, "grad_norm": 0.9511722326278687, "learning_rate": 8.008404406422433e-07, "loss": 0.0661, "step": 327400 }, { "epoch": 4.822093930869943, "grad_norm": 1.9508581161499023, "learning_rate": 7.992040686752587e-07, "loss": 0.0427, "step": 327425 }, { "epoch": 4.82246211396003, "grad_norm": 1.086293339729309, "learning_rate": 7.975676967082741e-07, "loss": 0.0541, "step": 327450 }, { "epoch": 4.822830297050118, "grad_norm": 0.9589499831199646, "learning_rate": 7.959313247412897e-07, "loss": 0.0482, "step": 327475 }, { "epoch": 4.823198480140205, "grad_norm": 1.375903844833374, "learning_rate": 7.942949527743051e-07, "loss": 0.0583, "step": 327500 }, { "epoch": 4.823566663230292, "grad_norm": 2.153398036956787, "learning_rate": 7.926585808073205e-07, "loss": 0.0548, "step": 327525 }, { "epoch": 4.823934846320379, "grad_norm": 1.0173044204711914, "learning_rate": 7.91022208840336e-07, "loss": 0.0463, "step": 327550 }, { "epoch": 4.824303029410466, "grad_norm": 1.4276267290115356, "learning_rate": 7.893858368733514e-07, "loss": 0.0548, "step": 327575 }, { "epoch": 4.824671212500553, "grad_norm": 0.8254579901695251, "learning_rate": 7.877494649063669e-07, "loss": 0.0555, "step": 327600 }, { "epoch": 4.82503939559064, "grad_norm": 1.2390910387039185, "learning_rate": 7.861130929393823e-07, "loss": 0.0464, "step": 327625 }, { "epoch": 4.825407578680727, "grad_norm": 1.0116550922393799, "learning_rate": 7.844767209723977e-07, "loss": 0.0515, "step": 327650 }, { "epoch": 4.825775761770814, "grad_norm": 1.6932095289230347, "learning_rate": 7.828403490054132e-07, "loss": 0.0552, "step": 327675 }, { "epoch": 4.826143944860901, "grad_norm": 1.4505504369735718, "learning_rate": 7.812039770384286e-07, "loss": 0.0512, "step": 327700 }, { "epoch": 4.826512127950988, "grad_norm": 0.9411727786064148, "learning_rate": 7.79567605071444e-07, "loss": 0.0448, "step": 327725 }, { "epoch": 4.826880311041075, "grad_norm": 1.2102724313735962, "learning_rate": 7.779312331044595e-07, "loss": 0.0542, "step": 327750 }, { "epoch": 4.827248494131162, "grad_norm": 1.5939611196517944, "learning_rate": 7.76294861137475e-07, "loss": 0.0508, "step": 327775 }, { "epoch": 4.827616677221249, "grad_norm": 1.3736553192138672, "learning_rate": 7.746584891704904e-07, "loss": 0.0487, "step": 327800 }, { "epoch": 4.827984860311336, "grad_norm": 1.20279860496521, "learning_rate": 7.730221172035058e-07, "loss": 0.0488, "step": 327825 }, { "epoch": 4.828353043401423, "grad_norm": 1.5555202960968018, "learning_rate": 7.713857452365212e-07, "loss": 0.0557, "step": 327850 }, { "epoch": 4.82872122649151, "grad_norm": 0.9337530136108398, "learning_rate": 7.697493732695368e-07, "loss": 0.0516, "step": 327875 }, { "epoch": 4.829089409581597, "grad_norm": 1.2430012226104736, "learning_rate": 7.681130013025522e-07, "loss": 0.0527, "step": 327900 }, { "epoch": 4.829457592671684, "grad_norm": 1.108012318611145, "learning_rate": 7.664766293355676e-07, "loss": 0.0474, "step": 327925 }, { "epoch": 4.829825775761771, "grad_norm": 1.0347472429275513, "learning_rate": 7.64840257368583e-07, "loss": 0.0428, "step": 327950 }, { "epoch": 4.830193958851858, "grad_norm": 1.0532573461532593, "learning_rate": 7.632038854015984e-07, "loss": 0.0494, "step": 327975 }, { "epoch": 4.830562141941945, "grad_norm": 1.7293601036071777, "learning_rate": 7.615675134346138e-07, "loss": 0.0533, "step": 328000 }, { "epoch": 4.830930325032032, "grad_norm": 0.7408385872840881, "learning_rate": 7.599311414676294e-07, "loss": 0.0456, "step": 328025 }, { "epoch": 4.831298508122119, "grad_norm": 1.4440840482711792, "learning_rate": 7.582947695006448e-07, "loss": 0.0587, "step": 328050 }, { "epoch": 4.831666691212206, "grad_norm": 1.2703137397766113, "learning_rate": 7.566583975336603e-07, "loss": 0.0549, "step": 328075 }, { "epoch": 4.832034874302293, "grad_norm": 2.1623666286468506, "learning_rate": 7.550220255666757e-07, "loss": 0.0516, "step": 328100 }, { "epoch": 4.83240305739238, "grad_norm": 0.751977801322937, "learning_rate": 7.533856535996911e-07, "loss": 0.0508, "step": 328125 }, { "epoch": 4.832771240482467, "grad_norm": 1.1010229587554932, "learning_rate": 7.517492816327066e-07, "loss": 0.0498, "step": 328150 }, { "epoch": 4.833139423572554, "grad_norm": 1.6952306032180786, "learning_rate": 7.50112909665722e-07, "loss": 0.0524, "step": 328175 }, { "epoch": 4.833507606662641, "grad_norm": 1.6126854419708252, "learning_rate": 7.484765376987374e-07, "loss": 0.0502, "step": 328200 }, { "epoch": 4.833875789752728, "grad_norm": 0.7832873463630676, "learning_rate": 7.468401657317528e-07, "loss": 0.0499, "step": 328225 }, { "epoch": 4.834243972842815, "grad_norm": 1.3976212739944458, "learning_rate": 7.452037937647682e-07, "loss": 0.0504, "step": 328250 }, { "epoch": 4.834612155932902, "grad_norm": 1.4674153327941895, "learning_rate": 7.435674217977839e-07, "loss": 0.0489, "step": 328275 }, { "epoch": 4.834980339022989, "grad_norm": 1.2507990598678589, "learning_rate": 7.419310498307993e-07, "loss": 0.0483, "step": 328300 }, { "epoch": 4.835348522113076, "grad_norm": 1.1491302251815796, "learning_rate": 7.402946778638147e-07, "loss": 0.0513, "step": 328325 }, { "epoch": 4.835716705203163, "grad_norm": 1.1399215459823608, "learning_rate": 7.386583058968301e-07, "loss": 0.046, "step": 328350 }, { "epoch": 4.83608488829325, "grad_norm": 1.4150493144989014, "learning_rate": 7.370219339298455e-07, "loss": 0.0528, "step": 328375 }, { "epoch": 4.836453071383337, "grad_norm": 1.4345759153366089, "learning_rate": 7.353855619628609e-07, "loss": 0.0508, "step": 328400 }, { "epoch": 4.836821254473424, "grad_norm": 1.3032423257827759, "learning_rate": 7.337491899958764e-07, "loss": 0.0504, "step": 328425 }, { "epoch": 4.837189437563511, "grad_norm": 1.1716992855072021, "learning_rate": 7.321128180288918e-07, "loss": 0.0504, "step": 328450 }, { "epoch": 4.837557620653598, "grad_norm": 1.040269136428833, "learning_rate": 7.304764460619072e-07, "loss": 0.0507, "step": 328475 }, { "epoch": 4.837925803743685, "grad_norm": 0.811455488204956, "learning_rate": 7.288400740949227e-07, "loss": 0.0559, "step": 328500 }, { "epoch": 4.838293986833772, "grad_norm": 1.0569169521331787, "learning_rate": 7.272037021279383e-07, "loss": 0.0473, "step": 328525 }, { "epoch": 4.83866216992386, "grad_norm": 1.2798147201538086, "learning_rate": 7.255673301609537e-07, "loss": 0.0507, "step": 328550 }, { "epoch": 4.839030353013947, "grad_norm": 1.2332454919815063, "learning_rate": 7.239309581939691e-07, "loss": 0.0455, "step": 328575 }, { "epoch": 4.839398536104034, "grad_norm": 1.310945987701416, "learning_rate": 7.222945862269845e-07, "loss": 0.0508, "step": 328600 }, { "epoch": 4.839766719194121, "grad_norm": 1.7524762153625488, "learning_rate": 7.206582142599999e-07, "loss": 0.0473, "step": 328625 }, { "epoch": 4.840134902284208, "grad_norm": 1.3432730436325073, "learning_rate": 7.190218422930153e-07, "loss": 0.0432, "step": 328650 }, { "epoch": 4.840503085374295, "grad_norm": 0.9963130354881287, "learning_rate": 7.173854703260307e-07, "loss": 0.054, "step": 328675 }, { "epoch": 4.840871268464382, "grad_norm": 1.740356206893921, "learning_rate": 7.157490983590463e-07, "loss": 0.0572, "step": 328700 }, { "epoch": 4.841239451554469, "grad_norm": 1.0979925394058228, "learning_rate": 7.141127263920617e-07, "loss": 0.0468, "step": 328725 }, { "epoch": 4.841607634644556, "grad_norm": 1.3253620862960815, "learning_rate": 7.124763544250771e-07, "loss": 0.0452, "step": 328750 }, { "epoch": 4.841975817734643, "grad_norm": 1.3796427249908447, "learning_rate": 7.108399824580926e-07, "loss": 0.0496, "step": 328775 }, { "epoch": 4.84234400082473, "grad_norm": 1.4862746000289917, "learning_rate": 7.092036104911081e-07, "loss": 0.0518, "step": 328800 }, { "epoch": 4.842712183914817, "grad_norm": 1.012390375137329, "learning_rate": 7.075672385241235e-07, "loss": 0.0492, "step": 328825 }, { "epoch": 4.843080367004904, "grad_norm": 1.2318142652511597, "learning_rate": 7.059308665571389e-07, "loss": 0.0548, "step": 328850 }, { "epoch": 4.843448550094991, "grad_norm": 1.677403211593628, "learning_rate": 7.042944945901543e-07, "loss": 0.0526, "step": 328875 }, { "epoch": 4.843816733185078, "grad_norm": 1.266858696937561, "learning_rate": 7.026581226231697e-07, "loss": 0.0523, "step": 328900 }, { "epoch": 4.844184916275165, "grad_norm": 1.00752592086792, "learning_rate": 7.010217506561852e-07, "loss": 0.0519, "step": 328925 }, { "epoch": 4.844553099365252, "grad_norm": 1.9096211194992065, "learning_rate": 6.993853786892006e-07, "loss": 0.0502, "step": 328950 }, { "epoch": 4.844921282455339, "grad_norm": 1.1299902200698853, "learning_rate": 6.977490067222161e-07, "loss": 0.0474, "step": 328975 }, { "epoch": 4.845289465545426, "grad_norm": 1.2764970064163208, "learning_rate": 6.961126347552315e-07, "loss": 0.052, "step": 329000 }, { "epoch": 4.845657648635513, "grad_norm": 1.3504260778427124, "learning_rate": 6.94476262788247e-07, "loss": 0.0594, "step": 329025 }, { "epoch": 4.8460258317256, "grad_norm": 1.3586283922195435, "learning_rate": 6.928398908212624e-07, "loss": 0.0571, "step": 329050 }, { "epoch": 4.846394014815687, "grad_norm": 1.3230373859405518, "learning_rate": 6.912035188542779e-07, "loss": 0.0519, "step": 329075 }, { "epoch": 4.8467621979057744, "grad_norm": 1.492194414138794, "learning_rate": 6.895671468872933e-07, "loss": 0.054, "step": 329100 }, { "epoch": 4.8471303809958615, "grad_norm": 1.6579687595367432, "learning_rate": 6.879307749203088e-07, "loss": 0.0491, "step": 329125 }, { "epoch": 4.8474985640859485, "grad_norm": 1.6297032833099365, "learning_rate": 6.862944029533242e-07, "loss": 0.0437, "step": 329150 }, { "epoch": 4.8478667471760355, "grad_norm": 1.1896623373031616, "learning_rate": 6.846580309863396e-07, "loss": 0.0514, "step": 329175 }, { "epoch": 4.8482349302661225, "grad_norm": 1.277224063873291, "learning_rate": 6.83021659019355e-07, "loss": 0.0488, "step": 329200 }, { "epoch": 4.8486031133562095, "grad_norm": 1.1937624216079712, "learning_rate": 6.813852870523704e-07, "loss": 0.0491, "step": 329225 }, { "epoch": 4.8489712964462965, "grad_norm": 1.638359785079956, "learning_rate": 6.79748915085386e-07, "loss": 0.0532, "step": 329250 }, { "epoch": 4.849339479536384, "grad_norm": 1.6064482927322388, "learning_rate": 6.781125431184014e-07, "loss": 0.0498, "step": 329275 }, { "epoch": 4.849707662626471, "grad_norm": 1.092834711074829, "learning_rate": 6.764761711514168e-07, "loss": 0.0491, "step": 329300 }, { "epoch": 4.850075845716558, "grad_norm": 1.2567262649536133, "learning_rate": 6.748397991844323e-07, "loss": 0.0559, "step": 329325 }, { "epoch": 4.850444028806645, "grad_norm": 1.1250033378601074, "learning_rate": 6.732034272174478e-07, "loss": 0.0533, "step": 329350 }, { "epoch": 4.8508122118967325, "grad_norm": 0.8184024691581726, "learning_rate": 6.715670552504632e-07, "loss": 0.0582, "step": 329375 }, { "epoch": 4.8511803949868195, "grad_norm": 1.5957434177398682, "learning_rate": 6.699306832834786e-07, "loss": 0.0499, "step": 329400 }, { "epoch": 4.8515485780769065, "grad_norm": 1.2781341075897217, "learning_rate": 6.68294311316494e-07, "loss": 0.0526, "step": 329425 }, { "epoch": 4.8519167611669936, "grad_norm": 1.4263216257095337, "learning_rate": 6.666579393495094e-07, "loss": 0.0479, "step": 329450 }, { "epoch": 4.852284944257081, "grad_norm": 1.428635835647583, "learning_rate": 6.650215673825248e-07, "loss": 0.0503, "step": 329475 }, { "epoch": 4.852653127347168, "grad_norm": 1.3879873752593994, "learning_rate": 6.633851954155404e-07, "loss": 0.0544, "step": 329500 }, { "epoch": 4.853021310437255, "grad_norm": 1.4613816738128662, "learning_rate": 6.617488234485559e-07, "loss": 0.0458, "step": 329525 }, { "epoch": 4.853389493527342, "grad_norm": 1.7965375185012817, "learning_rate": 6.601124514815713e-07, "loss": 0.046, "step": 329550 }, { "epoch": 4.853757676617429, "grad_norm": 1.6311373710632324, "learning_rate": 6.584760795145867e-07, "loss": 0.0519, "step": 329575 }, { "epoch": 4.854125859707516, "grad_norm": 1.1862528324127197, "learning_rate": 6.568397075476021e-07, "loss": 0.0555, "step": 329600 }, { "epoch": 4.854494042797603, "grad_norm": 1.306323528289795, "learning_rate": 6.552033355806176e-07, "loss": 0.0465, "step": 329625 }, { "epoch": 4.85486222588769, "grad_norm": 0.8707461357116699, "learning_rate": 6.53566963613633e-07, "loss": 0.0559, "step": 329650 }, { "epoch": 4.855230408977777, "grad_norm": 1.1268742084503174, "learning_rate": 6.519305916466484e-07, "loss": 0.0529, "step": 329675 }, { "epoch": 4.855598592067864, "grad_norm": 1.3436145782470703, "learning_rate": 6.502942196796638e-07, "loss": 0.0469, "step": 329700 }, { "epoch": 4.855966775157951, "grad_norm": 1.2283753156661987, "learning_rate": 6.486578477126792e-07, "loss": 0.0508, "step": 329725 }, { "epoch": 4.856334958248038, "grad_norm": 1.127272367477417, "learning_rate": 6.470214757456949e-07, "loss": 0.0514, "step": 329750 }, { "epoch": 4.856703141338125, "grad_norm": 1.8110359907150269, "learning_rate": 6.453851037787103e-07, "loss": 0.0562, "step": 329775 }, { "epoch": 4.857071324428212, "grad_norm": 0.9426853656768799, "learning_rate": 6.437487318117257e-07, "loss": 0.0506, "step": 329800 }, { "epoch": 4.857439507518299, "grad_norm": 1.3399136066436768, "learning_rate": 6.421123598447411e-07, "loss": 0.0503, "step": 329825 }, { "epoch": 4.857807690608386, "grad_norm": 1.0861376523971558, "learning_rate": 6.404759878777565e-07, "loss": 0.0502, "step": 329850 }, { "epoch": 4.858175873698473, "grad_norm": 1.4880216121673584, "learning_rate": 6.388396159107719e-07, "loss": 0.0504, "step": 329875 }, { "epoch": 4.85854405678856, "grad_norm": 1.7946224212646484, "learning_rate": 6.372032439437874e-07, "loss": 0.0488, "step": 329900 }, { "epoch": 4.858912239878647, "grad_norm": 1.44832444190979, "learning_rate": 6.355668719768028e-07, "loss": 0.0479, "step": 329925 }, { "epoch": 4.859280422968734, "grad_norm": 1.042392611503601, "learning_rate": 6.339305000098183e-07, "loss": 0.0517, "step": 329950 }, { "epoch": 4.859648606058821, "grad_norm": 1.482990026473999, "learning_rate": 6.322941280428337e-07, "loss": 0.0559, "step": 329975 }, { "epoch": 4.860016789148908, "grad_norm": 0.9654814004898071, "learning_rate": 6.306577560758492e-07, "loss": 0.0494, "step": 330000 }, { "epoch": 4.860384972238995, "grad_norm": 1.0492042303085327, "learning_rate": 6.290213841088647e-07, "loss": 0.0503, "step": 330025 }, { "epoch": 4.860753155329082, "grad_norm": 1.1498184204101562, "learning_rate": 6.273850121418801e-07, "loss": 0.0482, "step": 330050 }, { "epoch": 4.861121338419169, "grad_norm": 1.389785885810852, "learning_rate": 6.257486401748955e-07, "loss": 0.0514, "step": 330075 }, { "epoch": 4.861489521509256, "grad_norm": 1.2617695331573486, "learning_rate": 6.241122682079109e-07, "loss": 0.0476, "step": 330100 }, { "epoch": 4.861857704599343, "grad_norm": 1.6691839694976807, "learning_rate": 6.224758962409263e-07, "loss": 0.0544, "step": 330125 }, { "epoch": 4.86222588768943, "grad_norm": 1.0076056718826294, "learning_rate": 6.208395242739417e-07, "loss": 0.0482, "step": 330150 }, { "epoch": 4.862594070779517, "grad_norm": 1.3942625522613525, "learning_rate": 6.192031523069573e-07, "loss": 0.0521, "step": 330175 }, { "epoch": 4.862962253869604, "grad_norm": 1.074027419090271, "learning_rate": 6.175667803399727e-07, "loss": 0.051, "step": 330200 }, { "epoch": 4.863330436959691, "grad_norm": 0.8190245628356934, "learning_rate": 6.159304083729881e-07, "loss": 0.0555, "step": 330225 }, { "epoch": 4.863698620049778, "grad_norm": 1.3431546688079834, "learning_rate": 6.142940364060036e-07, "loss": 0.0477, "step": 330250 }, { "epoch": 4.864066803139865, "grad_norm": 1.045159935951233, "learning_rate": 6.12657664439019e-07, "loss": 0.0425, "step": 330275 }, { "epoch": 4.864434986229952, "grad_norm": 1.422806978225708, "learning_rate": 6.110212924720345e-07, "loss": 0.0493, "step": 330300 }, { "epoch": 4.864803169320039, "grad_norm": 1.7697246074676514, "learning_rate": 6.093849205050499e-07, "loss": 0.0448, "step": 330325 }, { "epoch": 4.865171352410126, "grad_norm": 1.3944896459579468, "learning_rate": 6.077485485380653e-07, "loss": 0.0596, "step": 330350 }, { "epoch": 4.865539535500213, "grad_norm": 1.082607626914978, "learning_rate": 6.061121765710808e-07, "loss": 0.0443, "step": 330375 }, { "epoch": 4.8659077185903, "grad_norm": 0.8295331597328186, "learning_rate": 6.044758046040962e-07, "loss": 0.0456, "step": 330400 }, { "epoch": 4.866275901680387, "grad_norm": 1.444100022315979, "learning_rate": 6.028394326371117e-07, "loss": 0.049, "step": 330425 }, { "epoch": 4.866644084770474, "grad_norm": 1.6951584815979004, "learning_rate": 6.012030606701271e-07, "loss": 0.0499, "step": 330450 }, { "epoch": 4.867012267860562, "grad_norm": 1.1521512269973755, "learning_rate": 5.995666887031425e-07, "loss": 0.0469, "step": 330475 }, { "epoch": 4.867380450950649, "grad_norm": 1.6782010793685913, "learning_rate": 5.979303167361579e-07, "loss": 0.0582, "step": 330500 }, { "epoch": 4.867748634040736, "grad_norm": 0.9512406587600708, "learning_rate": 5.962939447691734e-07, "loss": 0.0516, "step": 330525 }, { "epoch": 4.868116817130823, "grad_norm": 1.4517745971679688, "learning_rate": 5.946575728021888e-07, "loss": 0.0555, "step": 330550 }, { "epoch": 4.86848500022091, "grad_norm": 1.5887190103530884, "learning_rate": 5.930212008352044e-07, "loss": 0.0475, "step": 330575 }, { "epoch": 4.868853183310997, "grad_norm": 1.53322172164917, "learning_rate": 5.913848288682198e-07, "loss": 0.0469, "step": 330600 }, { "epoch": 4.869221366401084, "grad_norm": 0.5552016496658325, "learning_rate": 5.897484569012352e-07, "loss": 0.0497, "step": 330625 }, { "epoch": 4.869589549491171, "grad_norm": 1.5306885242462158, "learning_rate": 5.881120849342506e-07, "loss": 0.0468, "step": 330650 }, { "epoch": 4.869957732581258, "grad_norm": 1.3030372858047485, "learning_rate": 5.864757129672661e-07, "loss": 0.0509, "step": 330675 }, { "epoch": 4.870325915671345, "grad_norm": 1.384799838066101, "learning_rate": 5.848393410002815e-07, "loss": 0.0539, "step": 330700 }, { "epoch": 4.870694098761432, "grad_norm": 1.330909013748169, "learning_rate": 5.832029690332969e-07, "loss": 0.0527, "step": 330725 }, { "epoch": 4.871062281851519, "grad_norm": 1.3578200340270996, "learning_rate": 5.815665970663123e-07, "loss": 0.0563, "step": 330750 }, { "epoch": 4.871430464941606, "grad_norm": 1.4362881183624268, "learning_rate": 5.799956799780073e-07, "loss": 0.0525, "step": 330775 }, { "epoch": 4.871798648031693, "grad_norm": 1.0092297792434692, "learning_rate": 5.783593080110227e-07, "loss": 0.0527, "step": 330800 }, { "epoch": 4.87216683112178, "grad_norm": 1.3779137134552002, "learning_rate": 5.767229360440381e-07, "loss": 0.0487, "step": 330825 }, { "epoch": 4.872535014211867, "grad_norm": 0.7738792896270752, "learning_rate": 5.750865640770535e-07, "loss": 0.0567, "step": 330850 }, { "epoch": 4.872903197301954, "grad_norm": 0.8938313722610474, "learning_rate": 5.734501921100689e-07, "loss": 0.052, "step": 330875 }, { "epoch": 4.873271380392041, "grad_norm": 1.2754948139190674, "learning_rate": 5.718138201430844e-07, "loss": 0.0531, "step": 330900 }, { "epoch": 4.873639563482128, "grad_norm": 1.1972107887268066, "learning_rate": 5.701774481760998e-07, "loss": 0.0492, "step": 330925 }, { "epoch": 4.874007746572215, "grad_norm": 1.327114462852478, "learning_rate": 5.685410762091152e-07, "loss": 0.0523, "step": 330950 }, { "epoch": 4.874375929662302, "grad_norm": 1.462823510169983, "learning_rate": 5.669047042421308e-07, "loss": 0.0476, "step": 330975 }, { "epoch": 4.874744112752389, "grad_norm": 0.8686959743499756, "learning_rate": 5.652683322751462e-07, "loss": 0.0515, "step": 331000 }, { "epoch": 4.875112295842476, "grad_norm": 1.6701581478118896, "learning_rate": 5.636319603081617e-07, "loss": 0.0469, "step": 331025 }, { "epoch": 4.875480478932563, "grad_norm": 1.3872337341308594, "learning_rate": 5.619955883411771e-07, "loss": 0.0519, "step": 331050 }, { "epoch": 4.87584866202265, "grad_norm": 1.8140034675598145, "learning_rate": 5.603592163741925e-07, "loss": 0.0559, "step": 331075 }, { "epoch": 4.8762168451127375, "grad_norm": 1.8342561721801758, "learning_rate": 5.587228444072079e-07, "loss": 0.0514, "step": 331100 }, { "epoch": 4.8765850282028245, "grad_norm": 0.9031476378440857, "learning_rate": 5.570864724402233e-07, "loss": 0.0516, "step": 331125 }, { "epoch": 4.8769532112929115, "grad_norm": 1.5302590131759644, "learning_rate": 5.554501004732388e-07, "loss": 0.054, "step": 331150 }, { "epoch": 4.8773213943829985, "grad_norm": 1.2084879875183105, "learning_rate": 5.538137285062543e-07, "loss": 0.0526, "step": 331175 }, { "epoch": 4.8776895774730855, "grad_norm": 1.3794918060302734, "learning_rate": 5.521773565392697e-07, "loss": 0.0485, "step": 331200 }, { "epoch": 4.8780577605631725, "grad_norm": 0.9214732050895691, "learning_rate": 5.505409845722851e-07, "loss": 0.0562, "step": 331225 }, { "epoch": 4.8784259436532595, "grad_norm": 1.7100882530212402, "learning_rate": 5.489046126053006e-07, "loss": 0.0549, "step": 331250 }, { "epoch": 4.878794126743347, "grad_norm": 1.8604553937911987, "learning_rate": 5.47268240638316e-07, "loss": 0.0506, "step": 331275 }, { "epoch": 4.8791623098334345, "grad_norm": 1.4793431758880615, "learning_rate": 5.456318686713315e-07, "loss": 0.048, "step": 331300 }, { "epoch": 4.8795304929235215, "grad_norm": 0.7446379661560059, "learning_rate": 5.439954967043469e-07, "loss": 0.0492, "step": 331325 }, { "epoch": 4.8798986760136085, "grad_norm": 1.1350880861282349, "learning_rate": 5.423591247373623e-07, "loss": 0.0476, "step": 331350 }, { "epoch": 4.8802668591036955, "grad_norm": 1.6795074939727783, "learning_rate": 5.407227527703778e-07, "loss": 0.0544, "step": 331375 }, { "epoch": 4.8806350421937825, "grad_norm": 1.2859065532684326, "learning_rate": 5.390863808033933e-07, "loss": 0.0479, "step": 331400 }, { "epoch": 4.8810032252838695, "grad_norm": 0.7876410484313965, "learning_rate": 5.374500088364087e-07, "loss": 0.0475, "step": 331425 }, { "epoch": 4.8813714083739566, "grad_norm": 0.9864166975021362, "learning_rate": 5.358136368694241e-07, "loss": 0.0469, "step": 331450 }, { "epoch": 4.881739591464044, "grad_norm": 1.576211929321289, "learning_rate": 5.341772649024395e-07, "loss": 0.0533, "step": 331475 }, { "epoch": 4.882107774554131, "grad_norm": 1.6332675218582153, "learning_rate": 5.325408929354549e-07, "loss": 0.0537, "step": 331500 }, { "epoch": 4.882475957644218, "grad_norm": 1.6791424751281738, "learning_rate": 5.309045209684704e-07, "loss": 0.0492, "step": 331525 }, { "epoch": 4.882844140734305, "grad_norm": 1.1094706058502197, "learning_rate": 5.292681490014858e-07, "loss": 0.0513, "step": 331550 }, { "epoch": 4.883212323824392, "grad_norm": 1.319616675376892, "learning_rate": 5.276317770345014e-07, "loss": 0.0511, "step": 331575 }, { "epoch": 4.883580506914479, "grad_norm": 1.0090830326080322, "learning_rate": 5.259954050675168e-07, "loss": 0.0434, "step": 331600 }, { "epoch": 4.883948690004566, "grad_norm": 1.749432921409607, "learning_rate": 5.243590331005322e-07, "loss": 0.0526, "step": 331625 }, { "epoch": 4.884316873094653, "grad_norm": 1.5274707078933716, "learning_rate": 5.227226611335477e-07, "loss": 0.0473, "step": 331650 }, { "epoch": 4.88468505618474, "grad_norm": 1.4532458782196045, "learning_rate": 5.210862891665631e-07, "loss": 0.0444, "step": 331675 }, { "epoch": 4.885053239274827, "grad_norm": 1.5814168453216553, "learning_rate": 5.194499171995785e-07, "loss": 0.0503, "step": 331700 }, { "epoch": 4.885421422364914, "grad_norm": 1.417539358139038, "learning_rate": 5.178135452325939e-07, "loss": 0.0499, "step": 331725 }, { "epoch": 4.885789605455001, "grad_norm": 1.2722365856170654, "learning_rate": 5.161771732656094e-07, "loss": 0.0484, "step": 331750 }, { "epoch": 4.886157788545088, "grad_norm": 1.0175721645355225, "learning_rate": 5.145408012986248e-07, "loss": 0.0517, "step": 331775 }, { "epoch": 4.886525971635175, "grad_norm": 1.4009501934051514, "learning_rate": 5.129044293316403e-07, "loss": 0.0453, "step": 331800 }, { "epoch": 4.886894154725262, "grad_norm": 1.321657657623291, "learning_rate": 5.112680573646557e-07, "loss": 0.0486, "step": 331825 }, { "epoch": 4.887262337815349, "grad_norm": 1.2296380996704102, "learning_rate": 5.096316853976712e-07, "loss": 0.0605, "step": 331850 }, { "epoch": 4.887630520905436, "grad_norm": 1.1335623264312744, "learning_rate": 5.079953134306866e-07, "loss": 0.0493, "step": 331875 }, { "epoch": 4.887998703995523, "grad_norm": 1.815750241279602, "learning_rate": 5.06358941463702e-07, "loss": 0.0556, "step": 331900 }, { "epoch": 4.88836688708561, "grad_norm": 1.2230286598205566, "learning_rate": 5.047225694967175e-07, "loss": 0.0538, "step": 331925 }, { "epoch": 4.888735070175697, "grad_norm": 1.824168086051941, "learning_rate": 5.030861975297329e-07, "loss": 0.052, "step": 331950 }, { "epoch": 4.889103253265784, "grad_norm": 1.2683234214782715, "learning_rate": 5.014498255627483e-07, "loss": 0.0516, "step": 331975 }, { "epoch": 4.889471436355871, "grad_norm": 1.3644198179244995, "learning_rate": 4.998134535957639e-07, "loss": 0.0548, "step": 332000 }, { "epoch": 4.889839619445958, "grad_norm": 1.120449423789978, "learning_rate": 4.981770816287793e-07, "loss": 0.0507, "step": 332025 }, { "epoch": 4.890207802536045, "grad_norm": 0.8146657347679138, "learning_rate": 4.965407096617947e-07, "loss": 0.0475, "step": 332050 }, { "epoch": 4.890575985626132, "grad_norm": 1.4584808349609375, "learning_rate": 4.949043376948101e-07, "loss": 0.0547, "step": 332075 }, { "epoch": 4.890944168716219, "grad_norm": 1.1213934421539307, "learning_rate": 4.932679657278255e-07, "loss": 0.0563, "step": 332100 }, { "epoch": 4.891312351806306, "grad_norm": 1.1377229690551758, "learning_rate": 4.91631593760841e-07, "loss": 0.0496, "step": 332125 }, { "epoch": 4.891680534896393, "grad_norm": 1.2989132404327393, "learning_rate": 4.899952217938564e-07, "loss": 0.0504, "step": 332150 }, { "epoch": 4.89204871798648, "grad_norm": 1.4607120752334595, "learning_rate": 4.883588498268718e-07, "loss": 0.0501, "step": 332175 }, { "epoch": 4.892416901076567, "grad_norm": 1.5263921022415161, "learning_rate": 4.867224778598874e-07, "loss": 0.0556, "step": 332200 }, { "epoch": 4.892785084166654, "grad_norm": 0.8324810266494751, "learning_rate": 4.850861058929028e-07, "loss": 0.0534, "step": 332225 }, { "epoch": 4.893153267256741, "grad_norm": 1.6191877126693726, "learning_rate": 4.834497339259183e-07, "loss": 0.0511, "step": 332250 }, { "epoch": 4.893521450346828, "grad_norm": 1.1996718645095825, "learning_rate": 4.818133619589337e-07, "loss": 0.0494, "step": 332275 }, { "epoch": 4.893889633436915, "grad_norm": 1.200501799583435, "learning_rate": 4.801769899919491e-07, "loss": 0.0503, "step": 332300 }, { "epoch": 4.894257816527002, "grad_norm": 1.8005608320236206, "learning_rate": 4.785406180249645e-07, "loss": 0.0499, "step": 332325 }, { "epoch": 4.894625999617089, "grad_norm": 1.4183095693588257, "learning_rate": 4.769042460579799e-07, "loss": 0.0535, "step": 332350 }, { "epoch": 4.894994182707177, "grad_norm": 1.3116949796676636, "learning_rate": 4.7526787409099544e-07, "loss": 0.0541, "step": 332375 }, { "epoch": 4.895362365797264, "grad_norm": 1.670743465423584, "learning_rate": 4.7363150212401085e-07, "loss": 0.0503, "step": 332400 }, { "epoch": 4.895730548887351, "grad_norm": 1.104217290878296, "learning_rate": 4.719951301570263e-07, "loss": 0.0507, "step": 332425 }, { "epoch": 4.896098731977438, "grad_norm": 1.4548280239105225, "learning_rate": 4.703587581900417e-07, "loss": 0.0482, "step": 332450 }, { "epoch": 4.896466915067525, "grad_norm": 1.2266905307769775, "learning_rate": 4.6872238622305713e-07, "loss": 0.0492, "step": 332475 }, { "epoch": 4.896835098157612, "grad_norm": 1.3408427238464355, "learning_rate": 4.6708601425607265e-07, "loss": 0.0487, "step": 332500 }, { "epoch": 4.897203281247699, "grad_norm": 1.434448003768921, "learning_rate": 4.6544964228908806e-07, "loss": 0.0499, "step": 332525 }, { "epoch": 4.897571464337786, "grad_norm": 1.2371189594268799, "learning_rate": 4.6381327032210347e-07, "loss": 0.0558, "step": 332550 }, { "epoch": 4.897939647427873, "grad_norm": 1.33144211769104, "learning_rate": 4.6217689835511893e-07, "loss": 0.0532, "step": 332575 }, { "epoch": 4.89830783051796, "grad_norm": 1.6626392602920532, "learning_rate": 4.6054052638813434e-07, "loss": 0.0503, "step": 332600 }, { "epoch": 4.898676013608047, "grad_norm": 1.1345593929290771, "learning_rate": 4.5890415442114986e-07, "loss": 0.0535, "step": 332625 }, { "epoch": 4.899044196698134, "grad_norm": 1.310836911201477, "learning_rate": 4.5726778245416527e-07, "loss": 0.0484, "step": 332650 }, { "epoch": 4.899412379788221, "grad_norm": 1.0395610332489014, "learning_rate": 4.556314104871807e-07, "loss": 0.0558, "step": 332675 }, { "epoch": 4.899780562878308, "grad_norm": 0.6484836935997009, "learning_rate": 4.5399503852019614e-07, "loss": 0.0498, "step": 332700 }, { "epoch": 4.900148745968395, "grad_norm": 0.7665823101997375, "learning_rate": 4.5235866655321155e-07, "loss": 0.0463, "step": 332725 }, { "epoch": 4.900516929058482, "grad_norm": 1.2902981042861938, "learning_rate": 4.5072229458622707e-07, "loss": 0.0453, "step": 332750 }, { "epoch": 4.900885112148569, "grad_norm": 1.5675803422927856, "learning_rate": 4.490859226192425e-07, "loss": 0.0534, "step": 332775 }, { "epoch": 4.901253295238656, "grad_norm": 0.6525160670280457, "learning_rate": 4.474495506522579e-07, "loss": 0.0526, "step": 332800 }, { "epoch": 4.901621478328743, "grad_norm": 1.0245834589004517, "learning_rate": 4.458131786852733e-07, "loss": 0.0459, "step": 332825 }, { "epoch": 4.90198966141883, "grad_norm": 1.2377426624298096, "learning_rate": 4.4417680671828876e-07, "loss": 0.0497, "step": 332850 }, { "epoch": 4.902357844508917, "grad_norm": 1.6618950366973877, "learning_rate": 4.425404347513042e-07, "loss": 0.0512, "step": 332875 }, { "epoch": 4.902726027599004, "grad_norm": 1.6873887777328491, "learning_rate": 4.409040627843197e-07, "loss": 0.0563, "step": 332900 }, { "epoch": 4.903094210689091, "grad_norm": 0.8748207092285156, "learning_rate": 4.392676908173351e-07, "loss": 0.0445, "step": 332925 }, { "epoch": 4.903462393779178, "grad_norm": 1.0240164995193481, "learning_rate": 4.376313188503505e-07, "loss": 0.0552, "step": 332950 }, { "epoch": 4.903830576869265, "grad_norm": 1.0954469442367554, "learning_rate": 4.35994946883366e-07, "loss": 0.0533, "step": 332975 }, { "epoch": 4.904198759959352, "grad_norm": 1.4549872875213623, "learning_rate": 4.3435857491638144e-07, "loss": 0.0496, "step": 333000 }, { "epoch": 4.904566943049439, "grad_norm": 1.754491925239563, "learning_rate": 4.327222029493969e-07, "loss": 0.0557, "step": 333025 }, { "epoch": 4.904935126139526, "grad_norm": 0.7370721697807312, "learning_rate": 4.310858309824123e-07, "loss": 0.0524, "step": 333050 }, { "epoch": 4.905303309229613, "grad_norm": 1.241559386253357, "learning_rate": 4.294494590154277e-07, "loss": 0.0485, "step": 333075 }, { "epoch": 4.9056714923197005, "grad_norm": 1.6938914060592651, "learning_rate": 4.2781308704844324e-07, "loss": 0.0483, "step": 333100 }, { "epoch": 4.9060396754097875, "grad_norm": 1.006069540977478, "learning_rate": 4.2617671508145865e-07, "loss": 0.0529, "step": 333125 }, { "epoch": 4.9064078584998745, "grad_norm": 1.5768673419952393, "learning_rate": 4.2454034311447406e-07, "loss": 0.0511, "step": 333150 }, { "epoch": 4.9067760415899615, "grad_norm": 2.3626139163970947, "learning_rate": 4.229039711474895e-07, "loss": 0.0552, "step": 333175 }, { "epoch": 4.9071442246800485, "grad_norm": 0.9993570446968079, "learning_rate": 4.2126759918050493e-07, "loss": 0.0481, "step": 333200 }, { "epoch": 4.907512407770136, "grad_norm": 1.7570234537124634, "learning_rate": 4.196966820921998e-07, "loss": 0.0485, "step": 333225 }, { "epoch": 4.907880590860223, "grad_norm": 1.5358786582946777, "learning_rate": 4.180603101252152e-07, "loss": 0.0515, "step": 333250 }, { "epoch": 4.9082487739503105, "grad_norm": 1.0907758474349976, "learning_rate": 4.1642393815823064e-07, "loss": 0.0482, "step": 333275 }, { "epoch": 4.9086169570403975, "grad_norm": 1.3479907512664795, "learning_rate": 4.147875661912461e-07, "loss": 0.0566, "step": 333300 }, { "epoch": 4.9089851401304845, "grad_norm": 1.0612907409667969, "learning_rate": 4.1315119422426156e-07, "loss": 0.0474, "step": 333325 }, { "epoch": 4.9093533232205715, "grad_norm": 1.3949934244155884, "learning_rate": 4.11514822257277e-07, "loss": 0.0487, "step": 333350 }, { "epoch": 4.9097215063106585, "grad_norm": 1.223125696182251, "learning_rate": 4.0987845029029244e-07, "loss": 0.0491, "step": 333375 }, { "epoch": 4.9100896894007455, "grad_norm": 0.973293662071228, "learning_rate": 4.0824207832330785e-07, "loss": 0.0447, "step": 333400 }, { "epoch": 4.9104578724908325, "grad_norm": 1.2000644207000732, "learning_rate": 4.0660570635632326e-07, "loss": 0.0482, "step": 333425 }, { "epoch": 4.91082605558092, "grad_norm": 1.2607672214508057, "learning_rate": 4.0496933438933877e-07, "loss": 0.0527, "step": 333450 }, { "epoch": 4.911194238671007, "grad_norm": 1.2454999685287476, "learning_rate": 4.033329624223542e-07, "loss": 0.0591, "step": 333475 }, { "epoch": 4.911562421761094, "grad_norm": 1.1193015575408936, "learning_rate": 4.0169659045536965e-07, "loss": 0.0518, "step": 333500 }, { "epoch": 4.911930604851181, "grad_norm": 1.552511215209961, "learning_rate": 4.0006021848838506e-07, "loss": 0.0553, "step": 333525 }, { "epoch": 4.912298787941268, "grad_norm": 1.1871947050094604, "learning_rate": 3.9842384652140047e-07, "loss": 0.049, "step": 333550 }, { "epoch": 4.912666971031355, "grad_norm": 1.3959815502166748, "learning_rate": 3.96787474554416e-07, "loss": 0.0481, "step": 333575 }, { "epoch": 4.913035154121442, "grad_norm": 1.3606975078582764, "learning_rate": 3.951511025874314e-07, "loss": 0.0433, "step": 333600 }, { "epoch": 4.913403337211529, "grad_norm": 1.348492980003357, "learning_rate": 3.9351473062044686e-07, "loss": 0.0496, "step": 333625 }, { "epoch": 4.913771520301616, "grad_norm": 1.370913028717041, "learning_rate": 3.9187835865346227e-07, "loss": 0.0487, "step": 333650 }, { "epoch": 4.914139703391703, "grad_norm": 1.3554139137268066, "learning_rate": 3.902419866864777e-07, "loss": 0.0498, "step": 333675 }, { "epoch": 4.91450788648179, "grad_norm": 1.1865737438201904, "learning_rate": 3.886056147194932e-07, "loss": 0.0476, "step": 333700 }, { "epoch": 4.914876069571877, "grad_norm": 1.9947452545166016, "learning_rate": 3.869692427525086e-07, "loss": 0.0525, "step": 333725 }, { "epoch": 4.915244252661964, "grad_norm": 1.2996271848678589, "learning_rate": 3.85332870785524e-07, "loss": 0.0515, "step": 333750 }, { "epoch": 4.915612435752051, "grad_norm": 1.139541506767273, "learning_rate": 3.836964988185395e-07, "loss": 0.0537, "step": 333775 }, { "epoch": 4.915980618842138, "grad_norm": 1.5225929021835327, "learning_rate": 3.820601268515549e-07, "loss": 0.0495, "step": 333800 }, { "epoch": 4.916348801932225, "grad_norm": 1.0650289058685303, "learning_rate": 3.804237548845704e-07, "loss": 0.0459, "step": 333825 }, { "epoch": 4.916716985022312, "grad_norm": 1.4298961162567139, "learning_rate": 3.787873829175858e-07, "loss": 0.0582, "step": 333850 }, { "epoch": 4.917085168112399, "grad_norm": 1.447590947151184, "learning_rate": 3.771510109506012e-07, "loss": 0.0456, "step": 333875 }, { "epoch": 4.917453351202486, "grad_norm": 1.102942705154419, "learning_rate": 3.755146389836167e-07, "loss": 0.0547, "step": 333900 }, { "epoch": 4.917821534292573, "grad_norm": 0.8419262170791626, "learning_rate": 3.738782670166321e-07, "loss": 0.047, "step": 333925 }, { "epoch": 4.91818971738266, "grad_norm": 1.1380261182785034, "learning_rate": 3.7224189504964756e-07, "loss": 0.0491, "step": 333950 }, { "epoch": 4.918557900472747, "grad_norm": 1.1962227821350098, "learning_rate": 3.70605523082663e-07, "loss": 0.0506, "step": 333975 }, { "epoch": 4.918926083562834, "grad_norm": 1.627918004989624, "learning_rate": 3.6896915111567843e-07, "loss": 0.0496, "step": 334000 }, { "epoch": 4.919294266652921, "grad_norm": 1.403448462486267, "learning_rate": 3.6733277914869384e-07, "loss": 0.0497, "step": 334025 }, { "epoch": 4.919662449743008, "grad_norm": 1.3129823207855225, "learning_rate": 3.656964071817093e-07, "loss": 0.0548, "step": 334050 }, { "epoch": 4.920030632833095, "grad_norm": 0.92596435546875, "learning_rate": 3.6406003521472477e-07, "loss": 0.0456, "step": 334075 }, { "epoch": 4.920398815923182, "grad_norm": 1.1982165575027466, "learning_rate": 3.6242366324774023e-07, "loss": 0.052, "step": 334100 }, { "epoch": 4.920766999013269, "grad_norm": 0.7939584255218506, "learning_rate": 3.6078729128075564e-07, "loss": 0.0536, "step": 334125 }, { "epoch": 4.921135182103356, "grad_norm": 0.966560959815979, "learning_rate": 3.5915091931377106e-07, "loss": 0.0459, "step": 334150 }, { "epoch": 4.921503365193443, "grad_norm": 0.7333667278289795, "learning_rate": 3.5751454734678647e-07, "loss": 0.0515, "step": 334175 }, { "epoch": 4.92187154828353, "grad_norm": 1.469001054763794, "learning_rate": 3.55878175379802e-07, "loss": 0.0534, "step": 334200 }, { "epoch": 4.922239731373617, "grad_norm": 1.4328724145889282, "learning_rate": 3.542418034128174e-07, "loss": 0.0509, "step": 334225 }, { "epoch": 4.922607914463704, "grad_norm": 1.0952802896499634, "learning_rate": 3.5260543144583286e-07, "loss": 0.0514, "step": 334250 }, { "epoch": 4.922976097553791, "grad_norm": 1.4380772113800049, "learning_rate": 3.5096905947884827e-07, "loss": 0.0558, "step": 334275 }, { "epoch": 4.923344280643879, "grad_norm": 1.3975204229354858, "learning_rate": 3.493326875118637e-07, "loss": 0.0456, "step": 334300 }, { "epoch": 4.923712463733966, "grad_norm": 1.3751635551452637, "learning_rate": 3.476963155448792e-07, "loss": 0.0524, "step": 334325 }, { "epoch": 4.924080646824053, "grad_norm": 1.231284737586975, "learning_rate": 3.460599435778946e-07, "loss": 0.0475, "step": 334350 }, { "epoch": 4.92444882991414, "grad_norm": 1.2894213199615479, "learning_rate": 3.4442357161091007e-07, "loss": 0.0491, "step": 334375 }, { "epoch": 4.924817013004227, "grad_norm": 1.3960341215133667, "learning_rate": 3.427871996439255e-07, "loss": 0.0499, "step": 334400 }, { "epoch": 4.925185196094314, "grad_norm": 1.9660850763320923, "learning_rate": 3.411508276769409e-07, "loss": 0.0493, "step": 334425 }, { "epoch": 4.925553379184401, "grad_norm": 1.4844788312911987, "learning_rate": 3.395144557099564e-07, "loss": 0.056, "step": 334450 }, { "epoch": 4.925921562274488, "grad_norm": 1.1590521335601807, "learning_rate": 3.378780837429718e-07, "loss": 0.0529, "step": 334475 }, { "epoch": 4.926289745364575, "grad_norm": 1.0546517372131348, "learning_rate": 3.362417117759872e-07, "loss": 0.049, "step": 334500 }, { "epoch": 4.926657928454662, "grad_norm": 1.026505470275879, "learning_rate": 3.346053398090027e-07, "loss": 0.0501, "step": 334525 }, { "epoch": 4.927026111544749, "grad_norm": 1.2053793668746948, "learning_rate": 3.329689678420181e-07, "loss": 0.0517, "step": 334550 }, { "epoch": 4.927394294634836, "grad_norm": 0.9803434014320374, "learning_rate": 3.313325958750336e-07, "loss": 0.0474, "step": 334575 }, { "epoch": 4.927762477724923, "grad_norm": 1.159724235534668, "learning_rate": 3.29696223908049e-07, "loss": 0.0519, "step": 334600 }, { "epoch": 4.92813066081501, "grad_norm": 0.7486581802368164, "learning_rate": 3.2805985194106443e-07, "loss": 0.0479, "step": 334625 }, { "epoch": 4.928498843905097, "grad_norm": 1.1323797702789307, "learning_rate": 3.264234799740799e-07, "loss": 0.0457, "step": 334650 }, { "epoch": 4.928867026995184, "grad_norm": 1.5626558065414429, "learning_rate": 3.2478710800709536e-07, "loss": 0.0516, "step": 334675 }, { "epoch": 4.929235210085271, "grad_norm": 1.535515308380127, "learning_rate": 3.2315073604011077e-07, "loss": 0.0549, "step": 334700 }, { "epoch": 4.929603393175358, "grad_norm": 0.9101071357727051, "learning_rate": 3.2151436407312623e-07, "loss": 0.0466, "step": 334725 }, { "epoch": 4.929971576265445, "grad_norm": 0.5328569412231445, "learning_rate": 3.1987799210614164e-07, "loss": 0.0464, "step": 334750 }, { "epoch": 4.930339759355532, "grad_norm": 1.1395955085754395, "learning_rate": 3.1824162013915705e-07, "loss": 0.0555, "step": 334775 }, { "epoch": 4.930707942445619, "grad_norm": 1.7324939966201782, "learning_rate": 3.1660524817217257e-07, "loss": 0.0507, "step": 334800 }, { "epoch": 4.931076125535706, "grad_norm": 1.212140679359436, "learning_rate": 3.14968876205188e-07, "loss": 0.0514, "step": 334825 }, { "epoch": 4.931444308625793, "grad_norm": 0.6742513179779053, "learning_rate": 3.1333250423820344e-07, "loss": 0.0499, "step": 334850 }, { "epoch": 4.93181249171588, "grad_norm": 1.4140636920928955, "learning_rate": 3.1169613227121885e-07, "loss": 0.0497, "step": 334875 }, { "epoch": 4.932180674805967, "grad_norm": 1.450247883796692, "learning_rate": 3.100597603042343e-07, "loss": 0.0578, "step": 334900 }, { "epoch": 4.932548857896054, "grad_norm": 1.3651790618896484, "learning_rate": 3.0842338833724973e-07, "loss": 0.0544, "step": 334925 }, { "epoch": 4.932917040986141, "grad_norm": 1.8230345249176025, "learning_rate": 3.067870163702652e-07, "loss": 0.0494, "step": 334950 }, { "epoch": 4.933285224076228, "grad_norm": 1.5822384357452393, "learning_rate": 3.051506444032806e-07, "loss": 0.0465, "step": 334975 }, { "epoch": 4.933653407166315, "grad_norm": 1.2938799858093262, "learning_rate": 3.0351427243629606e-07, "loss": 0.0515, "step": 335000 }, { "epoch": 4.934021590256402, "grad_norm": 1.7103651762008667, "learning_rate": 3.0187790046931153e-07, "loss": 0.0571, "step": 335025 }, { "epoch": 4.934389773346489, "grad_norm": 1.286973237991333, "learning_rate": 3.0024152850232694e-07, "loss": 0.0499, "step": 335050 }, { "epoch": 4.934757956436576, "grad_norm": 1.3599498271942139, "learning_rate": 2.986051565353424e-07, "loss": 0.0511, "step": 335075 }, { "epoch": 4.9351261395266635, "grad_norm": 1.8223161697387695, "learning_rate": 2.969687845683578e-07, "loss": 0.0534, "step": 335100 }, { "epoch": 4.9354943226167505, "grad_norm": 1.3782145977020264, "learning_rate": 2.953324126013733e-07, "loss": 0.0501, "step": 335125 }, { "epoch": 4.935862505706838, "grad_norm": 1.3739384412765503, "learning_rate": 2.9369604063438874e-07, "loss": 0.0565, "step": 335150 }, { "epoch": 4.936230688796925, "grad_norm": 1.2734593152999878, "learning_rate": 2.9205966866740415e-07, "loss": 0.0468, "step": 335175 }, { "epoch": 4.936598871887012, "grad_norm": 1.7262781858444214, "learning_rate": 2.904232967004196e-07, "loss": 0.053, "step": 335200 }, { "epoch": 4.936967054977099, "grad_norm": 1.7785955667495728, "learning_rate": 2.88786924733435e-07, "loss": 0.0555, "step": 335225 }, { "epoch": 4.937335238067186, "grad_norm": 1.5030748844146729, "learning_rate": 2.8715055276645043e-07, "loss": 0.0553, "step": 335250 }, { "epoch": 4.9377034211572735, "grad_norm": 1.154745101928711, "learning_rate": 2.855141807994659e-07, "loss": 0.0511, "step": 335275 }, { "epoch": 4.9380716042473605, "grad_norm": 1.1805689334869385, "learning_rate": 2.8387780883248136e-07, "loss": 0.0536, "step": 335300 }, { "epoch": 4.9384397873374475, "grad_norm": 1.0875493288040161, "learning_rate": 2.822414368654968e-07, "loss": 0.0552, "step": 335325 }, { "epoch": 4.9388079704275345, "grad_norm": 1.270925521850586, "learning_rate": 2.8060506489851223e-07, "loss": 0.0519, "step": 335350 }, { "epoch": 4.9391761535176215, "grad_norm": 1.1785509586334229, "learning_rate": 2.7896869293152764e-07, "loss": 0.0466, "step": 335375 }, { "epoch": 4.9395443366077085, "grad_norm": 1.1776304244995117, "learning_rate": 2.773323209645431e-07, "loss": 0.0521, "step": 335400 }, { "epoch": 4.9399125196977955, "grad_norm": 0.881881833076477, "learning_rate": 2.7569594899755857e-07, "loss": 0.0508, "step": 335425 }, { "epoch": 4.940280702787883, "grad_norm": 1.2219123840332031, "learning_rate": 2.7405957703057403e-07, "loss": 0.0542, "step": 335450 }, { "epoch": 4.94064888587797, "grad_norm": 1.6511127948760986, "learning_rate": 2.7242320506358944e-07, "loss": 0.0519, "step": 335475 }, { "epoch": 4.941017068968057, "grad_norm": 1.2264586687088013, "learning_rate": 2.7078683309660485e-07, "loss": 0.0474, "step": 335500 }, { "epoch": 4.941385252058144, "grad_norm": 0.7492833137512207, "learning_rate": 2.691504611296203e-07, "loss": 0.0479, "step": 335525 }, { "epoch": 4.941753435148231, "grad_norm": 1.6098127365112305, "learning_rate": 2.6751408916263573e-07, "loss": 0.0487, "step": 335550 }, { "epoch": 4.942121618238318, "grad_norm": 1.2260299921035767, "learning_rate": 2.658777171956512e-07, "loss": 0.0469, "step": 335575 }, { "epoch": 4.942489801328405, "grad_norm": 1.1029223203659058, "learning_rate": 2.6424134522866665e-07, "loss": 0.0503, "step": 335600 }, { "epoch": 4.942857984418492, "grad_norm": 2.017568349838257, "learning_rate": 2.6260497326168206e-07, "loss": 0.0488, "step": 335625 }, { "epoch": 4.943226167508579, "grad_norm": 1.5061942338943481, "learning_rate": 2.6096860129469753e-07, "loss": 0.0533, "step": 335650 }, { "epoch": 4.943594350598666, "grad_norm": 1.5082191228866577, "learning_rate": 2.5933222932771294e-07, "loss": 0.0533, "step": 335675 }, { "epoch": 4.943962533688753, "grad_norm": 0.6955373883247375, "learning_rate": 2.576958573607284e-07, "loss": 0.0553, "step": 335700 }, { "epoch": 4.94433071677884, "grad_norm": 1.2744516134262085, "learning_rate": 2.5605948539374386e-07, "loss": 0.0514, "step": 335725 }, { "epoch": 4.944698899868927, "grad_norm": 1.1599795818328857, "learning_rate": 2.5442311342675933e-07, "loss": 0.0557, "step": 335750 }, { "epoch": 4.945067082959014, "grad_norm": 0.999093770980835, "learning_rate": 2.5278674145977474e-07, "loss": 0.0526, "step": 335775 }, { "epoch": 4.945435266049101, "grad_norm": 1.3702365159988403, "learning_rate": 2.5115036949279015e-07, "loss": 0.0489, "step": 335800 }, { "epoch": 4.945803449139188, "grad_norm": 1.4919281005859375, "learning_rate": 2.495139975258056e-07, "loss": 0.0502, "step": 335825 }, { "epoch": 4.946171632229275, "grad_norm": 1.319697380065918, "learning_rate": 2.47877625558821e-07, "loss": 0.0543, "step": 335850 }, { "epoch": 4.946539815319362, "grad_norm": 1.5539913177490234, "learning_rate": 2.462412535918365e-07, "loss": 0.0498, "step": 335875 }, { "epoch": 4.946907998409449, "grad_norm": 1.0406968593597412, "learning_rate": 2.4460488162485195e-07, "loss": 0.0492, "step": 335900 }, { "epoch": 4.947276181499536, "grad_norm": 1.4123789072036743, "learning_rate": 2.4296850965786736e-07, "loss": 0.0534, "step": 335925 }, { "epoch": 4.947644364589623, "grad_norm": 0.5169786810874939, "learning_rate": 2.413321376908828e-07, "loss": 0.0473, "step": 335950 }, { "epoch": 4.94801254767971, "grad_norm": 1.24554443359375, "learning_rate": 2.3969576572389823e-07, "loss": 0.0457, "step": 335975 }, { "epoch": 4.948380730769797, "grad_norm": 1.6909687519073486, "learning_rate": 2.380593937569137e-07, "loss": 0.0562, "step": 336000 }, { "epoch": 4.948748913859884, "grad_norm": 1.3616148233413696, "learning_rate": 2.3642302178992913e-07, "loss": 0.0527, "step": 336025 }, { "epoch": 4.949117096949971, "grad_norm": 1.5458955764770508, "learning_rate": 2.3478664982294457e-07, "loss": 0.0539, "step": 336050 }, { "epoch": 4.949485280040058, "grad_norm": 1.1470768451690674, "learning_rate": 2.3315027785596003e-07, "loss": 0.0577, "step": 336075 }, { "epoch": 4.949853463130145, "grad_norm": 1.7016922235488892, "learning_rate": 2.3151390588897544e-07, "loss": 0.0524, "step": 336100 }, { "epoch": 4.950221646220232, "grad_norm": 1.0179957151412964, "learning_rate": 2.298775339219909e-07, "loss": 0.0509, "step": 336125 }, { "epoch": 4.950589829310319, "grad_norm": 1.4813505411148071, "learning_rate": 2.2824116195500634e-07, "loss": 0.0515, "step": 336150 }, { "epoch": 4.950958012400406, "grad_norm": 1.5040459632873535, "learning_rate": 2.2660478998802175e-07, "loss": 0.0554, "step": 336175 }, { "epoch": 4.951326195490493, "grad_norm": 1.3197249174118042, "learning_rate": 2.2496841802103722e-07, "loss": 0.049, "step": 336200 }, { "epoch": 4.951694378580581, "grad_norm": 1.9273678064346313, "learning_rate": 2.2333204605405265e-07, "loss": 0.0547, "step": 336225 }, { "epoch": 4.952062561670668, "grad_norm": 1.6828017234802246, "learning_rate": 2.2169567408706812e-07, "loss": 0.0513, "step": 336250 }, { "epoch": 4.952430744760755, "grad_norm": 1.3567432165145874, "learning_rate": 2.2005930212008353e-07, "loss": 0.0522, "step": 336275 }, { "epoch": 4.952798927850842, "grad_norm": 0.9874514937400818, "learning_rate": 2.1842293015309896e-07, "loss": 0.0468, "step": 336300 }, { "epoch": 4.953167110940929, "grad_norm": 0.7054497599601746, "learning_rate": 2.1678655818611443e-07, "loss": 0.046, "step": 336325 }, { "epoch": 4.953535294031016, "grad_norm": 1.7746872901916504, "learning_rate": 2.1515018621912986e-07, "loss": 0.0511, "step": 336350 }, { "epoch": 4.953903477121103, "grad_norm": 1.3093620538711548, "learning_rate": 2.135138142521453e-07, "loss": 0.0502, "step": 336375 }, { "epoch": 4.95427166021119, "grad_norm": 1.322843074798584, "learning_rate": 2.1187744228516074e-07, "loss": 0.0518, "step": 336400 }, { "epoch": 4.954639843301277, "grad_norm": 1.187206506729126, "learning_rate": 2.102410703181762e-07, "loss": 0.051, "step": 336425 }, { "epoch": 4.955008026391364, "grad_norm": 1.5089824199676514, "learning_rate": 2.0860469835119164e-07, "loss": 0.0508, "step": 336450 }, { "epoch": 4.955376209481451, "grad_norm": 1.4597768783569336, "learning_rate": 2.0696832638420705e-07, "loss": 0.0468, "step": 336475 }, { "epoch": 4.955744392571538, "grad_norm": 1.157888650894165, "learning_rate": 2.053319544172225e-07, "loss": 0.0472, "step": 336500 }, { "epoch": 4.956112575661625, "grad_norm": 2.018395185470581, "learning_rate": 2.0369558245023795e-07, "loss": 0.0479, "step": 336525 }, { "epoch": 4.956480758751712, "grad_norm": 1.4621374607086182, "learning_rate": 2.020592104832534e-07, "loss": 0.0497, "step": 336550 }, { "epoch": 4.956848941841799, "grad_norm": 1.0740878582000732, "learning_rate": 2.0042283851626882e-07, "loss": 0.0539, "step": 336575 }, { "epoch": 4.957217124931886, "grad_norm": 1.709808111190796, "learning_rate": 1.9878646654928426e-07, "loss": 0.0572, "step": 336600 }, { "epoch": 4.957585308021973, "grad_norm": 0.843580424785614, "learning_rate": 1.9715009458229972e-07, "loss": 0.0492, "step": 336625 }, { "epoch": 4.95795349111206, "grad_norm": 1.3082584142684937, "learning_rate": 1.9551372261531513e-07, "loss": 0.0446, "step": 336650 }, { "epoch": 4.958321674202147, "grad_norm": 1.191066861152649, "learning_rate": 1.938773506483306e-07, "loss": 0.0487, "step": 336675 }, { "epoch": 4.958689857292234, "grad_norm": 1.2488881349563599, "learning_rate": 1.9224097868134603e-07, "loss": 0.0504, "step": 336700 }, { "epoch": 4.959058040382321, "grad_norm": 1.416822075843811, "learning_rate": 1.9060460671436147e-07, "loss": 0.051, "step": 336725 }, { "epoch": 4.959426223472408, "grad_norm": 1.1460826396942139, "learning_rate": 1.8896823474737693e-07, "loss": 0.0532, "step": 336750 }, { "epoch": 4.959794406562495, "grad_norm": 1.1475660800933838, "learning_rate": 1.8733186278039234e-07, "loss": 0.0505, "step": 336775 }, { "epoch": 4.960162589652582, "grad_norm": 2.0693392753601074, "learning_rate": 1.856954908134078e-07, "loss": 0.0543, "step": 336800 }, { "epoch": 4.960530772742669, "grad_norm": 1.0268577337265015, "learning_rate": 1.8405911884642324e-07, "loss": 0.0512, "step": 336825 }, { "epoch": 4.960898955832756, "grad_norm": 0.741677463054657, "learning_rate": 1.8242274687943865e-07, "loss": 0.0458, "step": 336850 }, { "epoch": 4.961267138922843, "grad_norm": 1.309356689453125, "learning_rate": 1.8078637491245411e-07, "loss": 0.0572, "step": 336875 }, { "epoch": 4.96163532201293, "grad_norm": 1.411283254623413, "learning_rate": 1.7915000294546955e-07, "loss": 0.0515, "step": 336900 }, { "epoch": 4.962003505103017, "grad_norm": 1.1230144500732422, "learning_rate": 1.7751363097848501e-07, "loss": 0.0546, "step": 336925 }, { "epoch": 4.962371688193104, "grad_norm": 1.6702622175216675, "learning_rate": 1.7587725901150042e-07, "loss": 0.049, "step": 336950 }, { "epoch": 4.962739871283191, "grad_norm": 1.3394089937210083, "learning_rate": 1.7424088704451586e-07, "loss": 0.0526, "step": 336975 }, { "epoch": 4.963108054373278, "grad_norm": 2.077453136444092, "learning_rate": 1.7260451507753132e-07, "loss": 0.0489, "step": 337000 }, { "epoch": 4.963476237463365, "grad_norm": 1.6623622179031372, "learning_rate": 1.7096814311054676e-07, "loss": 0.0518, "step": 337025 }, { "epoch": 4.963844420553453, "grad_norm": 1.215965986251831, "learning_rate": 1.693317711435622e-07, "loss": 0.0551, "step": 337050 }, { "epoch": 4.96421260364354, "grad_norm": 1.5748342275619507, "learning_rate": 1.6769539917657764e-07, "loss": 0.0516, "step": 337075 }, { "epoch": 4.964580786733627, "grad_norm": 0.9643785953521729, "learning_rate": 1.660590272095931e-07, "loss": 0.0438, "step": 337100 }, { "epoch": 4.964948969823714, "grad_norm": 1.6236388683319092, "learning_rate": 1.6442265524260854e-07, "loss": 0.0505, "step": 337125 }, { "epoch": 4.965317152913801, "grad_norm": 1.2222963571548462, "learning_rate": 1.6278628327562395e-07, "loss": 0.0492, "step": 337150 }, { "epoch": 4.965685336003888, "grad_norm": 1.48945152759552, "learning_rate": 1.611499113086394e-07, "loss": 0.0478, "step": 337175 }, { "epoch": 4.966053519093975, "grad_norm": 1.3975740671157837, "learning_rate": 1.5951353934165485e-07, "loss": 0.0528, "step": 337200 }, { "epoch": 4.966421702184062, "grad_norm": 1.1448314189910889, "learning_rate": 1.5794262225334968e-07, "loss": 0.0586, "step": 337225 }, { "epoch": 4.966789885274149, "grad_norm": 1.710077166557312, "learning_rate": 1.563062502863651e-07, "loss": 0.055, "step": 337250 }, { "epoch": 4.9671580683642365, "grad_norm": 1.4766947031021118, "learning_rate": 1.5466987831938055e-07, "loss": 0.056, "step": 337275 }, { "epoch": 4.9675262514543235, "grad_norm": 1.4366049766540527, "learning_rate": 1.53033506352396e-07, "loss": 0.0531, "step": 337300 }, { "epoch": 4.9678944345444105, "grad_norm": 1.582160234451294, "learning_rate": 1.5139713438541142e-07, "loss": 0.0491, "step": 337325 }, { "epoch": 4.9682626176344975, "grad_norm": 1.4440860748291016, "learning_rate": 1.4976076241842686e-07, "loss": 0.0535, "step": 337350 }, { "epoch": 4.9686308007245845, "grad_norm": 1.2116984128952026, "learning_rate": 1.4812439045144232e-07, "loss": 0.0497, "step": 337375 }, { "epoch": 4.9689989838146715, "grad_norm": 1.1933037042617798, "learning_rate": 1.4655347336313713e-07, "loss": 0.0461, "step": 337400 }, { "epoch": 4.9693671669047585, "grad_norm": 1.3626468181610107, "learning_rate": 1.4491710139615257e-07, "loss": 0.0552, "step": 337425 }, { "epoch": 4.969735349994846, "grad_norm": 1.5436662435531616, "learning_rate": 1.4328072942916803e-07, "loss": 0.0505, "step": 337450 }, { "epoch": 4.970103533084933, "grad_norm": 0.9573881030082703, "learning_rate": 1.4164435746218344e-07, "loss": 0.0558, "step": 337475 }, { "epoch": 4.97047171617502, "grad_norm": 1.3884872198104858, "learning_rate": 1.400079854951989e-07, "loss": 0.0497, "step": 337500 }, { "epoch": 4.970839899265107, "grad_norm": 1.5933350324630737, "learning_rate": 1.3837161352821434e-07, "loss": 0.0534, "step": 337525 }, { "epoch": 4.971208082355194, "grad_norm": 1.4374070167541504, "learning_rate": 1.3673524156122978e-07, "loss": 0.0473, "step": 337550 }, { "epoch": 4.971576265445281, "grad_norm": 1.8629683256149292, "learning_rate": 1.3509886959424521e-07, "loss": 0.0521, "step": 337575 }, { "epoch": 4.971944448535368, "grad_norm": 1.319122552871704, "learning_rate": 1.3346249762726068e-07, "loss": 0.0593, "step": 337600 }, { "epoch": 4.972312631625455, "grad_norm": 1.3031715154647827, "learning_rate": 1.318261256602761e-07, "loss": 0.0471, "step": 337625 }, { "epoch": 4.972680814715542, "grad_norm": 0.5853040814399719, "learning_rate": 1.3018975369329152e-07, "loss": 0.0565, "step": 337650 }, { "epoch": 4.973048997805629, "grad_norm": 1.227638602256775, "learning_rate": 1.28553381726307e-07, "loss": 0.0497, "step": 337675 }, { "epoch": 4.973417180895716, "grad_norm": 1.5295565128326416, "learning_rate": 1.2691700975932242e-07, "loss": 0.0521, "step": 337700 }, { "epoch": 4.973785363985803, "grad_norm": 1.1376793384552002, "learning_rate": 1.2528063779233786e-07, "loss": 0.0529, "step": 337725 }, { "epoch": 4.97415354707589, "grad_norm": 1.0767399072647095, "learning_rate": 1.236442658253533e-07, "loss": 0.0563, "step": 337750 }, { "epoch": 4.974521730165977, "grad_norm": 1.2164992094039917, "learning_rate": 1.2200789385836873e-07, "loss": 0.0512, "step": 337775 }, { "epoch": 4.974889913256064, "grad_norm": 0.9008989334106445, "learning_rate": 1.2037152189138417e-07, "loss": 0.0584, "step": 337800 }, { "epoch": 4.975258096346151, "grad_norm": 1.5048881769180298, "learning_rate": 1.1873514992439962e-07, "loss": 0.0567, "step": 337825 }, { "epoch": 4.975626279436238, "grad_norm": 1.5705159902572632, "learning_rate": 1.1709877795741507e-07, "loss": 0.056, "step": 337850 }, { "epoch": 4.975994462526325, "grad_norm": 1.4366154670715332, "learning_rate": 1.154624059904305e-07, "loss": 0.0507, "step": 337875 }, { "epoch": 4.976362645616412, "grad_norm": 1.8623098134994507, "learning_rate": 1.1382603402344595e-07, "loss": 0.0496, "step": 337900 }, { "epoch": 4.976730828706499, "grad_norm": 0.9317939877510071, "learning_rate": 1.1218966205646138e-07, "loss": 0.0531, "step": 337925 }, { "epoch": 4.977099011796586, "grad_norm": 1.711613416671753, "learning_rate": 1.1055329008947683e-07, "loss": 0.0497, "step": 337950 }, { "epoch": 4.977467194886673, "grad_norm": 1.1050227880477905, "learning_rate": 1.0891691812249227e-07, "loss": 0.0539, "step": 337975 }, { "epoch": 4.97783537797676, "grad_norm": 1.5085375308990479, "learning_rate": 1.0728054615550772e-07, "loss": 0.0529, "step": 338000 }, { "epoch": 4.978203561066847, "grad_norm": 1.560699701309204, "learning_rate": 1.0564417418852314e-07, "loss": 0.0516, "step": 338025 }, { "epoch": 4.978571744156934, "grad_norm": 1.5578539371490479, "learning_rate": 1.0400780222153859e-07, "loss": 0.0525, "step": 338050 }, { "epoch": 4.978939927247021, "grad_norm": 1.4515794515609741, "learning_rate": 1.0237143025455403e-07, "loss": 0.0551, "step": 338075 }, { "epoch": 4.979308110337108, "grad_norm": 1.3008462190628052, "learning_rate": 1.0073505828756948e-07, "loss": 0.0555, "step": 338100 }, { "epoch": 4.979676293427195, "grad_norm": 1.4472436904907227, "learning_rate": 9.909868632058492e-08, "loss": 0.0421, "step": 338125 }, { "epoch": 4.980044476517283, "grad_norm": 1.3187246322631836, "learning_rate": 9.746231435360034e-08, "loss": 0.0555, "step": 338150 }, { "epoch": 4.98041265960737, "grad_norm": 1.2117855548858643, "learning_rate": 9.582594238661579e-08, "loss": 0.0512, "step": 338175 }, { "epoch": 4.980780842697457, "grad_norm": 1.4057400226593018, "learning_rate": 9.418957041963124e-08, "loss": 0.0467, "step": 338200 }, { "epoch": 4.981149025787544, "grad_norm": 1.093895435333252, "learning_rate": 9.255319845264668e-08, "loss": 0.0543, "step": 338225 }, { "epoch": 4.981517208877631, "grad_norm": 1.2608652114868164, "learning_rate": 9.091682648566213e-08, "loss": 0.0444, "step": 338250 }, { "epoch": 4.981885391967718, "grad_norm": 1.4286466836929321, "learning_rate": 8.928045451867756e-08, "loss": 0.0521, "step": 338275 }, { "epoch": 4.982253575057805, "grad_norm": 1.9670741558074951, "learning_rate": 8.764408255169299e-08, "loss": 0.0495, "step": 338300 }, { "epoch": 4.982621758147892, "grad_norm": 1.6306285858154297, "learning_rate": 8.600771058470844e-08, "loss": 0.0485, "step": 338325 }, { "epoch": 4.982989941237979, "grad_norm": 1.510868787765503, "learning_rate": 8.437133861772387e-08, "loss": 0.0544, "step": 338350 }, { "epoch": 4.983358124328066, "grad_norm": 1.2294979095458984, "learning_rate": 8.273496665073932e-08, "loss": 0.0528, "step": 338375 }, { "epoch": 4.983726307418153, "grad_norm": 1.2693171501159668, "learning_rate": 8.109859468375477e-08, "loss": 0.0453, "step": 338400 }, { "epoch": 4.98409449050824, "grad_norm": 1.668380618095398, "learning_rate": 7.94622227167702e-08, "loss": 0.0487, "step": 338425 }, { "epoch": 4.984462673598327, "grad_norm": 1.1777913570404053, "learning_rate": 7.782585074978563e-08, "loss": 0.0512, "step": 338450 }, { "epoch": 4.984830856688414, "grad_norm": 1.0354845523834229, "learning_rate": 7.618947878280108e-08, "loss": 0.0495, "step": 338475 }, { "epoch": 4.985199039778501, "grad_norm": 1.717870831489563, "learning_rate": 7.455310681581652e-08, "loss": 0.0563, "step": 338500 }, { "epoch": 4.985567222868588, "grad_norm": 1.194688320159912, "learning_rate": 7.291673484883196e-08, "loss": 0.0522, "step": 338525 }, { "epoch": 4.985935405958675, "grad_norm": 1.4643720388412476, "learning_rate": 7.128036288184741e-08, "loss": 0.0562, "step": 338550 }, { "epoch": 4.986303589048762, "grad_norm": 1.3760372400283813, "learning_rate": 6.964399091486284e-08, "loss": 0.0546, "step": 338575 }, { "epoch": 4.986671772138849, "grad_norm": 1.822245478630066, "learning_rate": 6.800761894787828e-08, "loss": 0.0555, "step": 338600 }, { "epoch": 4.987039955228936, "grad_norm": 1.3164429664611816, "learning_rate": 6.637124698089373e-08, "loss": 0.0549, "step": 338625 }, { "epoch": 4.987408138319023, "grad_norm": 1.501807689666748, "learning_rate": 6.473487501390917e-08, "loss": 0.0497, "step": 338650 }, { "epoch": 4.98777632140911, "grad_norm": 1.3571958541870117, "learning_rate": 6.30985030469246e-08, "loss": 0.0608, "step": 338675 }, { "epoch": 4.988144504499197, "grad_norm": 0.8018175363540649, "learning_rate": 6.146213107994005e-08, "loss": 0.054, "step": 338700 }, { "epoch": 4.988512687589284, "grad_norm": 1.8292900323867798, "learning_rate": 5.982575911295549e-08, "loss": 0.0517, "step": 338725 }, { "epoch": 4.988880870679371, "grad_norm": 0.9227701425552368, "learning_rate": 5.818938714597093e-08, "loss": 0.0533, "step": 338750 }, { "epoch": 4.989249053769458, "grad_norm": 1.1791300773620605, "learning_rate": 5.655301517898637e-08, "loss": 0.0547, "step": 338775 }, { "epoch": 4.989617236859545, "grad_norm": 1.451053500175476, "learning_rate": 5.491664321200181e-08, "loss": 0.0537, "step": 338800 }, { "epoch": 4.989985419949632, "grad_norm": 0.9784480929374695, "learning_rate": 5.328027124501725e-08, "loss": 0.0505, "step": 338825 }, { "epoch": 4.990353603039719, "grad_norm": 1.47343111038208, "learning_rate": 5.1643899278032695e-08, "loss": 0.0504, "step": 338850 }, { "epoch": 4.990721786129806, "grad_norm": 1.6505411863327026, "learning_rate": 5.000752731104813e-08, "loss": 0.0546, "step": 338875 }, { "epoch": 4.991089969219893, "grad_norm": 1.5374494791030884, "learning_rate": 4.8371155344063575e-08, "loss": 0.0503, "step": 338900 }, { "epoch": 4.99145815230998, "grad_norm": 1.2402594089508057, "learning_rate": 4.673478337707901e-08, "loss": 0.0517, "step": 338925 }, { "epoch": 4.991826335400067, "grad_norm": 1.457255482673645, "learning_rate": 4.5098411410094456e-08, "loss": 0.0515, "step": 338950 }, { "epoch": 4.992194518490155, "grad_norm": 0.909200131893158, "learning_rate": 4.34620394431099e-08, "loss": 0.0453, "step": 338975 }, { "epoch": 4.992562701580242, "grad_norm": 1.0346252918243408, "learning_rate": 4.1825667476125336e-08, "loss": 0.0477, "step": 339000 }, { "epoch": 4.992930884670329, "grad_norm": 1.4849971532821655, "learning_rate": 4.018929550914078e-08, "loss": 0.0562, "step": 339025 }, { "epoch": 4.993299067760416, "grad_norm": 1.4646254777908325, "learning_rate": 3.8552923542156216e-08, "loss": 0.0545, "step": 339050 }, { "epoch": 4.993667250850503, "grad_norm": 1.4139177799224854, "learning_rate": 3.691655157517166e-08, "loss": 0.0464, "step": 339075 }, { "epoch": 4.99403543394059, "grad_norm": 1.2818245887756348, "learning_rate": 3.52801796081871e-08, "loss": 0.0506, "step": 339100 }, { "epoch": 4.994403617030677, "grad_norm": 1.8238946199417114, "learning_rate": 3.364380764120254e-08, "loss": 0.0491, "step": 339125 }, { "epoch": 4.994771800120764, "grad_norm": 1.6911935806274414, "learning_rate": 3.200743567421798e-08, "loss": 0.0551, "step": 339150 }, { "epoch": 4.995139983210851, "grad_norm": 0.8746576905250549, "learning_rate": 3.037106370723342e-08, "loss": 0.0541, "step": 339175 }, { "epoch": 4.995508166300938, "grad_norm": 1.5822333097457886, "learning_rate": 2.8734691740248863e-08, "loss": 0.0532, "step": 339200 }, { "epoch": 4.995876349391025, "grad_norm": 1.1354644298553467, "learning_rate": 2.7098319773264303e-08, "loss": 0.0522, "step": 339225 }, { "epoch": 4.996244532481112, "grad_norm": 1.8758031129837036, "learning_rate": 2.5461947806279743e-08, "loss": 0.0489, "step": 339250 }, { "epoch": 4.9966127155711995, "grad_norm": 1.827846884727478, "learning_rate": 2.3825575839295183e-08, "loss": 0.0448, "step": 339275 }, { "epoch": 4.9969808986612865, "grad_norm": 1.3098126649856567, "learning_rate": 2.2189203872310627e-08, "loss": 0.0488, "step": 339300 }, { "epoch": 4.9973490817513735, "grad_norm": 1.4571337699890137, "learning_rate": 2.0552831905326067e-08, "loss": 0.0518, "step": 339325 }, { "epoch": 4.9977172648414605, "grad_norm": 0.8155379891395569, "learning_rate": 1.8916459938341504e-08, "loss": 0.0532, "step": 339350 }, { "epoch": 4.9980854479315475, "grad_norm": 1.5733963251113892, "learning_rate": 1.7280087971356947e-08, "loss": 0.055, "step": 339375 }, { "epoch": 4.9984536310216345, "grad_norm": 1.219093918800354, "learning_rate": 1.5643716004372387e-08, "loss": 0.0562, "step": 339400 }, { "epoch": 4.9988218141117216, "grad_norm": 1.0231002569198608, "learning_rate": 1.4007344037387827e-08, "loss": 0.059, "step": 339425 }, { "epoch": 4.999189997201809, "grad_norm": 1.2934322357177734, "learning_rate": 1.2370972070403267e-08, "loss": 0.0521, "step": 339450 }, { "epoch": 4.999558180291896, "grad_norm": 0.9300215840339661, "learning_rate": 1.0734600103418709e-08, "loss": 0.0537, "step": 339475 }, { "epoch": 4.999926363381983, "grad_norm": 1.3510725498199463, "learning_rate": 9.09822813643415e-09, "loss": 0.0472, "step": 339500 }, { "epoch": 5.0, "eval_loss": 0.056979671120643616, "eval_runtime": 116.4543, "eval_samples_per_second": 3046.242, "eval_steps_per_second": 5.951, "step": 339505 } ], "logging_steps": 25, "max_steps": 339505, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }