{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999322951929587, "eval_steps": 500, "global_step": 738, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013540961408259986, "grad_norm": 2.4126635555318527, "learning_rate": 2.5000000000000004e-07, "loss": 3.1189, "step": 1 }, { "epoch": 0.002708192281651997, "grad_norm": 2.8141112841232654, "learning_rate": 5.000000000000001e-07, "loss": 2.6861, "step": 2 }, { "epoch": 0.004062288422477996, "grad_norm": 2.6738862039513274, "learning_rate": 7.5e-07, "loss": 3.3382, "step": 3 }, { "epoch": 0.005416384563303994, "grad_norm": 2.30287728842049, "learning_rate": 1.0000000000000002e-06, "loss": 2.4565, "step": 4 }, { "epoch": 0.006770480704129994, "grad_norm": 42.907191662846444, "learning_rate": 1.25e-06, "loss": 2.5327, "step": 5 }, { "epoch": 0.008124576844955992, "grad_norm": 2.9404364361574724, "learning_rate": 1.5e-06, "loss": 2.7222, "step": 6 }, { "epoch": 0.009478672985781991, "grad_norm": 1.9739382530413463, "learning_rate": 1.75e-06, "loss": 2.1649, "step": 7 }, { "epoch": 0.010832769126607989, "grad_norm": 3.463912464916458, "learning_rate": 2.0000000000000003e-06, "loss": 2.9378, "step": 8 }, { "epoch": 0.012186865267433988, "grad_norm": 2.207345459970691, "learning_rate": 2.25e-06, "loss": 3.6521, "step": 9 }, { "epoch": 0.013540961408259987, "grad_norm": 2.752714935099265, "learning_rate": 2.5e-06, "loss": 2.5672, "step": 10 }, { "epoch": 0.014895057549085985, "grad_norm": 2.7722408021236187, "learning_rate": 2.7500000000000004e-06, "loss": 3.0411, "step": 11 }, { "epoch": 0.016249153689911984, "grad_norm": 2.9367100387488185, "learning_rate": 3e-06, "loss": 2.8985, "step": 12 }, { "epoch": 0.017603249830737983, "grad_norm": 2.7601030286888797, "learning_rate": 3.2500000000000002e-06, "loss": 2.8232, "step": 13 }, { "epoch": 0.018957345971563982, "grad_norm": 3.184703964873984, "learning_rate": 3.5e-06, "loss": 2.9731, "step": 14 }, { "epoch": 0.020311442112389978, "grad_norm": 2.2952697179626105, "learning_rate": 3.7500000000000005e-06, "loss": 2.823, "step": 15 }, { "epoch": 0.021665538253215978, "grad_norm": 2.4601547818230913, "learning_rate": 4.000000000000001e-06, "loss": 2.7683, "step": 16 }, { "epoch": 0.023019634394041977, "grad_norm": 2.0169369154885217, "learning_rate": 4.25e-06, "loss": 2.566, "step": 17 }, { "epoch": 0.024373730534867976, "grad_norm": 1.8874822039928005, "learning_rate": 4.5e-06, "loss": 2.4032, "step": 18 }, { "epoch": 0.025727826675693975, "grad_norm": 1.4345346790759714, "learning_rate": 4.75e-06, "loss": 2.8099, "step": 19 }, { "epoch": 0.027081922816519974, "grad_norm": 2.5317082125734025, "learning_rate": 5e-06, "loss": 2.4169, "step": 20 }, { "epoch": 0.02843601895734597, "grad_norm": 1.4417852513317821, "learning_rate": 5.2500000000000006e-06, "loss": 3.0868, "step": 21 }, { "epoch": 0.02979011509817197, "grad_norm": 0.9044314800944161, "learning_rate": 5.500000000000001e-06, "loss": 2.7374, "step": 22 }, { "epoch": 0.03114421123899797, "grad_norm": 1.4184446315710213, "learning_rate": 5.75e-06, "loss": 2.5317, "step": 23 }, { "epoch": 0.03249830737982397, "grad_norm": 1.6900469714019433, "learning_rate": 6e-06, "loss": 2.4404, "step": 24 }, { "epoch": 0.033852403520649964, "grad_norm": 1.0886842936641719, "learning_rate": 6.25e-06, "loss": 2.8003, "step": 25 }, { "epoch": 0.035206499661475966, "grad_norm": 1.463135149887311, "learning_rate": 6.5000000000000004e-06, "loss": 1.8819, "step": 26 }, { "epoch": 0.03656059580230196, "grad_norm": 1.1222026842497739, "learning_rate": 6.750000000000001e-06, "loss": 2.2671, "step": 27 }, { "epoch": 0.037914691943127965, "grad_norm": 1.0087919882501093, "learning_rate": 7e-06, "loss": 2.4491, "step": 28 }, { "epoch": 0.03926878808395396, "grad_norm": 1.2761649114763567, "learning_rate": 7.25e-06, "loss": 2.7121, "step": 29 }, { "epoch": 0.040622884224779957, "grad_norm": 1.0185872249128933, "learning_rate": 7.500000000000001e-06, "loss": 2.6497, "step": 30 }, { "epoch": 0.04197698036560596, "grad_norm": 1.2789270965641044, "learning_rate": 7.75e-06, "loss": 2.668, "step": 31 }, { "epoch": 0.043331076506431955, "grad_norm": 1.1115471485822677, "learning_rate": 8.000000000000001e-06, "loss": 2.6767, "step": 32 }, { "epoch": 0.04468517264725796, "grad_norm": 1.0627476896958792, "learning_rate": 8.25e-06, "loss": 2.6262, "step": 33 }, { "epoch": 0.046039268788083954, "grad_norm": 1.1212584420019067, "learning_rate": 8.5e-06, "loss": 2.4854, "step": 34 }, { "epoch": 0.04739336492890995, "grad_norm": 0.714156466912055, "learning_rate": 8.750000000000001e-06, "loss": 2.3662, "step": 35 }, { "epoch": 0.04874746106973595, "grad_norm": 1.4520911693345544, "learning_rate": 9e-06, "loss": 2.7126, "step": 36 }, { "epoch": 0.05010155721056195, "grad_norm": 1.493660601298817, "learning_rate": 9.250000000000001e-06, "loss": 3.0131, "step": 37 }, { "epoch": 0.05145565335138795, "grad_norm": 1.2721754658047073, "learning_rate": 9.5e-06, "loss": 3.0784, "step": 38 }, { "epoch": 0.052809749492213946, "grad_norm": 0.8803541178840371, "learning_rate": 9.75e-06, "loss": 2.1007, "step": 39 }, { "epoch": 0.05416384563303995, "grad_norm": 2.0655028083700504, "learning_rate": 1e-05, "loss": 2.5182, "step": 40 }, { "epoch": 0.055517941773865945, "grad_norm": 1.0487372085128044, "learning_rate": 9.999997090241333e-06, "loss": 2.2771, "step": 41 }, { "epoch": 0.05687203791469194, "grad_norm": 0.8160509238097648, "learning_rate": 9.999988360968714e-06, "loss": 2.2042, "step": 42 }, { "epoch": 0.05822613405551794, "grad_norm": 0.8864211341294704, "learning_rate": 9.999973812192306e-06, "loss": 2.7951, "step": 43 }, { "epoch": 0.05958023019634394, "grad_norm": 0.7681880276765591, "learning_rate": 9.99995344392904e-06, "loss": 2.859, "step": 44 }, { "epoch": 0.06093432633716994, "grad_norm": 1.7239414196388092, "learning_rate": 9.999927256202626e-06, "loss": 2.7866, "step": 45 }, { "epoch": 0.06228842247799594, "grad_norm": 0.7879562468694528, "learning_rate": 9.999895249043542e-06, "loss": 2.1671, "step": 46 }, { "epoch": 0.06364251861882193, "grad_norm": 0.8376315627603867, "learning_rate": 9.99985742248904e-06, "loss": 2.6406, "step": 47 }, { "epoch": 0.06499661475964794, "grad_norm": 0.8839739394970294, "learning_rate": 9.999813776583148e-06, "loss": 2.3163, "step": 48 }, { "epoch": 0.06635071090047394, "grad_norm": 0.7373151185194805, "learning_rate": 9.999764311376664e-06, "loss": 2.5735, "step": 49 }, { "epoch": 0.06770480704129993, "grad_norm": 1.1273015474311354, "learning_rate": 9.999709026927162e-06, "loss": 2.2133, "step": 50 }, { "epoch": 0.06905890318212593, "grad_norm": 0.8478718145728893, "learning_rate": 9.99964792329899e-06, "loss": 2.4075, "step": 51 }, { "epoch": 0.07041299932295193, "grad_norm": 1.0667576833889847, "learning_rate": 9.999581000563265e-06, "loss": 2.6946, "step": 52 }, { "epoch": 0.07176709546377792, "grad_norm": 1.0620013368810508, "learning_rate": 9.999508258797876e-06, "loss": 2.6205, "step": 53 }, { "epoch": 0.07312119160460392, "grad_norm": 0.8072331151341536, "learning_rate": 9.999429698087491e-06, "loss": 2.5132, "step": 54 }, { "epoch": 0.07447528774542993, "grad_norm": 0.6900996192892075, "learning_rate": 9.999345318523544e-06, "loss": 2.5071, "step": 55 }, { "epoch": 0.07582938388625593, "grad_norm": 0.7145430930616551, "learning_rate": 9.999255120204248e-06, "loss": 2.6579, "step": 56 }, { "epoch": 0.07718348002708192, "grad_norm": 0.9813015245466205, "learning_rate": 9.999159103234582e-06, "loss": 2.1667, "step": 57 }, { "epoch": 0.07853757616790792, "grad_norm": 0.6801613421398072, "learning_rate": 9.999057267726304e-06, "loss": 2.2016, "step": 58 }, { "epoch": 0.07989167230873392, "grad_norm": 1.340722366857769, "learning_rate": 9.998949613797937e-06, "loss": 2.5416, "step": 59 }, { "epoch": 0.08124576844955991, "grad_norm": 0.8044848626001605, "learning_rate": 9.998836141574781e-06, "loss": 2.6258, "step": 60 }, { "epoch": 0.08259986459038592, "grad_norm": 0.9118107600823588, "learning_rate": 9.99871685118891e-06, "loss": 2.5182, "step": 61 }, { "epoch": 0.08395396073121192, "grad_norm": 0.8103939602292338, "learning_rate": 9.99859174277916e-06, "loss": 2.4597, "step": 62 }, { "epoch": 0.08530805687203792, "grad_norm": 0.7953248414753887, "learning_rate": 9.99846081649115e-06, "loss": 2.6165, "step": 63 }, { "epoch": 0.08666215301286391, "grad_norm": 0.6885480399283996, "learning_rate": 9.998324072477266e-06, "loss": 2.3687, "step": 64 }, { "epoch": 0.08801624915368991, "grad_norm": 0.968337926868628, "learning_rate": 9.99818151089666e-06, "loss": 2.4591, "step": 65 }, { "epoch": 0.08937034529451592, "grad_norm": 0.7154001784843524, "learning_rate": 9.998033131915266e-06, "loss": 2.2175, "step": 66 }, { "epoch": 0.0907244414353419, "grad_norm": 1.18262426992356, "learning_rate": 9.997878935705778e-06, "loss": 3.1191, "step": 67 }, { "epoch": 0.09207853757616791, "grad_norm": 0.7400521396635642, "learning_rate": 9.997718922447669e-06, "loss": 2.5125, "step": 68 }, { "epoch": 0.09343263371699391, "grad_norm": 0.8148295313823943, "learning_rate": 9.997553092327174e-06, "loss": 2.7714, "step": 69 }, { "epoch": 0.0947867298578199, "grad_norm": 0.7378483977717124, "learning_rate": 9.997381445537309e-06, "loss": 2.3631, "step": 70 }, { "epoch": 0.0961408259986459, "grad_norm": 0.6266190246235469, "learning_rate": 9.997203982277852e-06, "loss": 2.4936, "step": 71 }, { "epoch": 0.0974949221394719, "grad_norm": 1.50407707220557, "learning_rate": 9.997020702755353e-06, "loss": 2.7048, "step": 72 }, { "epoch": 0.0988490182802979, "grad_norm": 0.8300611098600416, "learning_rate": 9.996831607183132e-06, "loss": 2.7146, "step": 73 }, { "epoch": 0.1002031144211239, "grad_norm": 0.7552213758897641, "learning_rate": 9.996636695781276e-06, "loss": 2.5399, "step": 74 }, { "epoch": 0.1015572105619499, "grad_norm": 0.705291949087134, "learning_rate": 9.996435968776646e-06, "loss": 2.6843, "step": 75 }, { "epoch": 0.1029113067027759, "grad_norm": 0.7673837145200612, "learning_rate": 9.996229426402867e-06, "loss": 2.7562, "step": 76 }, { "epoch": 0.10426540284360189, "grad_norm": 0.7439456610505197, "learning_rate": 9.996017068900335e-06, "loss": 2.0488, "step": 77 }, { "epoch": 0.10561949898442789, "grad_norm": 0.8945277132428896, "learning_rate": 9.995798896516215e-06, "loss": 2.48, "step": 78 }, { "epoch": 0.1069735951252539, "grad_norm": 0.8901395084271732, "learning_rate": 9.995574909504434e-06, "loss": 2.6099, "step": 79 }, { "epoch": 0.1083276912660799, "grad_norm": 0.824778949464918, "learning_rate": 9.995345108125698e-06, "loss": 2.69, "step": 80 }, { "epoch": 0.10968178740690589, "grad_norm": 0.8965013093653279, "learning_rate": 9.995109492647467e-06, "loss": 1.8424, "step": 81 }, { "epoch": 0.11103588354773189, "grad_norm": 1.0115160015541755, "learning_rate": 9.99486806334398e-06, "loss": 2.3693, "step": 82 }, { "epoch": 0.11238997968855789, "grad_norm": 0.8624086155070334, "learning_rate": 9.994620820496234e-06, "loss": 2.6173, "step": 83 }, { "epoch": 0.11374407582938388, "grad_norm": 0.7454258690069137, "learning_rate": 9.994367764391998e-06, "loss": 2.6624, "step": 84 }, { "epoch": 0.11509817197020988, "grad_norm": 1.0680963212316428, "learning_rate": 9.994108895325802e-06, "loss": 2.478, "step": 85 }, { "epoch": 0.11645226811103589, "grad_norm": 0.8212102801807549, "learning_rate": 9.993844213598949e-06, "loss": 2.0231, "step": 86 }, { "epoch": 0.11780636425186188, "grad_norm": 1.0684938905029466, "learning_rate": 9.993573719519498e-06, "loss": 2.595, "step": 87 }, { "epoch": 0.11916046039268788, "grad_norm": 0.9867071296775374, "learning_rate": 9.993297413402282e-06, "loss": 2.3636, "step": 88 }, { "epoch": 0.12051455653351388, "grad_norm": 1.404157430750734, "learning_rate": 9.993015295568893e-06, "loss": 2.1992, "step": 89 }, { "epoch": 0.12186865267433988, "grad_norm": 0.8693790763825426, "learning_rate": 9.992727366347688e-06, "loss": 2.7016, "step": 90 }, { "epoch": 0.12322274881516587, "grad_norm": 0.7501248710359847, "learning_rate": 9.99243362607379e-06, "loss": 3.2797, "step": 91 }, { "epoch": 0.12457684495599188, "grad_norm": 1.0643003422752406, "learning_rate": 9.992134075089085e-06, "loss": 2.5437, "step": 92 }, { "epoch": 0.12593094109681788, "grad_norm": 0.9103830863991241, "learning_rate": 9.991828713742218e-06, "loss": 3.0088, "step": 93 }, { "epoch": 0.12728503723764387, "grad_norm": 1.142388030489728, "learning_rate": 9.991517542388605e-06, "loss": 2.4977, "step": 94 }, { "epoch": 0.12863913337846988, "grad_norm": 0.9920213403910143, "learning_rate": 9.991200561390417e-06, "loss": 2.5349, "step": 95 }, { "epoch": 0.12999322951929587, "grad_norm": 0.8981930618864583, "learning_rate": 9.990877771116588e-06, "loss": 2.8531, "step": 96 }, { "epoch": 0.13134732566012186, "grad_norm": 1.154244133374351, "learning_rate": 9.990549171942817e-06, "loss": 2.5311, "step": 97 }, { "epoch": 0.13270142180094788, "grad_norm": 0.7466294529414305, "learning_rate": 9.99021476425156e-06, "loss": 2.4112, "step": 98 }, { "epoch": 0.13405551794177387, "grad_norm": 1.1851681560394296, "learning_rate": 9.989874548432037e-06, "loss": 2.2467, "step": 99 }, { "epoch": 0.13540961408259986, "grad_norm": 1.295896953150764, "learning_rate": 9.989528524880225e-06, "loss": 2.5688, "step": 100 }, { "epoch": 0.13676371022342587, "grad_norm": 0.8892799849200678, "learning_rate": 9.989176693998863e-06, "loss": 2.151, "step": 101 }, { "epoch": 0.13811780636425186, "grad_norm": 1.133636165142649, "learning_rate": 9.988819056197448e-06, "loss": 1.8402, "step": 102 }, { "epoch": 0.13947190250507785, "grad_norm": 0.8016376414959372, "learning_rate": 9.988455611892237e-06, "loss": 2.3469, "step": 103 }, { "epoch": 0.14082599864590387, "grad_norm": 0.964428420697917, "learning_rate": 9.98808636150624e-06, "loss": 2.3062, "step": 104 }, { "epoch": 0.14218009478672985, "grad_norm": 0.7711620898648806, "learning_rate": 9.987711305469232e-06, "loss": 2.7206, "step": 105 }, { "epoch": 0.14353419092755584, "grad_norm": 0.7778439840699306, "learning_rate": 9.987330444217739e-06, "loss": 2.4899, "step": 106 }, { "epoch": 0.14488828706838186, "grad_norm": 0.9642875586985667, "learning_rate": 9.986943778195052e-06, "loss": 2.8345, "step": 107 }, { "epoch": 0.14624238320920785, "grad_norm": 0.8268272296488341, "learning_rate": 9.98655130785121e-06, "loss": 2.5926, "step": 108 }, { "epoch": 0.14759647935003387, "grad_norm": 0.7977529748864721, "learning_rate": 9.986153033643011e-06, "loss": 1.9835, "step": 109 }, { "epoch": 0.14895057549085985, "grad_norm": 1.140904527079228, "learning_rate": 9.985748956034007e-06, "loss": 2.2239, "step": 110 }, { "epoch": 0.15030467163168584, "grad_norm": 1.020890884482194, "learning_rate": 9.985339075494504e-06, "loss": 2.6764, "step": 111 }, { "epoch": 0.15165876777251186, "grad_norm": 0.8869762247213703, "learning_rate": 9.984923392501567e-06, "loss": 2.2497, "step": 112 }, { "epoch": 0.15301286391333785, "grad_norm": 0.9590605445485386, "learning_rate": 9.98450190753901e-06, "loss": 2.3991, "step": 113 }, { "epoch": 0.15436696005416384, "grad_norm": 1.27578790045338, "learning_rate": 9.984074621097397e-06, "loss": 2.3551, "step": 114 }, { "epoch": 0.15572105619498985, "grad_norm": 1.0374791947621171, "learning_rate": 9.983641533674053e-06, "loss": 2.6919, "step": 115 }, { "epoch": 0.15707515233581584, "grad_norm": 0.923336386037252, "learning_rate": 9.983202645773049e-06, "loss": 2.6477, "step": 116 }, { "epoch": 0.15842924847664183, "grad_norm": 0.8526247474365115, "learning_rate": 9.982757957905204e-06, "loss": 2.2264, "step": 117 }, { "epoch": 0.15978334461746785, "grad_norm": 1.1258963168949678, "learning_rate": 9.982307470588097e-06, "loss": 2.5224, "step": 118 }, { "epoch": 0.16113744075829384, "grad_norm": 0.7569639815489783, "learning_rate": 9.98185118434605e-06, "loss": 1.8855, "step": 119 }, { "epoch": 0.16249153689911983, "grad_norm": 1.0355226629532543, "learning_rate": 9.981389099710132e-06, "loss": 3.0085, "step": 120 }, { "epoch": 0.16384563303994584, "grad_norm": 1.1486309630139306, "learning_rate": 9.980921217218173e-06, "loss": 2.7178, "step": 121 }, { "epoch": 0.16519972918077183, "grad_norm": 0.7466296304493713, "learning_rate": 9.980447537414736e-06, "loss": 2.251, "step": 122 }, { "epoch": 0.16655382532159782, "grad_norm": 0.651170263411467, "learning_rate": 9.979968060851144e-06, "loss": 2.1519, "step": 123 }, { "epoch": 0.16790792146242384, "grad_norm": 1.008261184550054, "learning_rate": 9.979482788085455e-06, "loss": 2.2199, "step": 124 }, { "epoch": 0.16926201760324983, "grad_norm": 2.416142037739212, "learning_rate": 9.978991719682486e-06, "loss": 2.3314, "step": 125 }, { "epoch": 0.17061611374407584, "grad_norm": 0.8174937699006882, "learning_rate": 9.97849485621379e-06, "loss": 2.5571, "step": 126 }, { "epoch": 0.17197020988490183, "grad_norm": 1.0052455897777324, "learning_rate": 9.977992198257668e-06, "loss": 2.5827, "step": 127 }, { "epoch": 0.17332430602572782, "grad_norm": 0.8663685595604006, "learning_rate": 9.977483746399168e-06, "loss": 2.5957, "step": 128 }, { "epoch": 0.17467840216655384, "grad_norm": 0.8454778683126596, "learning_rate": 9.976969501230074e-06, "loss": 2.2387, "step": 129 }, { "epoch": 0.17603249830737983, "grad_norm": 1.1589088141787116, "learning_rate": 9.976449463348924e-06, "loss": 2.721, "step": 130 }, { "epoch": 0.17738659444820581, "grad_norm": 0.8357485092208772, "learning_rate": 9.975923633360985e-06, "loss": 2.1065, "step": 131 }, { "epoch": 0.17874069058903183, "grad_norm": 1.0483474542727864, "learning_rate": 9.975392011878278e-06, "loss": 2.5342, "step": 132 }, { "epoch": 0.18009478672985782, "grad_norm": 0.8310501434875607, "learning_rate": 9.974854599519557e-06, "loss": 2.9813, "step": 133 }, { "epoch": 0.1814488828706838, "grad_norm": 1.1998035390299338, "learning_rate": 9.974311396910317e-06, "loss": 2.5218, "step": 134 }, { "epoch": 0.18280297901150983, "grad_norm": 1.1877384820263168, "learning_rate": 9.973762404682795e-06, "loss": 3.0512, "step": 135 }, { "epoch": 0.18415707515233581, "grad_norm": 0.7431690298915412, "learning_rate": 9.973207623475964e-06, "loss": 1.9473, "step": 136 }, { "epoch": 0.1855111712931618, "grad_norm": 0.8085958372588746, "learning_rate": 9.972647053935536e-06, "loss": 2.2599, "step": 137 }, { "epoch": 0.18686526743398782, "grad_norm": 1.0088420757147314, "learning_rate": 9.972080696713962e-06, "loss": 2.3532, "step": 138 }, { "epoch": 0.1882193635748138, "grad_norm": 0.871056602633167, "learning_rate": 9.971508552470424e-06, "loss": 2.1344, "step": 139 }, { "epoch": 0.1895734597156398, "grad_norm": 0.9551730724308998, "learning_rate": 9.970930621870843e-06, "loss": 1.7945, "step": 140 }, { "epoch": 0.1909275558564658, "grad_norm": 0.9099059345338785, "learning_rate": 9.970346905587875e-06, "loss": 2.4697, "step": 141 }, { "epoch": 0.1922816519972918, "grad_norm": 2.3602640235819288, "learning_rate": 9.969757404300911e-06, "loss": 2.3396, "step": 142 }, { "epoch": 0.19363574813811782, "grad_norm": 1.0387196911625636, "learning_rate": 9.969162118696072e-06, "loss": 2.2526, "step": 143 }, { "epoch": 0.1949898442789438, "grad_norm": 1.0660871258756586, "learning_rate": 9.968561049466214e-06, "loss": 2.2518, "step": 144 }, { "epoch": 0.1963439404197698, "grad_norm": 0.994954020536128, "learning_rate": 9.967954197310922e-06, "loss": 2.1365, "step": 145 }, { "epoch": 0.1976980365605958, "grad_norm": 1.1172902304571317, "learning_rate": 9.967341562936515e-06, "loss": 2.4633, "step": 146 }, { "epoch": 0.1990521327014218, "grad_norm": 0.8851246859608983, "learning_rate": 9.966723147056036e-06, "loss": 2.3302, "step": 147 }, { "epoch": 0.2004062288422478, "grad_norm": 0.9673278398671098, "learning_rate": 9.966098950389268e-06, "loss": 2.3481, "step": 148 }, { "epoch": 0.2017603249830738, "grad_norm": 1.0923553526656322, "learning_rate": 9.965468973662712e-06, "loss": 2.7291, "step": 149 }, { "epoch": 0.2031144211238998, "grad_norm": 0.6885203668960329, "learning_rate": 9.9648332176096e-06, "loss": 2.4216, "step": 150 }, { "epoch": 0.20446851726472579, "grad_norm": 0.8835456349684598, "learning_rate": 9.964191682969891e-06, "loss": 2.5524, "step": 151 }, { "epoch": 0.2058226134055518, "grad_norm": 1.110246201094945, "learning_rate": 9.96354437049027e-06, "loss": 2.7307, "step": 152 }, { "epoch": 0.2071767095463778, "grad_norm": 0.8335391435494296, "learning_rate": 9.962891280924148e-06, "loss": 2.2845, "step": 153 }, { "epoch": 0.20853080568720378, "grad_norm": 0.7269532952637685, "learning_rate": 9.962232415031653e-06, "loss": 2.2107, "step": 154 }, { "epoch": 0.2098849018280298, "grad_norm": 0.9516173733706272, "learning_rate": 9.961567773579645e-06, "loss": 2.549, "step": 155 }, { "epoch": 0.21123899796885579, "grad_norm": 0.9192964960712486, "learning_rate": 9.960897357341703e-06, "loss": 2.382, "step": 156 }, { "epoch": 0.21259309410968177, "grad_norm": 0.8697364214950628, "learning_rate": 9.960221167098124e-06, "loss": 2.7404, "step": 157 }, { "epoch": 0.2139471902505078, "grad_norm": 0.854037317845471, "learning_rate": 9.959539203635931e-06, "loss": 2.2796, "step": 158 }, { "epoch": 0.21530128639133378, "grad_norm": 9.059153514312463, "learning_rate": 9.958851467748863e-06, "loss": 2.1798, "step": 159 }, { "epoch": 0.2166553825321598, "grad_norm": 0.9333843627998799, "learning_rate": 9.958157960237376e-06, "loss": 2.3693, "step": 160 }, { "epoch": 0.21800947867298578, "grad_norm": 1.3475036120106114, "learning_rate": 9.957458681908647e-06, "loss": 2.4024, "step": 161 }, { "epoch": 0.21936357481381177, "grad_norm": 1.2121402320200159, "learning_rate": 9.956753633576571e-06, "loss": 2.5439, "step": 162 }, { "epoch": 0.2207176709546378, "grad_norm": 0.7792210046361225, "learning_rate": 9.956042816061752e-06, "loss": 2.0299, "step": 163 }, { "epoch": 0.22207176709546378, "grad_norm": 0.8226985573354776, "learning_rate": 9.955326230191517e-06, "loss": 2.8253, "step": 164 }, { "epoch": 0.22342586323628977, "grad_norm": 0.8424020935830455, "learning_rate": 9.9546038767999e-06, "loss": 2.2754, "step": 165 }, { "epoch": 0.22477995937711578, "grad_norm": 1.1561344660760495, "learning_rate": 9.95387575672765e-06, "loss": 1.9543, "step": 166 }, { "epoch": 0.22613405551794177, "grad_norm": 0.8810904992087591, "learning_rate": 9.953141870822232e-06, "loss": 2.8316, "step": 167 }, { "epoch": 0.22748815165876776, "grad_norm": 1.1327457989077157, "learning_rate": 9.952402219937817e-06, "loss": 2.1659, "step": 168 }, { "epoch": 0.22884224779959378, "grad_norm": 0.847237764337931, "learning_rate": 9.951656804935284e-06, "loss": 2.5104, "step": 169 }, { "epoch": 0.23019634394041977, "grad_norm": 1.3311662339401327, "learning_rate": 9.950905626682229e-06, "loss": 2.7411, "step": 170 }, { "epoch": 0.23155044008124576, "grad_norm": 0.8754683872906716, "learning_rate": 9.950148686052948e-06, "loss": 2.2843, "step": 171 }, { "epoch": 0.23290453622207177, "grad_norm": 1.1984097361442936, "learning_rate": 9.949385983928446e-06, "loss": 2.6191, "step": 172 }, { "epoch": 0.23425863236289776, "grad_norm": 1.042317105372754, "learning_rate": 9.948617521196438e-06, "loss": 2.5479, "step": 173 }, { "epoch": 0.23561272850372375, "grad_norm": 0.7787896671463191, "learning_rate": 9.947843298751337e-06, "loss": 2.3346, "step": 174 }, { "epoch": 0.23696682464454977, "grad_norm": 1.2170235533278835, "learning_rate": 9.947063317494265e-06, "loss": 2.2332, "step": 175 }, { "epoch": 0.23832092078537576, "grad_norm": 1.0390706514693488, "learning_rate": 9.946277578333045e-06, "loss": 2.5912, "step": 176 }, { "epoch": 0.23967501692620177, "grad_norm": 1.4816304376017728, "learning_rate": 9.945486082182201e-06, "loss": 2.4719, "step": 177 }, { "epoch": 0.24102911306702776, "grad_norm": 0.8487826972928669, "learning_rate": 9.944688829962957e-06, "loss": 2.2601, "step": 178 }, { "epoch": 0.24238320920785375, "grad_norm": 1.0267911713585076, "learning_rate": 9.94388582260324e-06, "loss": 2.4376, "step": 179 }, { "epoch": 0.24373730534867977, "grad_norm": 0.8526086813518827, "learning_rate": 9.943077061037672e-06, "loss": 2.6301, "step": 180 }, { "epoch": 0.24509140148950576, "grad_norm": 1.356267698059979, "learning_rate": 9.942262546207572e-06, "loss": 2.723, "step": 181 }, { "epoch": 0.24644549763033174, "grad_norm": 0.7426455232138849, "learning_rate": 9.94144227906096e-06, "loss": 2.4959, "step": 182 }, { "epoch": 0.24779959377115776, "grad_norm": 0.8750438781520365, "learning_rate": 9.940616260552545e-06, "loss": 2.3425, "step": 183 }, { "epoch": 0.24915368991198375, "grad_norm": 0.8785910140656594, "learning_rate": 9.939784491643734e-06, "loss": 2.2364, "step": 184 }, { "epoch": 0.25050778605280977, "grad_norm": 0.8938618124220896, "learning_rate": 9.938946973302624e-06, "loss": 2.19, "step": 185 }, { "epoch": 0.25186188219363576, "grad_norm": 1.021044776141451, "learning_rate": 9.938103706504007e-06, "loss": 2.7688, "step": 186 }, { "epoch": 0.25321597833446174, "grad_norm": 1.2815667789992267, "learning_rate": 9.937254692229363e-06, "loss": 2.1036, "step": 187 }, { "epoch": 0.25457007447528773, "grad_norm": 0.9597963383831872, "learning_rate": 9.936399931466866e-06, "loss": 2.7931, "step": 188 }, { "epoch": 0.2559241706161137, "grad_norm": 1.1496492688566942, "learning_rate": 9.935539425211371e-06, "loss": 2.0287, "step": 189 }, { "epoch": 0.25727826675693977, "grad_norm": 0.9029859331791751, "learning_rate": 9.934673174464426e-06, "loss": 2.555, "step": 190 }, { "epoch": 0.25863236289776576, "grad_norm": 0.8623354344497336, "learning_rate": 9.933801180234263e-06, "loss": 2.4571, "step": 191 }, { "epoch": 0.25998645903859174, "grad_norm": 1.172827727876581, "learning_rate": 9.932923443535798e-06, "loss": 2.5339, "step": 192 }, { "epoch": 0.26134055517941773, "grad_norm": 0.7844413148427782, "learning_rate": 9.932039965390634e-06, "loss": 2.232, "step": 193 }, { "epoch": 0.2626946513202437, "grad_norm": 2.4674022908729993, "learning_rate": 9.931150746827055e-06, "loss": 2.4686, "step": 194 }, { "epoch": 0.2640487474610697, "grad_norm": 0.8938856211560369, "learning_rate": 9.930255788880021e-06, "loss": 2.9519, "step": 195 }, { "epoch": 0.26540284360189575, "grad_norm": 0.9967031966791289, "learning_rate": 9.92935509259118e-06, "loss": 2.3506, "step": 196 }, { "epoch": 0.26675693974272174, "grad_norm": 0.8181579212619565, "learning_rate": 9.928448659008856e-06, "loss": 2.3992, "step": 197 }, { "epoch": 0.26811103588354773, "grad_norm": 0.9019033329287899, "learning_rate": 9.927536489188047e-06, "loss": 1.8896, "step": 198 }, { "epoch": 0.2694651320243737, "grad_norm": 1.196294290686767, "learning_rate": 9.926618584190435e-06, "loss": 2.6578, "step": 199 }, { "epoch": 0.2708192281651997, "grad_norm": 0.8815838170577491, "learning_rate": 9.925694945084369e-06, "loss": 2.8371, "step": 200 }, { "epoch": 0.27217332430602575, "grad_norm": 0.933774886972981, "learning_rate": 9.924765572944879e-06, "loss": 2.0347, "step": 201 }, { "epoch": 0.27352742044685174, "grad_norm": 0.9459867483651119, "learning_rate": 9.923830468853662e-06, "loss": 2.3687, "step": 202 }, { "epoch": 0.27488151658767773, "grad_norm": 0.8281062486347639, "learning_rate": 9.92288963389909e-06, "loss": 2.1651, "step": 203 }, { "epoch": 0.2762356127285037, "grad_norm": 0.7955305725702541, "learning_rate": 9.921943069176203e-06, "loss": 2.5368, "step": 204 }, { "epoch": 0.2775897088693297, "grad_norm": 1.026304391265509, "learning_rate": 9.920990775786712e-06, "loss": 2.9058, "step": 205 }, { "epoch": 0.2789438050101557, "grad_norm": 0.9971825829470802, "learning_rate": 9.920032754838994e-06, "loss": 2.8694, "step": 206 }, { "epoch": 0.28029790115098174, "grad_norm": 1.2484687747540484, "learning_rate": 9.919069007448093e-06, "loss": 2.4603, "step": 207 }, { "epoch": 0.28165199729180773, "grad_norm": 0.864015784119935, "learning_rate": 9.91809953473572e-06, "loss": 2.2217, "step": 208 }, { "epoch": 0.2830060934326337, "grad_norm": 0.9647460693812894, "learning_rate": 9.917124337830242e-06, "loss": 2.4799, "step": 209 }, { "epoch": 0.2843601895734597, "grad_norm": 1.0623146158662309, "learning_rate": 9.916143417866702e-06, "loss": 2.3235, "step": 210 }, { "epoch": 0.2857142857142857, "grad_norm": 1.5128627766738143, "learning_rate": 9.915156775986789e-06, "loss": 2.6927, "step": 211 }, { "epoch": 0.2870683818551117, "grad_norm": 0.9157505604273191, "learning_rate": 9.914164413338863e-06, "loss": 2.056, "step": 212 }, { "epoch": 0.28842247799593773, "grad_norm": 2.1672553239870114, "learning_rate": 9.913166331077937e-06, "loss": 2.3637, "step": 213 }, { "epoch": 0.2897765741367637, "grad_norm": 0.8565503250451203, "learning_rate": 9.912162530365683e-06, "loss": 2.3108, "step": 214 }, { "epoch": 0.2911306702775897, "grad_norm": 0.8071561412028346, "learning_rate": 9.911153012370427e-06, "loss": 2.4094, "step": 215 }, { "epoch": 0.2924847664184157, "grad_norm": 0.960350260627165, "learning_rate": 9.910137778267153e-06, "loss": 2.2326, "step": 216 }, { "epoch": 0.2938388625592417, "grad_norm": 1.1434301374173532, "learning_rate": 9.909116829237492e-06, "loss": 2.1396, "step": 217 }, { "epoch": 0.29519295870006773, "grad_norm": 1.0512962564460284, "learning_rate": 9.908090166469733e-06, "loss": 2.4862, "step": 218 }, { "epoch": 0.2965470548408937, "grad_norm": 0.8803334872680652, "learning_rate": 9.90705779115881e-06, "loss": 2.3063, "step": 219 }, { "epoch": 0.2979011509817197, "grad_norm": 0.7957632137271137, "learning_rate": 9.90601970450631e-06, "loss": 2.1209, "step": 220 }, { "epoch": 0.2992552471225457, "grad_norm": 1.5559922150186727, "learning_rate": 9.904975907720465e-06, "loss": 2.745, "step": 221 }, { "epoch": 0.3006093432633717, "grad_norm": 1.0287819888236789, "learning_rate": 9.903926402016153e-06, "loss": 2.3034, "step": 222 }, { "epoch": 0.3019634394041977, "grad_norm": 0.8755596479469875, "learning_rate": 9.902871188614898e-06, "loss": 2.6008, "step": 223 }, { "epoch": 0.3033175355450237, "grad_norm": 2.222977688536351, "learning_rate": 9.901810268744868e-06, "loss": 2.5897, "step": 224 }, { "epoch": 0.3046716316858497, "grad_norm": 0.8294734280823934, "learning_rate": 9.90074364364087e-06, "loss": 2.373, "step": 225 }, { "epoch": 0.3060257278266757, "grad_norm": 0.9955963385213202, "learning_rate": 9.899671314544352e-06, "loss": 2.8267, "step": 226 }, { "epoch": 0.3073798239675017, "grad_norm": 0.9863487128858249, "learning_rate": 9.898593282703402e-06, "loss": 2.3585, "step": 227 }, { "epoch": 0.3087339201083277, "grad_norm": 1.3476414208683485, "learning_rate": 9.897509549372745e-06, "loss": 2.0764, "step": 228 }, { "epoch": 0.31008801624915366, "grad_norm": 1.1281976478830502, "learning_rate": 9.896420115813741e-06, "loss": 2.1232, "step": 229 }, { "epoch": 0.3114421123899797, "grad_norm": 0.9905003772016358, "learning_rate": 9.89532498329439e-06, "loss": 2.0276, "step": 230 }, { "epoch": 0.3127962085308057, "grad_norm": 0.9642354729606564, "learning_rate": 9.894224153089313e-06, "loss": 1.7903, "step": 231 }, { "epoch": 0.3141503046716317, "grad_norm": 1.8609542881386758, "learning_rate": 9.893117626479778e-06, "loss": 2.6118, "step": 232 }, { "epoch": 0.3155044008124577, "grad_norm": 1.0931602473722466, "learning_rate": 9.892005404753669e-06, "loss": 2.4775, "step": 233 }, { "epoch": 0.31685849695328366, "grad_norm": 0.7800560012460497, "learning_rate": 9.890887489205507e-06, "loss": 2.1569, "step": 234 }, { "epoch": 0.3182125930941097, "grad_norm": 1.150743273577776, "learning_rate": 9.889763881136439e-06, "loss": 2.4256, "step": 235 }, { "epoch": 0.3195666892349357, "grad_norm": 0.9159836558576258, "learning_rate": 9.888634581854235e-06, "loss": 2.3495, "step": 236 }, { "epoch": 0.3209207853757617, "grad_norm": 1.0174141433627475, "learning_rate": 9.88749959267329e-06, "loss": 2.8219, "step": 237 }, { "epoch": 0.3222748815165877, "grad_norm": 0.85962610230906, "learning_rate": 9.886358914914624e-06, "loss": 2.3995, "step": 238 }, { "epoch": 0.32362897765741366, "grad_norm": 0.8714138576280448, "learning_rate": 9.885212549905874e-06, "loss": 2.1309, "step": 239 }, { "epoch": 0.32498307379823965, "grad_norm": 0.985120553099594, "learning_rate": 9.884060498981297e-06, "loss": 2.3078, "step": 240 }, { "epoch": 0.3263371699390657, "grad_norm": 1.1509228640558309, "learning_rate": 9.88290276348177e-06, "loss": 2.762, "step": 241 }, { "epoch": 0.3276912660798917, "grad_norm": 0.8569907806447795, "learning_rate": 9.881739344754789e-06, "loss": 2.4162, "step": 242 }, { "epoch": 0.3290453622207177, "grad_norm": 0.8953655946187061, "learning_rate": 9.880570244154455e-06, "loss": 2.1708, "step": 243 }, { "epoch": 0.33039945836154366, "grad_norm": 2.6172719168994782, "learning_rate": 9.879395463041493e-06, "loss": 2.4244, "step": 244 }, { "epoch": 0.33175355450236965, "grad_norm": 1.1692362620244634, "learning_rate": 9.87821500278323e-06, "loss": 2.3573, "step": 245 }, { "epoch": 0.33310765064319564, "grad_norm": 1.3260737783720347, "learning_rate": 9.877028864753614e-06, "loss": 2.2204, "step": 246 }, { "epoch": 0.3344617467840217, "grad_norm": 1.1284130571617974, "learning_rate": 9.87583705033319e-06, "loss": 2.3806, "step": 247 }, { "epoch": 0.3358158429248477, "grad_norm": 0.8740876722841778, "learning_rate": 9.874639560909118e-06, "loss": 1.8413, "step": 248 }, { "epoch": 0.33716993906567366, "grad_norm": 0.8380346921284223, "learning_rate": 9.87343639787516e-06, "loss": 2.0977, "step": 249 }, { "epoch": 0.33852403520649965, "grad_norm": 0.8382850128591168, "learning_rate": 9.87222756263168e-06, "loss": 2.1709, "step": 250 }, { "epoch": 0.33987813134732564, "grad_norm": 1.1238927174638607, "learning_rate": 9.871013056585646e-06, "loss": 2.4974, "step": 251 }, { "epoch": 0.3412322274881517, "grad_norm": 0.7904179666008488, "learning_rate": 9.869792881150624e-06, "loss": 2.6544, "step": 252 }, { "epoch": 0.3425863236289777, "grad_norm": 1.5604245138186315, "learning_rate": 9.868567037746784e-06, "loss": 2.4196, "step": 253 }, { "epoch": 0.34394041976980366, "grad_norm": 0.9541984067972759, "learning_rate": 9.867335527800887e-06, "loss": 2.437, "step": 254 }, { "epoch": 0.34529451591062965, "grad_norm": 1.1385820836347318, "learning_rate": 9.866098352746295e-06, "loss": 2.6164, "step": 255 }, { "epoch": 0.34664861205145564, "grad_norm": 0.8579010274341586, "learning_rate": 9.864855514022955e-06, "loss": 2.0521, "step": 256 }, { "epoch": 0.34800270819228163, "grad_norm": 1.0456339797858754, "learning_rate": 9.863607013077414e-06, "loss": 1.8313, "step": 257 }, { "epoch": 0.3493568043331077, "grad_norm": 0.9189984940983889, "learning_rate": 9.862352851362808e-06, "loss": 2.2923, "step": 258 }, { "epoch": 0.35071090047393366, "grad_norm": 1.0071081750591437, "learning_rate": 9.861093030338859e-06, "loss": 2.5982, "step": 259 }, { "epoch": 0.35206499661475965, "grad_norm": 0.7791457829988662, "learning_rate": 9.859827551471877e-06, "loss": 2.1629, "step": 260 }, { "epoch": 0.35341909275558564, "grad_norm": 1.3508700131079088, "learning_rate": 9.858556416234755e-06, "loss": 2.3283, "step": 261 }, { "epoch": 0.35477318889641163, "grad_norm": 1.4779336761646127, "learning_rate": 9.857279626106975e-06, "loss": 2.1788, "step": 262 }, { "epoch": 0.3561272850372376, "grad_norm": 0.8287354033035986, "learning_rate": 9.855997182574598e-06, "loss": 2.4326, "step": 263 }, { "epoch": 0.35748138117806366, "grad_norm": 1.2376776693679805, "learning_rate": 9.854709087130261e-06, "loss": 1.9501, "step": 264 }, { "epoch": 0.35883547731888965, "grad_norm": 0.9140324089635932, "learning_rate": 9.853415341273185e-06, "loss": 2.1236, "step": 265 }, { "epoch": 0.36018957345971564, "grad_norm": 0.8399357799885816, "learning_rate": 9.852115946509163e-06, "loss": 2.2648, "step": 266 }, { "epoch": 0.36154366960054163, "grad_norm": 1.0405854607468001, "learning_rate": 9.85081090435057e-06, "loss": 2.2106, "step": 267 }, { "epoch": 0.3628977657413676, "grad_norm": 0.8825320659362774, "learning_rate": 9.849500216316346e-06, "loss": 1.9139, "step": 268 }, { "epoch": 0.36425186188219366, "grad_norm": 0.8884433268338788, "learning_rate": 9.848183883932003e-06, "loss": 2.3441, "step": 269 }, { "epoch": 0.36560595802301965, "grad_norm": 0.8144408969478175, "learning_rate": 9.846861908729628e-06, "loss": 2.7794, "step": 270 }, { "epoch": 0.36696005416384564, "grad_norm": 3.0643775465082093, "learning_rate": 9.845534292247872e-06, "loss": 2.6927, "step": 271 }, { "epoch": 0.36831415030467163, "grad_norm": 1.0918166740808275, "learning_rate": 9.844201036031952e-06, "loss": 2.6845, "step": 272 }, { "epoch": 0.3696682464454976, "grad_norm": 1.0782629441989242, "learning_rate": 9.84286214163365e-06, "loss": 2.1323, "step": 273 }, { "epoch": 0.3710223425863236, "grad_norm": 0.899216159697703, "learning_rate": 9.841517610611309e-06, "loss": 2.5939, "step": 274 }, { "epoch": 0.37237643872714965, "grad_norm": 0.8784697558034725, "learning_rate": 9.840167444529834e-06, "loss": 2.0398, "step": 275 }, { "epoch": 0.37373053486797564, "grad_norm": 0.7893703178184768, "learning_rate": 9.838811644960686e-06, "loss": 2.3489, "step": 276 }, { "epoch": 0.37508463100880163, "grad_norm": 1.5759025892565637, "learning_rate": 9.837450213481888e-06, "loss": 2.1618, "step": 277 }, { "epoch": 0.3764387271496276, "grad_norm": 0.8915508527852146, "learning_rate": 9.836083151678014e-06, "loss": 2.2966, "step": 278 }, { "epoch": 0.3777928232904536, "grad_norm": 0.9656457496381649, "learning_rate": 9.834710461140191e-06, "loss": 2.2487, "step": 279 }, { "epoch": 0.3791469194312796, "grad_norm": 1.0512896665744822, "learning_rate": 9.833332143466099e-06, "loss": 2.4065, "step": 280 }, { "epoch": 0.38050101557210564, "grad_norm": 1.0658387523528818, "learning_rate": 9.831948200259966e-06, "loss": 2.4469, "step": 281 }, { "epoch": 0.3818551117129316, "grad_norm": 1.117757922229749, "learning_rate": 9.830558633132568e-06, "loss": 2.2469, "step": 282 }, { "epoch": 0.3832092078537576, "grad_norm": 0.8846882799705557, "learning_rate": 9.82916344370123e-06, "loss": 1.8672, "step": 283 }, { "epoch": 0.3845633039945836, "grad_norm": 0.8829678982284991, "learning_rate": 9.827762633589813e-06, "loss": 2.3709, "step": 284 }, { "epoch": 0.3859174001354096, "grad_norm": 1.5559611510853741, "learning_rate": 9.826356204428726e-06, "loss": 2.531, "step": 285 }, { "epoch": 0.38727149627623564, "grad_norm": 0.8832988192797496, "learning_rate": 9.82494415785492e-06, "loss": 2.0908, "step": 286 }, { "epoch": 0.3886255924170616, "grad_norm": 0.893313058823839, "learning_rate": 9.82352649551188e-06, "loss": 2.2141, "step": 287 }, { "epoch": 0.3899796885578876, "grad_norm": 1.915840114647375, "learning_rate": 9.822103219049625e-06, "loss": 2.3759, "step": 288 }, { "epoch": 0.3913337846987136, "grad_norm": 1.091811159406276, "learning_rate": 9.820674330124716e-06, "loss": 1.949, "step": 289 }, { "epoch": 0.3926878808395396, "grad_norm": 0.9101585142251976, "learning_rate": 9.819239830400238e-06, "loss": 2.2837, "step": 290 }, { "epoch": 0.3940419769803656, "grad_norm": 0.8669554630795423, "learning_rate": 9.81779972154581e-06, "loss": 2.3568, "step": 291 }, { "epoch": 0.3953960731211916, "grad_norm": 1.0812216653348674, "learning_rate": 9.816354005237583e-06, "loss": 2.5594, "step": 292 }, { "epoch": 0.3967501692620176, "grad_norm": 1.0135729926732555, "learning_rate": 9.814902683158227e-06, "loss": 2.2677, "step": 293 }, { "epoch": 0.3981042654028436, "grad_norm": 1.1354280196245004, "learning_rate": 9.813445756996946e-06, "loss": 2.1235, "step": 294 }, { "epoch": 0.3994583615436696, "grad_norm": 0.8872371772513353, "learning_rate": 9.811983228449457e-06, "loss": 2.6214, "step": 295 }, { "epoch": 0.4008124576844956, "grad_norm": 1.198158875138794, "learning_rate": 9.810515099218004e-06, "loss": 2.5482, "step": 296 }, { "epoch": 0.40216655382532157, "grad_norm": 1.122073024930223, "learning_rate": 9.809041371011347e-06, "loss": 2.3414, "step": 297 }, { "epoch": 0.4035206499661476, "grad_norm": 0.7450105063153143, "learning_rate": 9.807562045544764e-06, "loss": 2.5729, "step": 298 }, { "epoch": 0.4048747461069736, "grad_norm": 1.096232419286508, "learning_rate": 9.806077124540045e-06, "loss": 2.5012, "step": 299 }, { "epoch": 0.4062288422477996, "grad_norm": 1.0045118062203406, "learning_rate": 9.804586609725499e-06, "loss": 2.0684, "step": 300 }, { "epoch": 0.4075829383886256, "grad_norm": 1.0368766866449026, "learning_rate": 9.803090502835938e-06, "loss": 2.1287, "step": 301 }, { "epoch": 0.40893703452945157, "grad_norm": 1.0208588144634514, "learning_rate": 9.801588805612685e-06, "loss": 2.3341, "step": 302 }, { "epoch": 0.4102911306702776, "grad_norm": 1.0454209454651813, "learning_rate": 9.800081519803575e-06, "loss": 2.088, "step": 303 }, { "epoch": 0.4116452268111036, "grad_norm": 1.0546734132886375, "learning_rate": 9.798568647162939e-06, "loss": 2.3576, "step": 304 }, { "epoch": 0.4129993229519296, "grad_norm": 1.1121493752708365, "learning_rate": 9.797050189451615e-06, "loss": 2.1073, "step": 305 }, { "epoch": 0.4143534190927556, "grad_norm": 1.1189378911389287, "learning_rate": 9.795526148436945e-06, "loss": 2.308, "step": 306 }, { "epoch": 0.41570751523358157, "grad_norm": 0.9899429575476786, "learning_rate": 9.793996525892762e-06, "loss": 2.2555, "step": 307 }, { "epoch": 0.41706161137440756, "grad_norm": 1.0479984863056457, "learning_rate": 9.7924613235994e-06, "loss": 2.3882, "step": 308 }, { "epoch": 0.4184157075152336, "grad_norm": 0.8437095271965467, "learning_rate": 9.790920543343686e-06, "loss": 2.8099, "step": 309 }, { "epoch": 0.4197698036560596, "grad_norm": 1.1267645343143333, "learning_rate": 9.78937418691894e-06, "loss": 2.377, "step": 310 }, { "epoch": 0.4211238997968856, "grad_norm": 0.954265542677354, "learning_rate": 9.787822256124972e-06, "loss": 2.2119, "step": 311 }, { "epoch": 0.42247799593771157, "grad_norm": 1.0897928699498936, "learning_rate": 9.78626475276808e-06, "loss": 2.5392, "step": 312 }, { "epoch": 0.42383209207853756, "grad_norm": 0.8557954442168854, "learning_rate": 9.784701678661045e-06, "loss": 2.0243, "step": 313 }, { "epoch": 0.42518618821936355, "grad_norm": 1.007998713387866, "learning_rate": 9.783133035623136e-06, "loss": 2.0593, "step": 314 }, { "epoch": 0.4265402843601896, "grad_norm": 0.8245127331943697, "learning_rate": 9.781558825480104e-06, "loss": 1.9544, "step": 315 }, { "epoch": 0.4278943805010156, "grad_norm": 1.1343475642376186, "learning_rate": 9.779979050064174e-06, "loss": 2.1917, "step": 316 }, { "epoch": 0.42924847664184157, "grad_norm": 1.1336908348453985, "learning_rate": 9.778393711214054e-06, "loss": 2.3615, "step": 317 }, { "epoch": 0.43060257278266756, "grad_norm": 1.0074004622651764, "learning_rate": 9.776802810774924e-06, "loss": 2.4642, "step": 318 }, { "epoch": 0.43195666892349355, "grad_norm": 0.869836917179775, "learning_rate": 9.77520635059844e-06, "loss": 2.2773, "step": 319 }, { "epoch": 0.4333107650643196, "grad_norm": 0.8822131547317252, "learning_rate": 9.77360433254273e-06, "loss": 2.5491, "step": 320 }, { "epoch": 0.4346648612051456, "grad_norm": 0.9871053514468092, "learning_rate": 9.771996758472381e-06, "loss": 2.5086, "step": 321 }, { "epoch": 0.43601895734597157, "grad_norm": 0.8810949134669234, "learning_rate": 9.770383630258463e-06, "loss": 2.4492, "step": 322 }, { "epoch": 0.43737305348679756, "grad_norm": 0.9634447042117443, "learning_rate": 9.768764949778495e-06, "loss": 2.4698, "step": 323 }, { "epoch": 0.43872714962762355, "grad_norm": 0.9578841465826307, "learning_rate": 9.767140718916467e-06, "loss": 2.8468, "step": 324 }, { "epoch": 0.44008124576844954, "grad_norm": 1.045773660121023, "learning_rate": 9.765510939562827e-06, "loss": 2.4331, "step": 325 }, { "epoch": 0.4414353419092756, "grad_norm": 1.5236244191611785, "learning_rate": 9.763875613614482e-06, "loss": 2.4025, "step": 326 }, { "epoch": 0.44278943805010157, "grad_norm": 0.9193628457370401, "learning_rate": 9.762234742974793e-06, "loss": 2.2136, "step": 327 }, { "epoch": 0.44414353419092756, "grad_norm": 0.8949925268143644, "learning_rate": 9.76058832955357e-06, "loss": 2.4088, "step": 328 }, { "epoch": 0.44549763033175355, "grad_norm": 2.0556356286018413, "learning_rate": 9.758936375267087e-06, "loss": 2.1351, "step": 329 }, { "epoch": 0.44685172647257954, "grad_norm": 1.0444826822145854, "learning_rate": 9.757278882038056e-06, "loss": 2.7799, "step": 330 }, { "epoch": 0.4482058226134055, "grad_norm": 1.0824437177931403, "learning_rate": 9.755615851795639e-06, "loss": 1.8668, "step": 331 }, { "epoch": 0.44955991875423157, "grad_norm": 1.1111629995250478, "learning_rate": 9.753947286475442e-06, "loss": 2.754, "step": 332 }, { "epoch": 0.45091401489505756, "grad_norm": 0.9573737568401282, "learning_rate": 9.752273188019514e-06, "loss": 2.5329, "step": 333 }, { "epoch": 0.45226811103588355, "grad_norm": 1.0357238977124108, "learning_rate": 9.750593558376347e-06, "loss": 2.2007, "step": 334 }, { "epoch": 0.45362220717670954, "grad_norm": 0.9095487712099384, "learning_rate": 9.748908399500863e-06, "loss": 1.8825, "step": 335 }, { "epoch": 0.4549763033175355, "grad_norm": 1.7445295348550816, "learning_rate": 9.747217713354428e-06, "loss": 2.5305, "step": 336 }, { "epoch": 0.45633039945836157, "grad_norm": 0.9070558170854458, "learning_rate": 9.745521501904835e-06, "loss": 2.1951, "step": 337 }, { "epoch": 0.45768449559918756, "grad_norm": 0.8995516521874707, "learning_rate": 9.743819767126312e-06, "loss": 2.2137, "step": 338 }, { "epoch": 0.45903859174001355, "grad_norm": 1.9426471289696312, "learning_rate": 9.742112510999516e-06, "loss": 2.3716, "step": 339 }, { "epoch": 0.46039268788083954, "grad_norm": 0.9725576998378328, "learning_rate": 9.740399735511524e-06, "loss": 2.6405, "step": 340 }, { "epoch": 0.4617467840216655, "grad_norm": 0.8305802783581133, "learning_rate": 9.738681442655842e-06, "loss": 2.2231, "step": 341 }, { "epoch": 0.4631008801624915, "grad_norm": 0.9734526439933455, "learning_rate": 9.736957634432398e-06, "loss": 2.3041, "step": 342 }, { "epoch": 0.46445497630331756, "grad_norm": 0.824808147309629, "learning_rate": 9.73522831284754e-06, "loss": 2.3024, "step": 343 }, { "epoch": 0.46580907244414355, "grad_norm": 0.9599600802177057, "learning_rate": 9.733493479914031e-06, "loss": 2.3563, "step": 344 }, { "epoch": 0.46716316858496953, "grad_norm": 0.9340063834780592, "learning_rate": 9.731753137651047e-06, "loss": 2.1237, "step": 345 }, { "epoch": 0.4685172647257955, "grad_norm": 1.562533249030556, "learning_rate": 9.730007288084178e-06, "loss": 2.3575, "step": 346 }, { "epoch": 0.4698713608666215, "grad_norm": 0.9490993069469829, "learning_rate": 9.728255933245428e-06, "loss": 2.8822, "step": 347 }, { "epoch": 0.4712254570074475, "grad_norm": 0.9566724872274933, "learning_rate": 9.726499075173201e-06, "loss": 2.3402, "step": 348 }, { "epoch": 0.47257955314827355, "grad_norm": 1.008513385617003, "learning_rate": 9.724736715912313e-06, "loss": 2.1406, "step": 349 }, { "epoch": 0.47393364928909953, "grad_norm": 1.1766610712644368, "learning_rate": 9.72296885751398e-06, "loss": 2.1934, "step": 350 }, { "epoch": 0.4752877454299255, "grad_norm": 0.9325244223791722, "learning_rate": 9.721195502035817e-06, "loss": 1.9835, "step": 351 }, { "epoch": 0.4766418415707515, "grad_norm": 0.837427966240756, "learning_rate": 9.719416651541839e-06, "loss": 2.481, "step": 352 }, { "epoch": 0.4779959377115775, "grad_norm": 1.02007044942659, "learning_rate": 9.717632308102455e-06, "loss": 2.7148, "step": 353 }, { "epoch": 0.47935003385240355, "grad_norm": 0.959116878107154, "learning_rate": 9.715842473794472e-06, "loss": 2.4738, "step": 354 }, { "epoch": 0.48070412999322953, "grad_norm": 1.0102624905029922, "learning_rate": 9.714047150701082e-06, "loss": 2.0945, "step": 355 }, { "epoch": 0.4820582261340555, "grad_norm": 2.0984222479912438, "learning_rate": 9.712246340911866e-06, "loss": 2.1697, "step": 356 }, { "epoch": 0.4834123222748815, "grad_norm": 0.917396475775467, "learning_rate": 9.710440046522797e-06, "loss": 2.2475, "step": 357 }, { "epoch": 0.4847664184157075, "grad_norm": 1.005361407668732, "learning_rate": 9.708628269636224e-06, "loss": 2.2307, "step": 358 }, { "epoch": 0.4861205145565335, "grad_norm": 1.0309046462532379, "learning_rate": 9.706811012360882e-06, "loss": 2.4654, "step": 359 }, { "epoch": 0.48747461069735953, "grad_norm": 3.0659420771722834, "learning_rate": 9.704988276811883e-06, "loss": 2.7097, "step": 360 }, { "epoch": 0.4888287068381855, "grad_norm": 1.0772251854315056, "learning_rate": 9.703160065110716e-06, "loss": 2.3728, "step": 361 }, { "epoch": 0.4901828029790115, "grad_norm": 1.1656105776873809, "learning_rate": 9.701326379385238e-06, "loss": 2.6172, "step": 362 }, { "epoch": 0.4915368991198375, "grad_norm": 1.0293153712278362, "learning_rate": 9.699487221769687e-06, "loss": 2.427, "step": 363 }, { "epoch": 0.4928909952606635, "grad_norm": 1.1554858971025437, "learning_rate": 9.697642594404666e-06, "loss": 2.3854, "step": 364 }, { "epoch": 0.4942450914014895, "grad_norm": 1.1552799937404215, "learning_rate": 9.69579249943714e-06, "loss": 2.2642, "step": 365 }, { "epoch": 0.4955991875423155, "grad_norm": 1.3024770839893858, "learning_rate": 9.693936939020441e-06, "loss": 2.7555, "step": 366 }, { "epoch": 0.4969532836831415, "grad_norm": 0.9139562471712885, "learning_rate": 9.692075915314265e-06, "loss": 1.9849, "step": 367 }, { "epoch": 0.4983073798239675, "grad_norm": 1.1108979731718802, "learning_rate": 9.69020943048466e-06, "loss": 2.7987, "step": 368 }, { "epoch": 0.4996614759647935, "grad_norm": 1.043554360812148, "learning_rate": 9.688337486704038e-06, "loss": 2.379, "step": 369 }, { "epoch": 0.5010155721056195, "grad_norm": 0.8482819857067955, "learning_rate": 9.686460086151159e-06, "loss": 2.3031, "step": 370 }, { "epoch": 0.5023696682464455, "grad_norm": 0.882893031313843, "learning_rate": 9.684577231011134e-06, "loss": 1.9398, "step": 371 }, { "epoch": 0.5037237643872715, "grad_norm": 0.8900569461037698, "learning_rate": 9.68268892347543e-06, "loss": 2.3579, "step": 372 }, { "epoch": 0.5050778605280974, "grad_norm": 1.027440419577199, "learning_rate": 9.680795165741849e-06, "loss": 1.8953, "step": 373 }, { "epoch": 0.5064319566689235, "grad_norm": 1.2020524441224305, "learning_rate": 9.678895960014545e-06, "loss": 2.6735, "step": 374 }, { "epoch": 0.5077860528097495, "grad_norm": 1.6359343554936903, "learning_rate": 9.676991308504012e-06, "loss": 2.5313, "step": 375 }, { "epoch": 0.5091401489505755, "grad_norm": 0.9318914822552815, "learning_rate": 9.675081213427076e-06, "loss": 2.7319, "step": 376 }, { "epoch": 0.5104942450914015, "grad_norm": 1.0629414040228036, "learning_rate": 9.673165677006906e-06, "loss": 2.5041, "step": 377 }, { "epoch": 0.5118483412322274, "grad_norm": 0.9015543787336042, "learning_rate": 9.671244701472999e-06, "loss": 2.2052, "step": 378 }, { "epoch": 0.5132024373730535, "grad_norm": 0.9603399577043747, "learning_rate": 9.669318289061191e-06, "loss": 2.5069, "step": 379 }, { "epoch": 0.5145565335138795, "grad_norm": 0.8421778244290412, "learning_rate": 9.667386442013634e-06, "loss": 2.6313, "step": 380 }, { "epoch": 0.5159106296547055, "grad_norm": 0.8356138567382542, "learning_rate": 9.665449162578814e-06, "loss": 2.0145, "step": 381 }, { "epoch": 0.5172647257955315, "grad_norm": 0.9669755721462949, "learning_rate": 9.663506453011538e-06, "loss": 2.2227, "step": 382 }, { "epoch": 0.5186188219363574, "grad_norm": 1.163959225455655, "learning_rate": 9.66155831557293e-06, "loss": 2.5775, "step": 383 }, { "epoch": 0.5199729180771835, "grad_norm": 0.9362192144637581, "learning_rate": 9.659604752530434e-06, "loss": 2.2014, "step": 384 }, { "epoch": 0.5213270142180095, "grad_norm": 0.9892445662855336, "learning_rate": 9.657645766157813e-06, "loss": 2.4195, "step": 385 }, { "epoch": 0.5226811103588355, "grad_norm": 0.8722826858897498, "learning_rate": 9.655681358735134e-06, "loss": 2.6966, "step": 386 }, { "epoch": 0.5240352064996615, "grad_norm": 1.0196349369430946, "learning_rate": 9.653711532548778e-06, "loss": 2.4237, "step": 387 }, { "epoch": 0.5253893026404874, "grad_norm": 1.0011773469402157, "learning_rate": 9.651736289891434e-06, "loss": 2.3765, "step": 388 }, { "epoch": 0.5267433987813135, "grad_norm": 0.9843650150083161, "learning_rate": 9.649755633062092e-06, "loss": 2.4698, "step": 389 }, { "epoch": 0.5280974949221394, "grad_norm": 1.2199752234970775, "learning_rate": 9.647769564366048e-06, "loss": 2.2025, "step": 390 }, { "epoch": 0.5294515910629655, "grad_norm": 1.048526738243433, "learning_rate": 9.645778086114892e-06, "loss": 2.5315, "step": 391 }, { "epoch": 0.5308056872037915, "grad_norm": 1.0079058862169128, "learning_rate": 9.643781200626512e-06, "loss": 2.2688, "step": 392 }, { "epoch": 0.5321597833446174, "grad_norm": 0.9175841000628026, "learning_rate": 9.641778910225093e-06, "loss": 2.4924, "step": 393 }, { "epoch": 0.5335138794854435, "grad_norm": 0.9394126148321758, "learning_rate": 9.639771217241104e-06, "loss": 2.6604, "step": 394 }, { "epoch": 0.5348679756262694, "grad_norm": 1.8536643574976448, "learning_rate": 9.637758124011307e-06, "loss": 2.3891, "step": 395 }, { "epoch": 0.5362220717670955, "grad_norm": 1.353118661391877, "learning_rate": 9.63573963287875e-06, "loss": 2.2486, "step": 396 }, { "epoch": 0.5375761679079215, "grad_norm": 0.9114437873284958, "learning_rate": 9.633715746192762e-06, "loss": 2.9009, "step": 397 }, { "epoch": 0.5389302640487474, "grad_norm": 0.9178272536878976, "learning_rate": 9.631686466308947e-06, "loss": 2.6545, "step": 398 }, { "epoch": 0.5402843601895735, "grad_norm": 0.8349824532422451, "learning_rate": 9.629651795589197e-06, "loss": 2.3238, "step": 399 }, { "epoch": 0.5416384563303994, "grad_norm": 0.8002054931156397, "learning_rate": 9.627611736401668e-06, "loss": 2.1503, "step": 400 }, { "epoch": 0.5429925524712255, "grad_norm": 1.014043339956385, "learning_rate": 9.625566291120794e-06, "loss": 2.3366, "step": 401 }, { "epoch": 0.5443466486120515, "grad_norm": 0.9753524140226457, "learning_rate": 9.623515462127276e-06, "loss": 2.4777, "step": 402 }, { "epoch": 0.5457007447528774, "grad_norm": 0.9904758899407581, "learning_rate": 9.621459251808078e-06, "loss": 2.5355, "step": 403 }, { "epoch": 0.5470548408937035, "grad_norm": 0.9176434471710123, "learning_rate": 9.619397662556434e-06, "loss": 2.1775, "step": 404 }, { "epoch": 0.5484089370345294, "grad_norm": 0.9994849139704428, "learning_rate": 9.617330696771834e-06, "loss": 2.3618, "step": 405 }, { "epoch": 0.5497630331753555, "grad_norm": 0.9718579619606619, "learning_rate": 9.615258356860027e-06, "loss": 2.065, "step": 406 }, { "epoch": 0.5511171293161814, "grad_norm": 0.8890137365695405, "learning_rate": 9.613180645233014e-06, "loss": 2.153, "step": 407 }, { "epoch": 0.5524712254570074, "grad_norm": 1.001837019128747, "learning_rate": 9.611097564309054e-06, "loss": 2.4168, "step": 408 }, { "epoch": 0.5538253215978335, "grad_norm": 1.205118639472601, "learning_rate": 9.609009116512648e-06, "loss": 2.5229, "step": 409 }, { "epoch": 0.5551794177386594, "grad_norm": 0.9757578479246173, "learning_rate": 9.60691530427455e-06, "loss": 2.1174, "step": 410 }, { "epoch": 0.5565335138794855, "grad_norm": 0.9985946290929901, "learning_rate": 9.60481613003176e-06, "loss": 1.9395, "step": 411 }, { "epoch": 0.5578876100203114, "grad_norm": 0.9058862928790223, "learning_rate": 9.602711596227507e-06, "loss": 2.1988, "step": 412 }, { "epoch": 0.5592417061611374, "grad_norm": 0.9620888369560685, "learning_rate": 9.600601705311267e-06, "loss": 2.3826, "step": 413 }, { "epoch": 0.5605958023019635, "grad_norm": 1.0478032599035778, "learning_rate": 9.598486459738751e-06, "loss": 2.4528, "step": 414 }, { "epoch": 0.5619498984427894, "grad_norm": 0.8838781940692847, "learning_rate": 9.5963658619719e-06, "loss": 2.2238, "step": 415 }, { "epoch": 0.5633039945836155, "grad_norm": 1.044653451464992, "learning_rate": 9.594239914478886e-06, "loss": 2.8379, "step": 416 }, { "epoch": 0.5646580907244414, "grad_norm": 0.9143098376517602, "learning_rate": 9.592108619734107e-06, "loss": 2.3621, "step": 417 }, { "epoch": 0.5660121868652674, "grad_norm": 1.3056702719267006, "learning_rate": 9.58997198021818e-06, "loss": 2.3495, "step": 418 }, { "epoch": 0.5673662830060935, "grad_norm": 1.2114169289052528, "learning_rate": 9.587829998417953e-06, "loss": 2.6471, "step": 419 }, { "epoch": 0.5687203791469194, "grad_norm": 1.046324944660913, "learning_rate": 9.58568267682648e-06, "loss": 2.402, "step": 420 }, { "epoch": 0.5700744752877455, "grad_norm": 0.9349583810327691, "learning_rate": 9.58353001794304e-06, "loss": 2.2225, "step": 421 }, { "epoch": 0.5714285714285714, "grad_norm": 1.481772765240176, "learning_rate": 9.581372024273121e-06, "loss": 1.915, "step": 422 }, { "epoch": 0.5727826675693974, "grad_norm": 0.928476736332317, "learning_rate": 9.579208698328419e-06, "loss": 1.8008, "step": 423 }, { "epoch": 0.5741367637102234, "grad_norm": 0.9260270264242221, "learning_rate": 9.577040042626832e-06, "loss": 2.417, "step": 424 }, { "epoch": 0.5754908598510494, "grad_norm": 1.1064706208462298, "learning_rate": 9.574866059692471e-06, "loss": 2.5265, "step": 425 }, { "epoch": 0.5768449559918755, "grad_norm": 2.139248004116943, "learning_rate": 9.57268675205564e-06, "loss": 2.1862, "step": 426 }, { "epoch": 0.5781990521327014, "grad_norm": 1.2117707663910726, "learning_rate": 9.570502122252844e-06, "loss": 2.8615, "step": 427 }, { "epoch": 0.5795531482735274, "grad_norm": 0.8678663266783263, "learning_rate": 9.568312172826779e-06, "loss": 2.2196, "step": 428 }, { "epoch": 0.5809072444143534, "grad_norm": 1.183740118318461, "learning_rate": 9.566116906326336e-06, "loss": 2.7205, "step": 429 }, { "epoch": 0.5822613405551794, "grad_norm": 0.9774576381702014, "learning_rate": 9.563916325306595e-06, "loss": 2.2203, "step": 430 }, { "epoch": 0.5836154366960055, "grad_norm": 0.9824933411377383, "learning_rate": 9.561710432328817e-06, "loss": 2.7149, "step": 431 }, { "epoch": 0.5849695328368314, "grad_norm": 1.8285370131236038, "learning_rate": 9.55949922996045e-06, "loss": 2.1549, "step": 432 }, { "epoch": 0.5863236289776574, "grad_norm": 0.92698096294728, "learning_rate": 9.55728272077512e-06, "loss": 2.0743, "step": 433 }, { "epoch": 0.5876777251184834, "grad_norm": 1.0344108156736416, "learning_rate": 9.555060907352632e-06, "loss": 2.3762, "step": 434 }, { "epoch": 0.5890318212593094, "grad_norm": 1.0045554696436734, "learning_rate": 9.552833792278957e-06, "loss": 2.1954, "step": 435 }, { "epoch": 0.5903859174001355, "grad_norm": 0.7967159103822394, "learning_rate": 9.550601378146246e-06, "loss": 2.0138, "step": 436 }, { "epoch": 0.5917400135409614, "grad_norm": 0.8111896941287193, "learning_rate": 9.54836366755281e-06, "loss": 2.3701, "step": 437 }, { "epoch": 0.5930941096817874, "grad_norm": 0.9726524616144652, "learning_rate": 9.546120663103134e-06, "loss": 1.9217, "step": 438 }, { "epoch": 0.5944482058226134, "grad_norm": 1.0256397319731523, "learning_rate": 9.543872367407854e-06, "loss": 2.0282, "step": 439 }, { "epoch": 0.5958023019634394, "grad_norm": 1.0079882867447063, "learning_rate": 9.54161878308377e-06, "loss": 2.3361, "step": 440 }, { "epoch": 0.5971563981042654, "grad_norm": 0.9800767836635148, "learning_rate": 9.539359912753839e-06, "loss": 2.6659, "step": 441 }, { "epoch": 0.5985104942450914, "grad_norm": 1.0649583722690574, "learning_rate": 9.537095759047163e-06, "loss": 2.4143, "step": 442 }, { "epoch": 0.5998645903859174, "grad_norm": 1.174278203502222, "learning_rate": 9.534826324599002e-06, "loss": 2.22, "step": 443 }, { "epoch": 0.6012186865267434, "grad_norm": 1.3010142344912385, "learning_rate": 9.53255161205076e-06, "loss": 2.1634, "step": 444 }, { "epoch": 0.6025727826675694, "grad_norm": 0.8409212163726476, "learning_rate": 9.530271624049979e-06, "loss": 1.8782, "step": 445 }, { "epoch": 0.6039268788083954, "grad_norm": 0.8027451978851314, "learning_rate": 9.527986363250348e-06, "loss": 2.203, "step": 446 }, { "epoch": 0.6052809749492214, "grad_norm": 0.9039555297703499, "learning_rate": 9.525695832311688e-06, "loss": 2.3687, "step": 447 }, { "epoch": 0.6066350710900474, "grad_norm": 0.8912812069256285, "learning_rate": 9.523400033899957e-06, "loss": 2.2258, "step": 448 }, { "epoch": 0.6079891672308734, "grad_norm": 0.8488642026117442, "learning_rate": 9.52109897068724e-06, "loss": 2.4719, "step": 449 }, { "epoch": 0.6093432633716994, "grad_norm": 0.9189612986590668, "learning_rate": 9.518792645351757e-06, "loss": 2.1323, "step": 450 }, { "epoch": 0.6106973595125254, "grad_norm": 1.0241173909729102, "learning_rate": 9.516481060577847e-06, "loss": 2.3125, "step": 451 }, { "epoch": 0.6120514556533514, "grad_norm": 0.9700124960446453, "learning_rate": 9.514164219055969e-06, "loss": 2.1262, "step": 452 }, { "epoch": 0.6134055517941774, "grad_norm": 0.9865209216124129, "learning_rate": 9.511842123482703e-06, "loss": 2.0346, "step": 453 }, { "epoch": 0.6147596479350034, "grad_norm": 0.8339628050854324, "learning_rate": 9.509514776560747e-06, "loss": 2.2793, "step": 454 }, { "epoch": 0.6161137440758294, "grad_norm": 1.2124630572926367, "learning_rate": 9.507182180998906e-06, "loss": 2.1059, "step": 455 }, { "epoch": 0.6174678402166554, "grad_norm": 1.1523483755668846, "learning_rate": 9.504844339512096e-06, "loss": 2.1094, "step": 456 }, { "epoch": 0.6188219363574814, "grad_norm": 0.9250806368586671, "learning_rate": 9.50250125482134e-06, "loss": 2.6985, "step": 457 }, { "epoch": 0.6201760324983073, "grad_norm": 1.2028362769677976, "learning_rate": 9.500152929653764e-06, "loss": 2.3153, "step": 458 }, { "epoch": 0.6215301286391334, "grad_norm": 1.071969908607831, "learning_rate": 9.497799366742586e-06, "loss": 2.0759, "step": 459 }, { "epoch": 0.6228842247799594, "grad_norm": 1.3535021074481768, "learning_rate": 9.49544056882713e-06, "loss": 2.2538, "step": 460 }, { "epoch": 0.6242383209207854, "grad_norm": 1.1103088277032604, "learning_rate": 9.49307653865281e-06, "loss": 2.0217, "step": 461 }, { "epoch": 0.6255924170616114, "grad_norm": 0.9163278193375946, "learning_rate": 9.490707278971127e-06, "loss": 2.2543, "step": 462 }, { "epoch": 0.6269465132024373, "grad_norm": 0.953261634167317, "learning_rate": 9.488332792539672e-06, "loss": 2.6539, "step": 463 }, { "epoch": 0.6283006093432634, "grad_norm": 1.029379230332492, "learning_rate": 9.485953082122116e-06, "loss": 2.3507, "step": 464 }, { "epoch": 0.6296547054840894, "grad_norm": 0.9618296595544104, "learning_rate": 9.483568150488215e-06, "loss": 2.4448, "step": 465 }, { "epoch": 0.6310088016249153, "grad_norm": 0.9594228312002971, "learning_rate": 9.481178000413796e-06, "loss": 2.6108, "step": 466 }, { "epoch": 0.6323628977657414, "grad_norm": 0.9624426030456921, "learning_rate": 9.478782634680765e-06, "loss": 2.3803, "step": 467 }, { "epoch": 0.6337169939065673, "grad_norm": 0.9808684308680445, "learning_rate": 9.476382056077097e-06, "loss": 2.5205, "step": 468 }, { "epoch": 0.6350710900473934, "grad_norm": 1.0917634456153018, "learning_rate": 9.473976267396831e-06, "loss": 2.008, "step": 469 }, { "epoch": 0.6364251861882194, "grad_norm": 1.85759777161883, "learning_rate": 9.471565271440075e-06, "loss": 1.9461, "step": 470 }, { "epoch": 0.6377792823290453, "grad_norm": 0.9263896064370456, "learning_rate": 9.469149071012996e-06, "loss": 2.1459, "step": 471 }, { "epoch": 0.6391333784698714, "grad_norm": 1.2831155066680264, "learning_rate": 9.466727668927817e-06, "loss": 2.5968, "step": 472 }, { "epoch": 0.6404874746106973, "grad_norm": 2.5926011027880405, "learning_rate": 9.464301068002815e-06, "loss": 2.8532, "step": 473 }, { "epoch": 0.6418415707515234, "grad_norm": 1.3517479249404691, "learning_rate": 9.461869271062322e-06, "loss": 2.5085, "step": 474 }, { "epoch": 0.6431956668923493, "grad_norm": 1.0653079693612253, "learning_rate": 9.459432280936714e-06, "loss": 1.926, "step": 475 }, { "epoch": 0.6445497630331753, "grad_norm": 1.0222358313347504, "learning_rate": 9.456990100462411e-06, "loss": 2.0835, "step": 476 }, { "epoch": 0.6459038591740014, "grad_norm": 0.9472201990170999, "learning_rate": 9.454542732481876e-06, "loss": 2.5298, "step": 477 }, { "epoch": 0.6472579553148273, "grad_norm": 1.05236931626042, "learning_rate": 9.452090179843609e-06, "loss": 2.5688, "step": 478 }, { "epoch": 0.6486120514556534, "grad_norm": 0.9205333314410238, "learning_rate": 9.449632445402146e-06, "loss": 2.2376, "step": 479 }, { "epoch": 0.6499661475964793, "grad_norm": 0.936047746739389, "learning_rate": 9.44716953201805e-06, "loss": 2.2657, "step": 480 }, { "epoch": 0.6513202437373053, "grad_norm": 0.984012799628733, "learning_rate": 9.444701442557917e-06, "loss": 2.2747, "step": 481 }, { "epoch": 0.6526743398781314, "grad_norm": 1.1605171743997218, "learning_rate": 9.442228179894362e-06, "loss": 2.5819, "step": 482 }, { "epoch": 0.6540284360189573, "grad_norm": 1.0367078942789454, "learning_rate": 9.439749746906027e-06, "loss": 2.3326, "step": 483 }, { "epoch": 0.6553825321597834, "grad_norm": 0.8511526169640604, "learning_rate": 9.437266146477567e-06, "loss": 2.3916, "step": 484 }, { "epoch": 0.6567366283006093, "grad_norm": 0.9546785433848596, "learning_rate": 9.434777381499654e-06, "loss": 2.2932, "step": 485 }, { "epoch": 0.6580907244414353, "grad_norm": 0.897715373597665, "learning_rate": 9.43228345486897e-06, "loss": 2.2329, "step": 486 }, { "epoch": 0.6594448205822614, "grad_norm": 1.1576802918228548, "learning_rate": 9.429784369488205e-06, "loss": 2.5906, "step": 487 }, { "epoch": 0.6607989167230873, "grad_norm": 1.2669978365090186, "learning_rate": 9.427280128266049e-06, "loss": 2.0969, "step": 488 }, { "epoch": 0.6621530128639134, "grad_norm": 1.116706571792338, "learning_rate": 9.424770734117206e-06, "loss": 2.0978, "step": 489 }, { "epoch": 0.6635071090047393, "grad_norm": 1.0731484382258585, "learning_rate": 9.42225618996236e-06, "loss": 2.2606, "step": 490 }, { "epoch": 0.6648612051455653, "grad_norm": 1.0217610153976007, "learning_rate": 9.419736498728203e-06, "loss": 2.0359, "step": 491 }, { "epoch": 0.6662153012863913, "grad_norm": 0.9470105904564678, "learning_rate": 9.417211663347407e-06, "loss": 2.3133, "step": 492 }, { "epoch": 0.6675693974272173, "grad_norm": 1.1496175064197292, "learning_rate": 9.414681686758645e-06, "loss": 2.6914, "step": 493 }, { "epoch": 0.6689234935680434, "grad_norm": 1.1088711952773571, "learning_rate": 9.412146571906556e-06, "loss": 2.1629, "step": 494 }, { "epoch": 0.6702775897088693, "grad_norm": 2.4048301880979315, "learning_rate": 9.409606321741776e-06, "loss": 2.3511, "step": 495 }, { "epoch": 0.6716316858496953, "grad_norm": 1.0855961625720172, "learning_rate": 9.407060939220907e-06, "loss": 3.0198, "step": 496 }, { "epoch": 0.6729857819905213, "grad_norm": 0.831939027060162, "learning_rate": 9.404510427306533e-06, "loss": 2.0721, "step": 497 }, { "epoch": 0.6743398781313473, "grad_norm": 0.9189030221620978, "learning_rate": 9.401954788967199e-06, "loss": 2.1533, "step": 498 }, { "epoch": 0.6756939742721734, "grad_norm": 1.1215682593702503, "learning_rate": 9.39939402717742e-06, "loss": 2.3017, "step": 499 }, { "epoch": 0.6770480704129993, "grad_norm": 0.9997121388139094, "learning_rate": 9.396828144917682e-06, "loss": 2.1092, "step": 500 }, { "epoch": 0.6784021665538253, "grad_norm": 1.0419306713261767, "learning_rate": 9.39425714517442e-06, "loss": 2.3772, "step": 501 }, { "epoch": 0.6797562626946513, "grad_norm": 0.8546730463318373, "learning_rate": 9.391681030940031e-06, "loss": 2.4692, "step": 502 }, { "epoch": 0.6811103588354773, "grad_norm": 1.036666833583794, "learning_rate": 9.389099805212862e-06, "loss": 2.4967, "step": 503 }, { "epoch": 0.6824644549763034, "grad_norm": 0.9028903653709767, "learning_rate": 9.38651347099721e-06, "loss": 2.2658, "step": 504 }, { "epoch": 0.6838185511171293, "grad_norm": 1.004993430273087, "learning_rate": 9.38392203130332e-06, "loss": 2.2413, "step": 505 }, { "epoch": 0.6851726472579553, "grad_norm": 4.244116799684234, "learning_rate": 9.38132548914738e-06, "loss": 2.2353, "step": 506 }, { "epoch": 0.6865267433987813, "grad_norm": 2.0623005376684516, "learning_rate": 9.37872384755151e-06, "loss": 2.3859, "step": 507 }, { "epoch": 0.6878808395396073, "grad_norm": 1.099884816816885, "learning_rate": 9.376117109543769e-06, "loss": 1.8825, "step": 508 }, { "epoch": 0.6892349356804333, "grad_norm": 0.9736281444679528, "learning_rate": 9.373505278158152e-06, "loss": 2.4243, "step": 509 }, { "epoch": 0.6905890318212593, "grad_norm": 0.9265991736695152, "learning_rate": 9.370888356434577e-06, "loss": 2.5581, "step": 510 }, { "epoch": 0.6919431279620853, "grad_norm": 0.9742740668751364, "learning_rate": 9.368266347418891e-06, "loss": 2.2436, "step": 511 }, { "epoch": 0.6932972241029113, "grad_norm": 0.903250624410521, "learning_rate": 9.365639254162855e-06, "loss": 2.3704, "step": 512 }, { "epoch": 0.6946513202437373, "grad_norm": 1.011884451867896, "learning_rate": 9.363007079724153e-06, "loss": 2.3554, "step": 513 }, { "epoch": 0.6960054163845633, "grad_norm": 2.1268078439425504, "learning_rate": 9.360369827166385e-06, "loss": 2.2531, "step": 514 }, { "epoch": 0.6973595125253893, "grad_norm": 0.9226076560386388, "learning_rate": 9.357727499559055e-06, "loss": 2.1536, "step": 515 }, { "epoch": 0.6987136086662153, "grad_norm": 0.9235717043788523, "learning_rate": 9.355080099977579e-06, "loss": 2.5795, "step": 516 }, { "epoch": 0.7000677048070413, "grad_norm": 0.997624756104421, "learning_rate": 9.352427631503274e-06, "loss": 2.4086, "step": 517 }, { "epoch": 0.7014218009478673, "grad_norm": 1.0325935042936567, "learning_rate": 9.349770097223356e-06, "loss": 2.1905, "step": 518 }, { "epoch": 0.7027758970886933, "grad_norm": 0.9779488073579523, "learning_rate": 9.347107500230941e-06, "loss": 2.1902, "step": 519 }, { "epoch": 0.7041299932295193, "grad_norm": 0.9071035111230149, "learning_rate": 9.344439843625034e-06, "loss": 2.4966, "step": 520 }, { "epoch": 0.7054840893703453, "grad_norm": 0.9294082146423078, "learning_rate": 9.341767130510529e-06, "loss": 2.1556, "step": 521 }, { "epoch": 0.7068381855111713, "grad_norm": 0.8010254757576656, "learning_rate": 9.339089363998206e-06, "loss": 2.2359, "step": 522 }, { "epoch": 0.7081922816519973, "grad_norm": 1.2492381753400523, "learning_rate": 9.336406547204726e-06, "loss": 2.0768, "step": 523 }, { "epoch": 0.7095463777928233, "grad_norm": 0.960396750578361, "learning_rate": 9.333718683252631e-06, "loss": 2.2373, "step": 524 }, { "epoch": 0.7109004739336493, "grad_norm": 0.9089783184513304, "learning_rate": 9.331025775270335e-06, "loss": 2.0008, "step": 525 }, { "epoch": 0.7122545700744752, "grad_norm": 1.0902851866790608, "learning_rate": 9.32832782639212e-06, "loss": 2.452, "step": 526 }, { "epoch": 0.7136086662153013, "grad_norm": 1.0819110431428438, "learning_rate": 9.325624839758142e-06, "loss": 2.1797, "step": 527 }, { "epoch": 0.7149627623561273, "grad_norm": 1.035728859841243, "learning_rate": 9.322916818514414e-06, "loss": 2.0016, "step": 528 }, { "epoch": 0.7163168584969533, "grad_norm": 0.8429686871803489, "learning_rate": 9.32020376581281e-06, "loss": 2.1271, "step": 529 }, { "epoch": 0.7176709546377793, "grad_norm": 0.948124830150651, "learning_rate": 9.317485684811065e-06, "loss": 2.463, "step": 530 }, { "epoch": 0.7190250507786052, "grad_norm": 1.0966777201484916, "learning_rate": 9.31476257867276e-06, "loss": 2.7078, "step": 531 }, { "epoch": 0.7203791469194313, "grad_norm": 0.9601018625399497, "learning_rate": 9.312034450567331e-06, "loss": 2.0091, "step": 532 }, { "epoch": 0.7217332430602573, "grad_norm": 1.0195440951721928, "learning_rate": 9.309301303670053e-06, "loss": 2.2595, "step": 533 }, { "epoch": 0.7230873392010833, "grad_norm": 1.0365088177530009, "learning_rate": 9.306563141162046e-06, "loss": 2.5509, "step": 534 }, { "epoch": 0.7244414353419093, "grad_norm": 0.89557097021405, "learning_rate": 9.303819966230265e-06, "loss": 2.184, "step": 535 }, { "epoch": 0.7257955314827352, "grad_norm": 1.1201396043943863, "learning_rate": 9.301071782067504e-06, "loss": 2.2719, "step": 536 }, { "epoch": 0.7271496276235613, "grad_norm": 1.0416986531898793, "learning_rate": 9.298318591872381e-06, "loss": 2.9307, "step": 537 }, { "epoch": 0.7285037237643873, "grad_norm": 0.8874369002318888, "learning_rate": 9.295560398849348e-06, "loss": 2.0185, "step": 538 }, { "epoch": 0.7298578199052133, "grad_norm": 1.3062496312023244, "learning_rate": 9.29279720620867e-06, "loss": 2.4988, "step": 539 }, { "epoch": 0.7312119160460393, "grad_norm": 0.8723686506256735, "learning_rate": 9.290029017166439e-06, "loss": 2.1358, "step": 540 }, { "epoch": 0.7325660121868652, "grad_norm": 0.9558509794048232, "learning_rate": 9.287255834944563e-06, "loss": 2.5911, "step": 541 }, { "epoch": 0.7339201083276913, "grad_norm": 0.9765924363071334, "learning_rate": 9.284477662770753e-06, "loss": 2.2083, "step": 542 }, { "epoch": 0.7352742044685172, "grad_norm": 0.9439430068504877, "learning_rate": 9.281694503878536e-06, "loss": 2.2064, "step": 543 }, { "epoch": 0.7366283006093433, "grad_norm": 1.1103666583629448, "learning_rate": 9.278906361507238e-06, "loss": 2.2118, "step": 544 }, { "epoch": 0.7379823967501693, "grad_norm": 1.015972947443489, "learning_rate": 9.276113238901992e-06, "loss": 2.3865, "step": 545 }, { "epoch": 0.7393364928909952, "grad_norm": 0.9787500482208973, "learning_rate": 9.273315139313719e-06, "loss": 2.0949, "step": 546 }, { "epoch": 0.7406905890318213, "grad_norm": 1.1247301478380771, "learning_rate": 9.270512065999139e-06, "loss": 1.9341, "step": 547 }, { "epoch": 0.7420446851726472, "grad_norm": 0.8630459962648358, "learning_rate": 9.267704022220758e-06, "loss": 2.1934, "step": 548 }, { "epoch": 0.7433987813134733, "grad_norm": 0.9456846452275267, "learning_rate": 9.264891011246867e-06, "loss": 2.1091, "step": 549 }, { "epoch": 0.7447528774542993, "grad_norm": 0.9901864770446673, "learning_rate": 9.26207303635154e-06, "loss": 2.0778, "step": 550 }, { "epoch": 0.7461069735951252, "grad_norm": 0.9659375715716257, "learning_rate": 9.25925010081463e-06, "loss": 2.2679, "step": 551 }, { "epoch": 0.7474610697359513, "grad_norm": 0.9764027265044928, "learning_rate": 9.256422207921757e-06, "loss": 2.7468, "step": 552 }, { "epoch": 0.7488151658767772, "grad_norm": 0.899162453540954, "learning_rate": 9.25358936096432e-06, "loss": 2.312, "step": 553 }, { "epoch": 0.7501692620176033, "grad_norm": 0.7884101704032132, "learning_rate": 9.250751563239473e-06, "loss": 1.8254, "step": 554 }, { "epoch": 0.7515233581584293, "grad_norm": 1.0492022449068705, "learning_rate": 9.247908818050146e-06, "loss": 2.6885, "step": 555 }, { "epoch": 0.7528774542992552, "grad_norm": 1.0133767522172044, "learning_rate": 9.245061128705017e-06, "loss": 2.2732, "step": 556 }, { "epoch": 0.7542315504400813, "grad_norm": 1.1740684334910383, "learning_rate": 9.242208498518522e-06, "loss": 2.4037, "step": 557 }, { "epoch": 0.7555856465809072, "grad_norm": 1.0034773672404191, "learning_rate": 9.239350930810843e-06, "loss": 2.6555, "step": 558 }, { "epoch": 0.7569397427217333, "grad_norm": 1.0559371991027349, "learning_rate": 9.236488428907919e-06, "loss": 1.8506, "step": 559 }, { "epoch": 0.7582938388625592, "grad_norm": 0.9422190702574703, "learning_rate": 9.233620996141421e-06, "loss": 2.393, "step": 560 }, { "epoch": 0.7596479350033852, "grad_norm": 1.701435588173234, "learning_rate": 9.230748635848768e-06, "loss": 2.4996, "step": 561 }, { "epoch": 0.7610020311442113, "grad_norm": 0.9848241894053293, "learning_rate": 9.227871351373108e-06, "loss": 2.2563, "step": 562 }, { "epoch": 0.7623561272850372, "grad_norm": 0.9551626201609839, "learning_rate": 9.224989146063322e-06, "loss": 2.3359, "step": 563 }, { "epoch": 0.7637102234258633, "grad_norm": 1.0501196818584786, "learning_rate": 9.22210202327402e-06, "loss": 2.3484, "step": 564 }, { "epoch": 0.7650643195666892, "grad_norm": 1.0118693595069355, "learning_rate": 9.219209986365533e-06, "loss": 2.0198, "step": 565 }, { "epoch": 0.7664184157075152, "grad_norm": 1.000596790716306, "learning_rate": 9.216313038703914e-06, "loss": 2.4147, "step": 566 }, { "epoch": 0.7677725118483413, "grad_norm": 1.0925850682201712, "learning_rate": 9.21341118366093e-06, "loss": 2.0607, "step": 567 }, { "epoch": 0.7691266079891672, "grad_norm": 0.8195614498636685, "learning_rate": 9.21050442461406e-06, "loss": 2.3161, "step": 568 }, { "epoch": 0.7704807041299933, "grad_norm": 1.4095142340461897, "learning_rate": 9.207592764946491e-06, "loss": 2.3281, "step": 569 }, { "epoch": 0.7718348002708192, "grad_norm": 1.0305488353359658, "learning_rate": 9.204676208047112e-06, "loss": 2.209, "step": 570 }, { "epoch": 0.7731888964116452, "grad_norm": 0.9457457073106245, "learning_rate": 9.201754757310518e-06, "loss": 1.9644, "step": 571 }, { "epoch": 0.7745429925524713, "grad_norm": 0.8798589503356994, "learning_rate": 9.198828416136991e-06, "loss": 2.5588, "step": 572 }, { "epoch": 0.7758970886932972, "grad_norm": 1.1021679000741063, "learning_rate": 9.195897187932513e-06, "loss": 2.2556, "step": 573 }, { "epoch": 0.7772511848341233, "grad_norm": 1.2881391770879362, "learning_rate": 9.192961076108748e-06, "loss": 2.2786, "step": 574 }, { "epoch": 0.7786052809749492, "grad_norm": 0.8079890380239451, "learning_rate": 9.190020084083048e-06, "loss": 1.9979, "step": 575 }, { "epoch": 0.7799593771157752, "grad_norm": 2.3299606691457706, "learning_rate": 9.187074215278444e-06, "loss": 2.3944, "step": 576 }, { "epoch": 0.7813134732566012, "grad_norm": 0.8875887654620506, "learning_rate": 9.184123473123643e-06, "loss": 3.159, "step": 577 }, { "epoch": 0.7826675693974272, "grad_norm": 1.2732756612102565, "learning_rate": 9.181167861053024e-06, "loss": 2.5586, "step": 578 }, { "epoch": 0.7840216655382533, "grad_norm": 1.1186141048074447, "learning_rate": 9.178207382506634e-06, "loss": 2.3656, "step": 579 }, { "epoch": 0.7853757616790792, "grad_norm": 1.0038802401880105, "learning_rate": 9.175242040930185e-06, "loss": 2.3621, "step": 580 }, { "epoch": 0.7867298578199052, "grad_norm": 1.0084194487112716, "learning_rate": 9.172271839775046e-06, "loss": 2.2325, "step": 581 }, { "epoch": 0.7880839539607312, "grad_norm": 0.8495420286871743, "learning_rate": 9.169296782498249e-06, "loss": 2.4249, "step": 582 }, { "epoch": 0.7894380501015572, "grad_norm": 1.0108960715157491, "learning_rate": 9.166316872562467e-06, "loss": 2.0571, "step": 583 }, { "epoch": 0.7907921462423833, "grad_norm": 0.7599417123654713, "learning_rate": 9.163332113436031e-06, "loss": 2.2401, "step": 584 }, { "epoch": 0.7921462423832092, "grad_norm": 0.9153720931911937, "learning_rate": 9.160342508592916e-06, "loss": 2.0296, "step": 585 }, { "epoch": 0.7935003385240352, "grad_norm": 1.0606631317988464, "learning_rate": 9.157348061512728e-06, "loss": 1.995, "step": 586 }, { "epoch": 0.7948544346648612, "grad_norm": 0.9696771933921041, "learning_rate": 9.154348775680714e-06, "loss": 2.4553, "step": 587 }, { "epoch": 0.7962085308056872, "grad_norm": 1.1359820431460632, "learning_rate": 9.151344654587758e-06, "loss": 2.6147, "step": 588 }, { "epoch": 0.7975626269465133, "grad_norm": 1.1620764175753133, "learning_rate": 9.148335701730363e-06, "loss": 2.1694, "step": 589 }, { "epoch": 0.7989167230873392, "grad_norm": 1.0906791903175577, "learning_rate": 9.145321920610662e-06, "loss": 2.0159, "step": 590 }, { "epoch": 0.8002708192281652, "grad_norm": 0.9262503779509312, "learning_rate": 9.142303314736405e-06, "loss": 1.8704, "step": 591 }, { "epoch": 0.8016249153689912, "grad_norm": 0.8897378574716819, "learning_rate": 9.139279887620955e-06, "loss": 2.2212, "step": 592 }, { "epoch": 0.8029790115098172, "grad_norm": 0.8785520958211444, "learning_rate": 9.136251642783294e-06, "loss": 2.2422, "step": 593 }, { "epoch": 0.8043331076506431, "grad_norm": 1.4708082564367666, "learning_rate": 9.133218583748002e-06, "loss": 2.4264, "step": 594 }, { "epoch": 0.8056872037914692, "grad_norm": 0.9127946818898369, "learning_rate": 9.130180714045271e-06, "loss": 1.985, "step": 595 }, { "epoch": 0.8070412999322952, "grad_norm": 1.0273656011649834, "learning_rate": 9.127138037210884e-06, "loss": 2.1752, "step": 596 }, { "epoch": 0.8083953960731212, "grad_norm": 1.0442611846851202, "learning_rate": 9.124090556786227e-06, "loss": 2.2373, "step": 597 }, { "epoch": 0.8097494922139472, "grad_norm": 1.0020569110815791, "learning_rate": 9.12103827631827e-06, "loss": 2.6107, "step": 598 }, { "epoch": 0.8111035883547731, "grad_norm": 1.0433877333186745, "learning_rate": 9.117981199359575e-06, "loss": 2.2095, "step": 599 }, { "epoch": 0.8124576844955992, "grad_norm": 0.9172575390929391, "learning_rate": 9.114919329468283e-06, "loss": 2.0521, "step": 600 }, { "epoch": 0.8138117806364252, "grad_norm": 1.0426202271128402, "learning_rate": 9.111852670208116e-06, "loss": 1.8817, "step": 601 }, { "epoch": 0.8151658767772512, "grad_norm": 1.1301209307772506, "learning_rate": 9.108781225148369e-06, "loss": 3.0088, "step": 602 }, { "epoch": 0.8165199729180772, "grad_norm": 0.9133026858806589, "learning_rate": 9.105704997863907e-06, "loss": 2.5472, "step": 603 }, { "epoch": 0.8178740690589031, "grad_norm": 1.1108413215075008, "learning_rate": 9.102623991935163e-06, "loss": 2.5752, "step": 604 }, { "epoch": 0.8192281651997292, "grad_norm": 0.9702100994035456, "learning_rate": 9.099538210948128e-06, "loss": 2.4404, "step": 605 }, { "epoch": 0.8205822613405552, "grad_norm": 0.9485690500451401, "learning_rate": 9.096447658494357e-06, "loss": 2.2257, "step": 606 }, { "epoch": 0.8219363574813812, "grad_norm": 26.292288625318744, "learning_rate": 9.09335233817095e-06, "loss": 2.676, "step": 607 }, { "epoch": 0.8232904536222072, "grad_norm": 0.9757213752439725, "learning_rate": 9.090252253580565e-06, "loss": 2.083, "step": 608 }, { "epoch": 0.8246445497630331, "grad_norm": 0.977754402583003, "learning_rate": 9.087147408331399e-06, "loss": 2.396, "step": 609 }, { "epoch": 0.8259986459038592, "grad_norm": 1.1224622683564631, "learning_rate": 9.084037806037194e-06, "loss": 2.5127, "step": 610 }, { "epoch": 0.8273527420446851, "grad_norm": 1.0702372697619176, "learning_rate": 9.080923450317226e-06, "loss": 2.44, "step": 611 }, { "epoch": 0.8287068381855112, "grad_norm": 0.9854940413106023, "learning_rate": 9.077804344796302e-06, "loss": 2.3179, "step": 612 }, { "epoch": 0.8300609343263372, "grad_norm": 1.0317420348048132, "learning_rate": 9.074680493104764e-06, "loss": 2.2255, "step": 613 }, { "epoch": 0.8314150304671631, "grad_norm": 1.2553199368667127, "learning_rate": 9.071551898878471e-06, "loss": 2.196, "step": 614 }, { "epoch": 0.8327691266079892, "grad_norm": 0.9428355967192141, "learning_rate": 9.068418565758805e-06, "loss": 2.4896, "step": 615 }, { "epoch": 0.8341232227488151, "grad_norm": 0.919605019517011, "learning_rate": 9.065280497392663e-06, "loss": 2.724, "step": 616 }, { "epoch": 0.8354773188896412, "grad_norm": 0.9006367820269072, "learning_rate": 9.062137697432457e-06, "loss": 2.3021, "step": 617 }, { "epoch": 0.8368314150304672, "grad_norm": 0.9254413019481625, "learning_rate": 9.058990169536098e-06, "loss": 2.8054, "step": 618 }, { "epoch": 0.8381855111712931, "grad_norm": 0.9723010342216781, "learning_rate": 9.055837917367006e-06, "loss": 2.1242, "step": 619 }, { "epoch": 0.8395396073121192, "grad_norm": 0.9646095993807202, "learning_rate": 9.052680944594101e-06, "loss": 2.7603, "step": 620 }, { "epoch": 0.8408937034529451, "grad_norm": 1.3152197929562175, "learning_rate": 9.049519254891793e-06, "loss": 2.1762, "step": 621 }, { "epoch": 0.8422477995937712, "grad_norm": 0.8097257743614227, "learning_rate": 9.046352851939981e-06, "loss": 2.4746, "step": 622 }, { "epoch": 0.8436018957345972, "grad_norm": 0.9993019751259021, "learning_rate": 9.043181739424054e-06, "loss": 1.8778, "step": 623 }, { "epoch": 0.8449559918754231, "grad_norm": 1.1394511401717762, "learning_rate": 9.040005921034884e-06, "loss": 2.2527, "step": 624 }, { "epoch": 0.8463100880162492, "grad_norm": 1.0167098829317125, "learning_rate": 9.036825400468814e-06, "loss": 2.4564, "step": 625 }, { "epoch": 0.8476641841570751, "grad_norm": 1.0799560174241793, "learning_rate": 9.033640181427661e-06, "loss": 2.1033, "step": 626 }, { "epoch": 0.8490182802979012, "grad_norm": 1.174120607908624, "learning_rate": 9.030450267618717e-06, "loss": 2.4757, "step": 627 }, { "epoch": 0.8503723764387271, "grad_norm": 1.050685050196375, "learning_rate": 9.02725566275473e-06, "loss": 2.238, "step": 628 }, { "epoch": 0.8517264725795531, "grad_norm": 0.9338038071047584, "learning_rate": 9.024056370553916e-06, "loss": 2.1313, "step": 629 }, { "epoch": 0.8530805687203792, "grad_norm": 1.156893096752152, "learning_rate": 9.020852394739936e-06, "loss": 2.2162, "step": 630 }, { "epoch": 0.8544346648612051, "grad_norm": 1.2811232350478499, "learning_rate": 9.017643739041914e-06, "loss": 2.9062, "step": 631 }, { "epoch": 0.8557887610020312, "grad_norm": 1.3380924542089319, "learning_rate": 9.014430407194413e-06, "loss": 2.4905, "step": 632 }, { "epoch": 0.8571428571428571, "grad_norm": 1.068909763409773, "learning_rate": 9.011212402937441e-06, "loss": 1.973, "step": 633 }, { "epoch": 0.8584969532836831, "grad_norm": 1.8932248755263266, "learning_rate": 9.007989730016446e-06, "loss": 2.4409, "step": 634 }, { "epoch": 0.8598510494245092, "grad_norm": 1.2891934792560118, "learning_rate": 9.004762392182307e-06, "loss": 2.3235, "step": 635 }, { "epoch": 0.8612051455653351, "grad_norm": 0.858242619324268, "learning_rate": 9.001530393191334e-06, "loss": 1.9865, "step": 636 }, { "epoch": 0.8625592417061612, "grad_norm": 1.1164311194466396, "learning_rate": 8.998293736805262e-06, "loss": 2.3765, "step": 637 }, { "epoch": 0.8639133378469871, "grad_norm": 1.2157891571025403, "learning_rate": 8.995052426791247e-06, "loss": 2.9348, "step": 638 }, { "epoch": 0.8652674339878131, "grad_norm": 1.0909914707099782, "learning_rate": 8.991806466921858e-06, "loss": 2.8517, "step": 639 }, { "epoch": 0.8666215301286392, "grad_norm": 0.9001967266754044, "learning_rate": 8.988555860975082e-06, "loss": 2.3269, "step": 640 }, { "epoch": 0.8679756262694651, "grad_norm": 1.0105961392957552, "learning_rate": 8.98530061273431e-06, "loss": 2.101, "step": 641 }, { "epoch": 0.8693297224102912, "grad_norm": 0.9677003303068539, "learning_rate": 8.982040725988337e-06, "loss": 2.2758, "step": 642 }, { "epoch": 0.8706838185511171, "grad_norm": 0.9813548892015744, "learning_rate": 8.978776204531354e-06, "loss": 2.33, "step": 643 }, { "epoch": 0.8720379146919431, "grad_norm": 1.0150274086140143, "learning_rate": 8.975507052162954e-06, "loss": 2.3066, "step": 644 }, { "epoch": 0.8733920108327691, "grad_norm": 1.0301765250355548, "learning_rate": 8.972233272688107e-06, "loss": 2.1587, "step": 645 }, { "epoch": 0.8747461069735951, "grad_norm": 1.1970855668426001, "learning_rate": 8.968954869917183e-06, "loss": 2.3845, "step": 646 }, { "epoch": 0.8761002031144212, "grad_norm": 1.1034962442803717, "learning_rate": 8.965671847665925e-06, "loss": 1.7817, "step": 647 }, { "epoch": 0.8774542992552471, "grad_norm": 0.9455749614353804, "learning_rate": 8.962384209755453e-06, "loss": 1.8517, "step": 648 }, { "epoch": 0.8788083953960731, "grad_norm": 0.9934755324282731, "learning_rate": 8.95909196001226e-06, "loss": 2.2541, "step": 649 }, { "epoch": 0.8801624915368991, "grad_norm": 1.0284578656567072, "learning_rate": 8.955795102268206e-06, "loss": 2.3968, "step": 650 }, { "epoch": 0.8815165876777251, "grad_norm": 1.037663714131099, "learning_rate": 8.952493640360518e-06, "loss": 2.6138, "step": 651 }, { "epoch": 0.8828706838185512, "grad_norm": 1.0926666615727456, "learning_rate": 8.949187578131777e-06, "loss": 1.9031, "step": 652 }, { "epoch": 0.8842247799593771, "grad_norm": 1.110544847466399, "learning_rate": 8.945876919429922e-06, "loss": 2.2067, "step": 653 }, { "epoch": 0.8855788761002031, "grad_norm": 1.0868074118477926, "learning_rate": 8.942561668108236e-06, "loss": 2.3532, "step": 654 }, { "epoch": 0.8869329722410291, "grad_norm": 1.1283691827694036, "learning_rate": 8.939241828025356e-06, "loss": 2.2511, "step": 655 }, { "epoch": 0.8882870683818551, "grad_norm": 1.9053437775194237, "learning_rate": 8.935917403045251e-06, "loss": 2.1626, "step": 656 }, { "epoch": 0.8896411645226812, "grad_norm": 1.2141944808281238, "learning_rate": 8.932588397037236e-06, "loss": 2.3409, "step": 657 }, { "epoch": 0.8909952606635071, "grad_norm": 1.0922208185194722, "learning_rate": 8.92925481387595e-06, "loss": 1.7841, "step": 658 }, { "epoch": 0.8923493568043331, "grad_norm": 1.5140227295690092, "learning_rate": 8.92591665744136e-06, "loss": 2.369, "step": 659 }, { "epoch": 0.8937034529451591, "grad_norm": 0.8526913110142397, "learning_rate": 8.922573931618763e-06, "loss": 2.0121, "step": 660 }, { "epoch": 0.8950575490859851, "grad_norm": 1.043736998424973, "learning_rate": 8.919226640298763e-06, "loss": 2.2168, "step": 661 }, { "epoch": 0.896411645226811, "grad_norm": 0.8649236144416748, "learning_rate": 8.915874787377289e-06, "loss": 1.9414, "step": 662 }, { "epoch": 0.8977657413676371, "grad_norm": 1.0840827090139065, "learning_rate": 8.912518376755572e-06, "loss": 3.087, "step": 663 }, { "epoch": 0.8991198375084631, "grad_norm": 1.1561825732631927, "learning_rate": 8.90915741234015e-06, "loss": 2.1006, "step": 664 }, { "epoch": 0.9004739336492891, "grad_norm": 0.9793291395752391, "learning_rate": 8.905791898042861e-06, "loss": 1.85, "step": 665 }, { "epoch": 0.9018280297901151, "grad_norm": 1.2185211503560396, "learning_rate": 8.902421837780839e-06, "loss": 2.7844, "step": 666 }, { "epoch": 0.903182125930941, "grad_norm": 1.0236959874012588, "learning_rate": 8.89904723547651e-06, "loss": 2.1372, "step": 667 }, { "epoch": 0.9045362220717671, "grad_norm": 1.0598101921330767, "learning_rate": 8.895668095057584e-06, "loss": 2.413, "step": 668 }, { "epoch": 0.9058903182125931, "grad_norm": 0.9973400634976877, "learning_rate": 8.892284420457054e-06, "loss": 2.1688, "step": 669 }, { "epoch": 0.9072444143534191, "grad_norm": 1.11854939708825, "learning_rate": 8.888896215613192e-06, "loss": 2.3154, "step": 670 }, { "epoch": 0.9085985104942451, "grad_norm": 1.0425526725686658, "learning_rate": 8.885503484469539e-06, "loss": 2.2115, "step": 671 }, { "epoch": 0.909952606635071, "grad_norm": 0.9579083361963218, "learning_rate": 8.88210623097491e-06, "loss": 2.0552, "step": 672 }, { "epoch": 0.9113067027758971, "grad_norm": 0.8237407553812768, "learning_rate": 8.878704459083376e-06, "loss": 2.8482, "step": 673 }, { "epoch": 0.9126607989167231, "grad_norm": 0.8200385146933777, "learning_rate": 8.875298172754274e-06, "loss": 2.1437, "step": 674 }, { "epoch": 0.9140148950575491, "grad_norm": 1.2074180626236117, "learning_rate": 8.871887375952192e-06, "loss": 2.0688, "step": 675 }, { "epoch": 0.9153689911983751, "grad_norm": 1.2749052909022498, "learning_rate": 8.868472072646968e-06, "loss": 2.4633, "step": 676 }, { "epoch": 0.916723087339201, "grad_norm": 1.219617129030152, "learning_rate": 8.865052266813686e-06, "loss": 2.3246, "step": 677 }, { "epoch": 0.9180771834800271, "grad_norm": 0.981406917677342, "learning_rate": 8.861627962432669e-06, "loss": 2.2168, "step": 678 }, { "epoch": 0.919431279620853, "grad_norm": 1.1941352888187047, "learning_rate": 8.858199163489476e-06, "loss": 2.0688, "step": 679 }, { "epoch": 0.9207853757616791, "grad_norm": 1.0302188684262572, "learning_rate": 8.854765873974898e-06, "loss": 2.6023, "step": 680 }, { "epoch": 0.9221394719025051, "grad_norm": 1.0315351443360778, "learning_rate": 8.851328097884956e-06, "loss": 2.4473, "step": 681 }, { "epoch": 0.923493568043331, "grad_norm": 0.9270825061496434, "learning_rate": 8.847885839220884e-06, "loss": 2.3396, "step": 682 }, { "epoch": 0.9248476641841571, "grad_norm": 1.5668051937578615, "learning_rate": 8.844439101989145e-06, "loss": 2.1618, "step": 683 }, { "epoch": 0.926201760324983, "grad_norm": 1.063281721298216, "learning_rate": 8.840987890201404e-06, "loss": 2.0825, "step": 684 }, { "epoch": 0.9275558564658091, "grad_norm": 5.49994743188365, "learning_rate": 8.837532207874539e-06, "loss": 2.3203, "step": 685 }, { "epoch": 0.9289099526066351, "grad_norm": 0.805384496899143, "learning_rate": 8.834072059030631e-06, "loss": 2.2733, "step": 686 }, { "epoch": 0.930264048747461, "grad_norm": 1.2279223754931932, "learning_rate": 8.83060744769696e-06, "loss": 2.519, "step": 687 }, { "epoch": 0.9316181448882871, "grad_norm": 0.9224586328427765, "learning_rate": 8.827138377905999e-06, "loss": 2.8326, "step": 688 }, { "epoch": 0.932972241029113, "grad_norm": 1.6089966176184487, "learning_rate": 8.823664853695408e-06, "loss": 1.6853, "step": 689 }, { "epoch": 0.9343263371699391, "grad_norm": 0.8870008681490367, "learning_rate": 8.820186879108038e-06, "loss": 2.4482, "step": 690 }, { "epoch": 0.9356804333107651, "grad_norm": 0.91606230626226, "learning_rate": 8.816704458191913e-06, "loss": 2.4619, "step": 691 }, { "epoch": 0.937034529451591, "grad_norm": 1.103378118700351, "learning_rate": 8.813217595000234e-06, "loss": 2.0875, "step": 692 }, { "epoch": 0.9383886255924171, "grad_norm": 1.1782658973751619, "learning_rate": 8.809726293591376e-06, "loss": 2.2522, "step": 693 }, { "epoch": 0.939742721733243, "grad_norm": 3.1082108827531383, "learning_rate": 8.806230558028874e-06, "loss": 2.4234, "step": 694 }, { "epoch": 0.9410968178740691, "grad_norm": 1.2771263022311399, "learning_rate": 8.80273039238143e-06, "loss": 2.2953, "step": 695 }, { "epoch": 0.942450914014895, "grad_norm": 1.0851177832663736, "learning_rate": 8.799225800722895e-06, "loss": 2.1023, "step": 696 }, { "epoch": 0.943805010155721, "grad_norm": 0.8982930726085163, "learning_rate": 8.795716787132278e-06, "loss": 2.261, "step": 697 }, { "epoch": 0.9451591062965471, "grad_norm": 0.9710609235588211, "learning_rate": 8.792203355693731e-06, "loss": 2.0593, "step": 698 }, { "epoch": 0.946513202437373, "grad_norm": 1.1589767335158037, "learning_rate": 8.788685510496549e-06, "loss": 2.2502, "step": 699 }, { "epoch": 0.9478672985781991, "grad_norm": 0.923602460813735, "learning_rate": 8.785163255635166e-06, "loss": 2.1052, "step": 700 }, { "epoch": 0.949221394719025, "grad_norm": 1.1331497754597915, "learning_rate": 8.781636595209145e-06, "loss": 2.2342, "step": 701 }, { "epoch": 0.950575490859851, "grad_norm": 1.0911649916589345, "learning_rate": 8.778105533323177e-06, "loss": 2.5234, "step": 702 }, { "epoch": 0.9519295870006771, "grad_norm": 0.7939604108123635, "learning_rate": 8.77457007408708e-06, "loss": 2.3947, "step": 703 }, { "epoch": 0.953283683141503, "grad_norm": 0.9952541607703701, "learning_rate": 8.771030221615786e-06, "loss": 2.4793, "step": 704 }, { "epoch": 0.9546377792823291, "grad_norm": 1.0107073531709874, "learning_rate": 8.767485980029342e-06, "loss": 2.0951, "step": 705 }, { "epoch": 0.955991875423155, "grad_norm": 1.0100357247296368, "learning_rate": 8.763937353452902e-06, "loss": 2.1011, "step": 706 }, { "epoch": 0.957345971563981, "grad_norm": 0.9804384454423593, "learning_rate": 8.760384346016726e-06, "loss": 1.7807, "step": 707 }, { "epoch": 0.9587000677048071, "grad_norm": 1.948525190001482, "learning_rate": 8.756826961856171e-06, "loss": 2.2514, "step": 708 }, { "epoch": 0.960054163845633, "grad_norm": 2.325650876165679, "learning_rate": 8.75326520511169e-06, "loss": 2.43, "step": 709 }, { "epoch": 0.9614082599864591, "grad_norm": 1.4411906872847189, "learning_rate": 8.74969907992882e-06, "loss": 2.5396, "step": 710 }, { "epoch": 0.962762356127285, "grad_norm": 1.1588565975941216, "learning_rate": 8.746128590458191e-06, "loss": 2.2853, "step": 711 }, { "epoch": 0.964116452268111, "grad_norm": 1.018506381733888, "learning_rate": 8.742553740855507e-06, "loss": 2.3352, "step": 712 }, { "epoch": 0.965470548408937, "grad_norm": 0.9885672968116678, "learning_rate": 8.738974535281545e-06, "loss": 2.3807, "step": 713 }, { "epoch": 0.966824644549763, "grad_norm": 0.9360638976400335, "learning_rate": 8.735390977902159e-06, "loss": 2.522, "step": 714 }, { "epoch": 0.9681787406905891, "grad_norm": 1.0138074790076992, "learning_rate": 8.731803072888262e-06, "loss": 2.0687, "step": 715 }, { "epoch": 0.969532836831415, "grad_norm": 0.8593670335614861, "learning_rate": 8.728210824415829e-06, "loss": 2.3171, "step": 716 }, { "epoch": 0.970886932972241, "grad_norm": 0.8782947340827698, "learning_rate": 8.724614236665889e-06, "loss": 2.1826, "step": 717 }, { "epoch": 0.972241029113067, "grad_norm": 1.041877641264968, "learning_rate": 8.721013313824527e-06, "loss": 2.5999, "step": 718 }, { "epoch": 0.973595125253893, "grad_norm": 1.0184265790630844, "learning_rate": 8.717408060082865e-06, "loss": 2.3649, "step": 719 }, { "epoch": 0.9749492213947191, "grad_norm": 0.8805171555606713, "learning_rate": 8.713798479637073e-06, "loss": 2.5529, "step": 720 }, { "epoch": 0.976303317535545, "grad_norm": 0.8250466080220131, "learning_rate": 8.710184576688353e-06, "loss": 2.1707, "step": 721 }, { "epoch": 0.977657413676371, "grad_norm": 0.9735738962932305, "learning_rate": 8.70656635544294e-06, "loss": 1.9474, "step": 722 }, { "epoch": 0.979011509817197, "grad_norm": 0.9574755177161404, "learning_rate": 8.702943820112094e-06, "loss": 2.0991, "step": 723 }, { "epoch": 0.980365605958023, "grad_norm": 1.1008036232420575, "learning_rate": 8.699316974912097e-06, "loss": 2.6044, "step": 724 }, { "epoch": 0.9817197020988491, "grad_norm": 1.0106221199017615, "learning_rate": 8.695685824064246e-06, "loss": 2.4405, "step": 725 }, { "epoch": 0.983073798239675, "grad_norm": 1.167558559334278, "learning_rate": 8.692050371794849e-06, "loss": 2.6635, "step": 726 }, { "epoch": 0.984427894380501, "grad_norm": 0.8542048524860696, "learning_rate": 8.688410622335222e-06, "loss": 2.5169, "step": 727 }, { "epoch": 0.985781990521327, "grad_norm": 1.0044160146409207, "learning_rate": 8.684766579921684e-06, "loss": 2.477, "step": 728 }, { "epoch": 0.987136086662153, "grad_norm": 0.9438656631341891, "learning_rate": 8.681118248795548e-06, "loss": 1.9811, "step": 729 }, { "epoch": 0.988490182802979, "grad_norm": 0.9095360753452296, "learning_rate": 8.677465633203117e-06, "loss": 2.3501, "step": 730 }, { "epoch": 0.989844278943805, "grad_norm": 1.5751129180562193, "learning_rate": 8.673808737395686e-06, "loss": 1.9711, "step": 731 }, { "epoch": 0.991198375084631, "grad_norm": 1.0244041655385774, "learning_rate": 8.670147565629526e-06, "loss": 2.1411, "step": 732 }, { "epoch": 0.992552471225457, "grad_norm": 1.0999564192877611, "learning_rate": 8.66648212216589e-06, "loss": 2.7566, "step": 733 }, { "epoch": 0.993906567366283, "grad_norm": 1.0783864346910368, "learning_rate": 8.662812411270997e-06, "loss": 2.2272, "step": 734 }, { "epoch": 0.995260663507109, "grad_norm": 1.011628347758097, "learning_rate": 8.65913843721604e-06, "loss": 2.5275, "step": 735 }, { "epoch": 0.996614759647935, "grad_norm": 1.2637326655889927, "learning_rate": 8.655460204277167e-06, "loss": 2.7871, "step": 736 }, { "epoch": 0.997968855788761, "grad_norm": 0.8940421539736993, "learning_rate": 8.651777716735488e-06, "loss": 2.8055, "step": 737 }, { "epoch": 0.999322951929587, "grad_norm": 0.8894453146981818, "learning_rate": 8.648090978877063e-06, "loss": 2.2385, "step": 738 } ], "logging_steps": 1, "max_steps": 2952, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 738, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9498603445747712e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }