{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987239472564866, "eval_steps": 500, "global_step": 587, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017014036580178648, "grad_norm": 6.9911926276762415, "learning_rate": 1.111111111111111e-06, "loss": 1.7618, "step": 1 }, { "epoch": 0.0034028073160357296, "grad_norm": 5.967485415913987, "learning_rate": 2.222222222222222e-06, "loss": 1.816, "step": 2 }, { "epoch": 0.005104210974053594, "grad_norm": 5.237633301735092, "learning_rate": 3.3333333333333333e-06, "loss": 1.7219, "step": 3 }, { "epoch": 0.006805614632071459, "grad_norm": 5.640571574498427, "learning_rate": 4.444444444444444e-06, "loss": 1.8444, "step": 4 }, { "epoch": 0.008507018290089324, "grad_norm": 4.459110210993377, "learning_rate": 5.555555555555557e-06, "loss": 1.705, "step": 5 }, { "epoch": 0.010208421948107189, "grad_norm": 4.47503295247886, "learning_rate": 6.666666666666667e-06, "loss": 1.5832, "step": 6 }, { "epoch": 0.011909825606125054, "grad_norm": 4.084281383937613, "learning_rate": 7.77777777777778e-06, "loss": 1.5393, "step": 7 }, { "epoch": 0.013611229264142918, "grad_norm": 3.669506717278801, "learning_rate": 8.888888888888888e-06, "loss": 1.4783, "step": 8 }, { "epoch": 0.015312632922160783, "grad_norm": 2.7312576917476763, "learning_rate": 1e-05, "loss": 1.3517, "step": 9 }, { "epoch": 0.017014036580178648, "grad_norm": 3.675807427395712, "learning_rate": 1.1111111111111113e-05, "loss": 1.3203, "step": 10 }, { "epoch": 0.01871544023819651, "grad_norm": 2.610967088571799, "learning_rate": 1.2222222222222224e-05, "loss": 1.184, "step": 11 }, { "epoch": 0.020416843896214378, "grad_norm": 3.3404049751198204, "learning_rate": 1.3333333333333333e-05, "loss": 1.2124, "step": 12 }, { "epoch": 0.02211824755423224, "grad_norm": 2.749767662527906, "learning_rate": 1.4444444444444446e-05, "loss": 1.0995, "step": 13 }, { "epoch": 0.023819651212250107, "grad_norm": 3.372563247688594, "learning_rate": 1.555555555555556e-05, "loss": 1.0566, "step": 14 }, { "epoch": 0.02552105487026797, "grad_norm": 3.6606428955549446, "learning_rate": 1.6666666666666667e-05, "loss": 0.9939, "step": 15 }, { "epoch": 0.027222458528285837, "grad_norm": 2.6528456388837065, "learning_rate": 1.7777777777777777e-05, "loss": 0.8771, "step": 16 }, { "epoch": 0.0289238621863037, "grad_norm": 3.4243275408942195, "learning_rate": 1.888888888888889e-05, "loss": 0.888, "step": 17 }, { "epoch": 0.030625265844321566, "grad_norm": 2.609060463016623, "learning_rate": 2e-05, "loss": 0.6589, "step": 18 }, { "epoch": 0.03232666950233943, "grad_norm": 2.9242170923508106, "learning_rate": 1.9999847579243196e-05, "loss": 0.7496, "step": 19 }, { "epoch": 0.034028073160357296, "grad_norm": 2.6492050387366466, "learning_rate": 1.9999390321619196e-05, "loss": 0.6022, "step": 20 }, { "epoch": 0.03572947681837516, "grad_norm": 2.550062716080491, "learning_rate": 1.9998628241067113e-05, "loss": 0.6182, "step": 21 }, { "epoch": 0.03743088047639302, "grad_norm": 2.1416608664408936, "learning_rate": 1.9997561360818322e-05, "loss": 0.5865, "step": 22 }, { "epoch": 0.03913228413441089, "grad_norm": 2.7322643613244613, "learning_rate": 1.999618971339577e-05, "loss": 0.5472, "step": 23 }, { "epoch": 0.040833687792428755, "grad_norm": 2.2213657292081224, "learning_rate": 1.9994513340612957e-05, "loss": 0.5155, "step": 24 }, { "epoch": 0.04253509145044662, "grad_norm": 2.303603778443028, "learning_rate": 1.9992532293572688e-05, "loss": 0.4958, "step": 25 }, { "epoch": 0.04423649510846448, "grad_norm": 1.7534676737667882, "learning_rate": 1.9990246632665503e-05, "loss": 0.3881, "step": 26 }, { "epoch": 0.04593789876648235, "grad_norm": 3.2948686963336242, "learning_rate": 1.998765642756783e-05, "loss": 0.4327, "step": 27 }, { "epoch": 0.047639302424500214, "grad_norm": 1.779238054183184, "learning_rate": 1.9984761757239878e-05, "loss": 0.3997, "step": 28 }, { "epoch": 0.04934070608251808, "grad_norm": 1.9200152427050028, "learning_rate": 1.998156270992321e-05, "loss": 0.3813, "step": 29 }, { "epoch": 0.05104210974053594, "grad_norm": 2.001461463300215, "learning_rate": 1.9978059383138073e-05, "loss": 0.4106, "step": 30 }, { "epoch": 0.05274351339855381, "grad_norm": 1.7813646875429028, "learning_rate": 1.997425188368041e-05, "loss": 0.3742, "step": 31 }, { "epoch": 0.05444491705657167, "grad_norm": 1.995594487847021, "learning_rate": 1.9970140327618612e-05, "loss": 0.3599, "step": 32 }, { "epoch": 0.05614632071458953, "grad_norm": 1.6098560851301231, "learning_rate": 1.9965724840289972e-05, "loss": 0.2282, "step": 33 }, { "epoch": 0.0578477243726074, "grad_norm": 1.5795395141948803, "learning_rate": 1.9961005556296875e-05, "loss": 0.3284, "step": 34 }, { "epoch": 0.059549128030625266, "grad_norm": 1.797868976276165, "learning_rate": 1.9955982619502693e-05, "loss": 0.288, "step": 35 }, { "epoch": 0.06125053168864313, "grad_norm": 1.9470932909182852, "learning_rate": 1.9950656183027392e-05, "loss": 0.3571, "step": 36 }, { "epoch": 0.06295193534666099, "grad_norm": 1.6744432746691509, "learning_rate": 1.994502640924286e-05, "loss": 0.3255, "step": 37 }, { "epoch": 0.06465333900467886, "grad_norm": 1.1682129416657814, "learning_rate": 1.993909346976798e-05, "loss": 0.2146, "step": 38 }, { "epoch": 0.06635474266269673, "grad_norm": 1.6768837338494562, "learning_rate": 1.993285754546338e-05, "loss": 0.2801, "step": 39 }, { "epoch": 0.06805614632071459, "grad_norm": 1.2215370489316386, "learning_rate": 1.9926318826425905e-05, "loss": 0.2286, "step": 40 }, { "epoch": 0.06975754997873246, "grad_norm": 1.3879380238369607, "learning_rate": 1.9919477511982873e-05, "loss": 0.2383, "step": 41 }, { "epoch": 0.07145895363675032, "grad_norm": 1.406525770078526, "learning_rate": 1.991233381068594e-05, "loss": 0.2455, "step": 42 }, { "epoch": 0.07316035729476818, "grad_norm": 1.5179467212387738, "learning_rate": 1.990488794030478e-05, "loss": 0.2933, "step": 43 }, { "epoch": 0.07486176095278604, "grad_norm": 1.6385645309602779, "learning_rate": 1.9897140127820432e-05, "loss": 0.2177, "step": 44 }, { "epoch": 0.07656316461080391, "grad_norm": 1.1617109957020149, "learning_rate": 1.9889090609418384e-05, "loss": 0.2346, "step": 45 }, { "epoch": 0.07826456826882178, "grad_norm": 1.2407685878736125, "learning_rate": 1.9880739630481376e-05, "loss": 0.206, "step": 46 }, { "epoch": 0.07996597192683964, "grad_norm": 1.385423536690397, "learning_rate": 1.9872087445581912e-05, "loss": 0.2126, "step": 47 }, { "epoch": 0.08166737558485751, "grad_norm": 1.1282002542462168, "learning_rate": 1.9863134318474504e-05, "loss": 0.1781, "step": 48 }, { "epoch": 0.08336877924287538, "grad_norm": 1.551014126299438, "learning_rate": 1.985388052208764e-05, "loss": 0.2023, "step": 49 }, { "epoch": 0.08507018290089324, "grad_norm": 1.3643327541760562, "learning_rate": 1.9844326338515444e-05, "loss": 0.205, "step": 50 }, { "epoch": 0.0867715865589111, "grad_norm": 1.3588992640234894, "learning_rate": 1.9834472059009097e-05, "loss": 0.1885, "step": 51 }, { "epoch": 0.08847299021692896, "grad_norm": 1.1226842944866124, "learning_rate": 1.982431798396794e-05, "loss": 0.1957, "step": 52 }, { "epoch": 0.09017439387494683, "grad_norm": 1.5828760965632824, "learning_rate": 1.9813864422930345e-05, "loss": 0.2829, "step": 53 }, { "epoch": 0.0918757975329647, "grad_norm": 1.205011521011545, "learning_rate": 1.9803111694564246e-05, "loss": 0.1703, "step": 54 }, { "epoch": 0.09357720119098256, "grad_norm": 1.2862224239348294, "learning_rate": 1.9792060126657437e-05, "loss": 0.1896, "step": 55 }, { "epoch": 0.09527860484900043, "grad_norm": 0.996017309514186, "learning_rate": 1.9780710056107587e-05, "loss": 0.1431, "step": 56 }, { "epoch": 0.0969800085070183, "grad_norm": 1.3079914833792219, "learning_rate": 1.976906182891197e-05, "loss": 0.1675, "step": 57 }, { "epoch": 0.09868141216503616, "grad_norm": 1.2961500263200691, "learning_rate": 1.97571158001569e-05, "loss": 0.2208, "step": 58 }, { "epoch": 0.10038281582305401, "grad_norm": 1.1298713738909105, "learning_rate": 1.9744872334006936e-05, "loss": 0.177, "step": 59 }, { "epoch": 0.10208421948107188, "grad_norm": 1.2798494623271113, "learning_rate": 1.973233180369374e-05, "loss": 0.1858, "step": 60 }, { "epoch": 0.10378562313908975, "grad_norm": 1.1639949502743814, "learning_rate": 1.9719494591504747e-05, "loss": 0.1374, "step": 61 }, { "epoch": 0.10548702679710761, "grad_norm": 1.0484473870712556, "learning_rate": 1.9706361088771474e-05, "loss": 0.1738, "step": 62 }, { "epoch": 0.10718843045512548, "grad_norm": 1.2241999313712146, "learning_rate": 1.96929316958576e-05, "loss": 0.1978, "step": 63 }, { "epoch": 0.10888983411314335, "grad_norm": 1.4772394862229892, "learning_rate": 1.9679206822146776e-05, "loss": 0.2373, "step": 64 }, { "epoch": 0.11059123777116121, "grad_norm": 1.097080265423446, "learning_rate": 1.9665186886030135e-05, "loss": 0.1733, "step": 65 }, { "epoch": 0.11229264142917907, "grad_norm": 0.9660234098323358, "learning_rate": 1.9650872314893523e-05, "loss": 0.1725, "step": 66 }, { "epoch": 0.11399404508719693, "grad_norm": 1.1576510656012562, "learning_rate": 1.9636263545104498e-05, "loss": 0.2095, "step": 67 }, { "epoch": 0.1156954487452148, "grad_norm": 1.4925958803001196, "learning_rate": 1.962136102199901e-05, "loss": 0.2475, "step": 68 }, { "epoch": 0.11739685240323267, "grad_norm": 1.2355340147204608, "learning_rate": 1.9606165199867822e-05, "loss": 0.147, "step": 69 }, { "epoch": 0.11909825606125053, "grad_norm": 1.3588414143397163, "learning_rate": 1.959067654194268e-05, "loss": 0.247, "step": 70 }, { "epoch": 0.1207996597192684, "grad_norm": 1.2247972500018036, "learning_rate": 1.9574895520382183e-05, "loss": 0.2284, "step": 71 }, { "epoch": 0.12250106337728627, "grad_norm": 1.3201500380185944, "learning_rate": 1.955882261625737e-05, "loss": 0.2222, "step": 72 }, { "epoch": 0.12420246703530413, "grad_norm": 1.0481960090401325, "learning_rate": 1.9542458319537094e-05, "loss": 0.1605, "step": 73 }, { "epoch": 0.12590387069332198, "grad_norm": 0.9723275202327377, "learning_rate": 1.9525803129073046e-05, "loss": 0.1214, "step": 74 }, { "epoch": 0.12760527435133986, "grad_norm": 1.1334387179775511, "learning_rate": 1.9508857552584574e-05, "loss": 0.1726, "step": 75 }, { "epoch": 0.12930667800935772, "grad_norm": 1.3165643507080502, "learning_rate": 1.9491622106643195e-05, "loss": 0.1892, "step": 76 }, { "epoch": 0.1310080816673756, "grad_norm": 1.054590261412754, "learning_rate": 1.9474097316656856e-05, "loss": 0.1585, "step": 77 }, { "epoch": 0.13270948532539345, "grad_norm": 1.376136178400177, "learning_rate": 1.9456283716853906e-05, "loss": 0.2075, "step": 78 }, { "epoch": 0.1344108889834113, "grad_norm": 1.224747493019771, "learning_rate": 1.9438181850266815e-05, "loss": 0.2071, "step": 79 }, { "epoch": 0.13611229264142918, "grad_norm": 1.1012108063521493, "learning_rate": 1.941979226871563e-05, "loss": 0.1641, "step": 80 }, { "epoch": 0.13781369629944704, "grad_norm": 1.0608643422778392, "learning_rate": 1.9401115532791134e-05, "loss": 0.1759, "step": 81 }, { "epoch": 0.13951509995746492, "grad_norm": 1.166727863718526, "learning_rate": 1.938215221183777e-05, "loss": 0.1873, "step": 82 }, { "epoch": 0.14121650361548277, "grad_norm": 1.2536986042781049, "learning_rate": 1.936290288393629e-05, "loss": 0.1715, "step": 83 }, { "epoch": 0.14291790727350065, "grad_norm": 1.099668804865605, "learning_rate": 1.9343368135886112e-05, "loss": 0.2042, "step": 84 }, { "epoch": 0.1446193109315185, "grad_norm": 0.9910009741184473, "learning_rate": 1.932354856318746e-05, "loss": 0.1656, "step": 85 }, { "epoch": 0.14632071458953635, "grad_norm": 1.7641820391925394, "learning_rate": 1.9303444770023184e-05, "loss": 0.2039, "step": 86 }, { "epoch": 0.14802211824755424, "grad_norm": 1.4259122950718577, "learning_rate": 1.9283057369240358e-05, "loss": 0.2408, "step": 87 }, { "epoch": 0.1497235219055721, "grad_norm": 1.2173305326266373, "learning_rate": 1.9262386982331596e-05, "loss": 0.1942, "step": 88 }, { "epoch": 0.15142492556358997, "grad_norm": 1.16090353646738, "learning_rate": 1.9241434239416093e-05, "loss": 0.1882, "step": 89 }, { "epoch": 0.15312632922160782, "grad_norm": 1.0574403120559566, "learning_rate": 1.922019977922045e-05, "loss": 0.1709, "step": 90 }, { "epoch": 0.1548277328796257, "grad_norm": 2.0338394122982755, "learning_rate": 1.919868424905915e-05, "loss": 0.1898, "step": 91 }, { "epoch": 0.15652913653764355, "grad_norm": 1.002950600937973, "learning_rate": 1.9176888304814882e-05, "loss": 0.128, "step": 92 }, { "epoch": 0.15823054019566143, "grad_norm": 1.4905981697484776, "learning_rate": 1.9154812610918503e-05, "loss": 0.2261, "step": 93 }, { "epoch": 0.1599319438536793, "grad_norm": 3.6410188347303554, "learning_rate": 1.913245784032881e-05, "loss": 0.1545, "step": 94 }, { "epoch": 0.16163334751169714, "grad_norm": 0.9791986885782344, "learning_rate": 1.9109824674512014e-05, "loss": 0.143, "step": 95 }, { "epoch": 0.16333475116971502, "grad_norm": 1.616911556183482, "learning_rate": 1.9086913803420966e-05, "loss": 0.2317, "step": 96 }, { "epoch": 0.16503615482773287, "grad_norm": 1.8630327951285053, "learning_rate": 1.906372592547413e-05, "loss": 0.2108, "step": 97 }, { "epoch": 0.16673755848575075, "grad_norm": 1.4145751439464074, "learning_rate": 1.9040261747534282e-05, "loss": 0.1645, "step": 98 }, { "epoch": 0.1684389621437686, "grad_norm": 1.6383401430353326, "learning_rate": 1.9016521984886984e-05, "loss": 0.1964, "step": 99 }, { "epoch": 0.1701403658017865, "grad_norm": 1.1305872234143883, "learning_rate": 1.8992507361218743e-05, "loss": 0.1719, "step": 100 }, { "epoch": 0.17184176945980434, "grad_norm": 0.9212110648235973, "learning_rate": 1.8968218608594987e-05, "loss": 0.1041, "step": 101 }, { "epoch": 0.1735431731178222, "grad_norm": 1.4867476390524645, "learning_rate": 1.8943656467437726e-05, "loss": 0.2201, "step": 102 }, { "epoch": 0.17524457677584007, "grad_norm": 1.232901326719632, "learning_rate": 1.8918821686502992e-05, "loss": 0.1844, "step": 103 }, { "epoch": 0.17694598043385792, "grad_norm": 0.9249397462332263, "learning_rate": 1.8893715022858e-05, "loss": 0.1266, "step": 104 }, { "epoch": 0.1786473840918758, "grad_norm": 1.10293476258483, "learning_rate": 1.886833724185809e-05, "loss": 0.1702, "step": 105 }, { "epoch": 0.18034878774989366, "grad_norm": 0.9437280287141773, "learning_rate": 1.8842689117123377e-05, "loss": 0.1091, "step": 106 }, { "epoch": 0.18205019140791154, "grad_norm": 1.1163247821615636, "learning_rate": 1.8816771430515178e-05, "loss": 0.1705, "step": 107 }, { "epoch": 0.1837515950659294, "grad_norm": 1.173349634021317, "learning_rate": 1.8790584972112174e-05, "loss": 0.1617, "step": 108 }, { "epoch": 0.18545299872394724, "grad_norm": 1.6711842060073634, "learning_rate": 1.876413054018633e-05, "loss": 0.273, "step": 109 }, { "epoch": 0.18715440238196512, "grad_norm": 1.241159770325543, "learning_rate": 1.873740894117854e-05, "loss": 0.1867, "step": 110 }, { "epoch": 0.18885580603998298, "grad_norm": 0.9298106529169693, "learning_rate": 1.8710420989674093e-05, "loss": 0.1442, "step": 111 }, { "epoch": 0.19055720969800086, "grad_norm": 0.8514950506084779, "learning_rate": 1.8683167508377775e-05, "loss": 0.1394, "step": 112 }, { "epoch": 0.1922586133560187, "grad_norm": 1.159669352047277, "learning_rate": 1.8655649328088836e-05, "loss": 0.1174, "step": 113 }, { "epoch": 0.1939600170140366, "grad_norm": 1.117625315370739, "learning_rate": 1.862786728767565e-05, "loss": 0.1603, "step": 114 }, { "epoch": 0.19566142067205444, "grad_norm": 1.3416593805973367, "learning_rate": 1.8599822234050143e-05, "loss": 0.1875, "step": 115 }, { "epoch": 0.19736282433007232, "grad_norm": 0.9921414446012113, "learning_rate": 1.8571515022141974e-05, "loss": 0.158, "step": 116 }, { "epoch": 0.19906422798809018, "grad_norm": 1.2663199688342266, "learning_rate": 1.8542946514872478e-05, "loss": 0.1639, "step": 117 }, { "epoch": 0.20076563164610803, "grad_norm": 1.502373927040382, "learning_rate": 1.851411758312835e-05, "loss": 0.1102, "step": 118 }, { "epoch": 0.2024670353041259, "grad_norm": 0.9112340285667759, "learning_rate": 1.8485029105735112e-05, "loss": 0.1373, "step": 119 }, { "epoch": 0.20416843896214376, "grad_norm": 0.9308775387978712, "learning_rate": 1.8455681969430307e-05, "loss": 0.1367, "step": 120 }, { "epoch": 0.20586984262016164, "grad_norm": 1.0498855289840079, "learning_rate": 1.8426077068836487e-05, "loss": 0.1484, "step": 121 }, { "epoch": 0.2075712462781795, "grad_norm": 0.7911114454233587, "learning_rate": 1.839621530643392e-05, "loss": 0.117, "step": 122 }, { "epoch": 0.20927264993619737, "grad_norm": 1.0125626321631322, "learning_rate": 1.8366097592533095e-05, "loss": 0.1112, "step": 123 }, { "epoch": 0.21097405359421523, "grad_norm": 1.2070151224720589, "learning_rate": 1.8335724845246948e-05, "loss": 0.2101, "step": 124 }, { "epoch": 0.21267545725223308, "grad_norm": 0.8912405247679976, "learning_rate": 1.830509799046292e-05, "loss": 0.1536, "step": 125 }, { "epoch": 0.21437686091025096, "grad_norm": 0.943905994134866, "learning_rate": 1.8274217961814682e-05, "loss": 0.1234, "step": 126 }, { "epoch": 0.2160782645682688, "grad_norm": 1.0629115609465312, "learning_rate": 1.8243085700653698e-05, "loss": 0.164, "step": 127 }, { "epoch": 0.2177796682262867, "grad_norm": 0.9167444126586589, "learning_rate": 1.821170215602053e-05, "loss": 0.1366, "step": 128 }, { "epoch": 0.21948107188430455, "grad_norm": 1.3561117487616148, "learning_rate": 1.818006828461591e-05, "loss": 0.1491, "step": 129 }, { "epoch": 0.22118247554232243, "grad_norm": 1.0924892277382052, "learning_rate": 1.8148185050771554e-05, "loss": 0.1609, "step": 130 }, { "epoch": 0.22288387920034028, "grad_norm": 0.8222865380048915, "learning_rate": 1.8116053426420793e-05, "loss": 0.1542, "step": 131 }, { "epoch": 0.22458528285835813, "grad_norm": 0.9936845495660046, "learning_rate": 1.8083674391068925e-05, "loss": 0.1603, "step": 132 }, { "epoch": 0.226286686516376, "grad_norm": 0.9125563797717536, "learning_rate": 1.8051048931763366e-05, "loss": 0.1351, "step": 133 }, { "epoch": 0.22798809017439386, "grad_norm": 0.8848502502479684, "learning_rate": 1.8018178043063554e-05, "loss": 0.139, "step": 134 }, { "epoch": 0.22968949383241175, "grad_norm": 0.8829591378486032, "learning_rate": 1.798506272701064e-05, "loss": 0.1071, "step": 135 }, { "epoch": 0.2313908974904296, "grad_norm": 1.2518522764811424, "learning_rate": 1.795170399309692e-05, "loss": 0.1845, "step": 136 }, { "epoch": 0.23309230114844748, "grad_norm": 1.1836157164291814, "learning_rate": 1.7918102858235103e-05, "loss": 0.1843, "step": 137 }, { "epoch": 0.23479370480646533, "grad_norm": 1.0028146137981997, "learning_rate": 1.7884260346727257e-05, "loss": 0.1523, "step": 138 }, { "epoch": 0.2364951084644832, "grad_norm": 0.9312803934338035, "learning_rate": 1.7850177490233635e-05, "loss": 0.1505, "step": 139 }, { "epoch": 0.23819651212250106, "grad_norm": 1.4062348205690975, "learning_rate": 1.7815855327741185e-05, "loss": 0.184, "step": 140 }, { "epoch": 0.23989791578051892, "grad_norm": 1.202402031733493, "learning_rate": 1.7781294905531908e-05, "loss": 0.1635, "step": 141 }, { "epoch": 0.2415993194385368, "grad_norm": 1.0855843964177576, "learning_rate": 1.774649727715094e-05, "loss": 0.1504, "step": 142 }, { "epoch": 0.24330072309655465, "grad_norm": 0.776506687452474, "learning_rate": 1.7711463503374466e-05, "loss": 0.0911, "step": 143 }, { "epoch": 0.24500212675457253, "grad_norm": 0.9545701276843519, "learning_rate": 1.7676194652177333e-05, "loss": 0.1245, "step": 144 }, { "epoch": 0.24670353041259038, "grad_norm": 1.014164593806905, "learning_rate": 1.764069179870055e-05, "loss": 0.181, "step": 145 }, { "epoch": 0.24840493407060826, "grad_norm": 0.7754970071101359, "learning_rate": 1.760495602521847e-05, "loss": 0.114, "step": 146 }, { "epoch": 0.2501063377286261, "grad_norm": 1.1564504154977566, "learning_rate": 1.756898842110582e-05, "loss": 0.209, "step": 147 }, { "epoch": 0.25180774138664397, "grad_norm": 1.1287141860391865, "learning_rate": 1.753279008280449e-05, "loss": 0.1298, "step": 148 }, { "epoch": 0.2535091450446618, "grad_norm": 0.9691449559553242, "learning_rate": 1.74963621137901e-05, "loss": 0.1258, "step": 149 }, { "epoch": 0.25521054870267973, "grad_norm": 1.035658869069728, "learning_rate": 1.7459705624538383e-05, "loss": 0.1419, "step": 150 }, { "epoch": 0.2569119523606976, "grad_norm": 1.0753488541101328, "learning_rate": 1.7422821732491297e-05, "loss": 0.135, "step": 151 }, { "epoch": 0.25861335601871543, "grad_norm": 0.8229354151040922, "learning_rate": 1.7385711562022988e-05, "loss": 0.1164, "step": 152 }, { "epoch": 0.2603147596767333, "grad_norm": 1.016446132319777, "learning_rate": 1.734837624440551e-05, "loss": 0.1661, "step": 153 }, { "epoch": 0.2620161633347512, "grad_norm": 0.9558420891892299, "learning_rate": 1.731081691777434e-05, "loss": 0.1197, "step": 154 }, { "epoch": 0.26371756699276905, "grad_norm": 0.9986826502725739, "learning_rate": 1.7273034727093677e-05, "loss": 0.1654, "step": 155 }, { "epoch": 0.2654189706507869, "grad_norm": 1.2437727283048936, "learning_rate": 1.7235030824121542e-05, "loss": 0.1472, "step": 156 }, { "epoch": 0.26712037430880475, "grad_norm": 0.9201621015876281, "learning_rate": 1.7196806367374656e-05, "loss": 0.1427, "step": 157 }, { "epoch": 0.2688217779668226, "grad_norm": 0.8981299082841511, "learning_rate": 1.7158362522093153e-05, "loss": 0.1356, "step": 158 }, { "epoch": 0.2705231816248405, "grad_norm": 0.8438284299933941, "learning_rate": 1.7119700460205026e-05, "loss": 0.1537, "step": 159 }, { "epoch": 0.27222458528285837, "grad_norm": 0.9992590548158468, "learning_rate": 1.7080821360290426e-05, "loss": 0.1149, "step": 160 }, { "epoch": 0.2739259889408762, "grad_norm": 1.0293167471941735, "learning_rate": 1.7041726407545716e-05, "loss": 0.1018, "step": 161 }, { "epoch": 0.27562739259889407, "grad_norm": 1.2090694788101504, "learning_rate": 1.7002416793747354e-05, "loss": 0.1409, "step": 162 }, { "epoch": 0.2773287962569119, "grad_norm": 1.0236086029383877, "learning_rate": 1.696289371721556e-05, "loss": 0.1702, "step": 163 }, { "epoch": 0.27903019991492983, "grad_norm": 1.3417300786636221, "learning_rate": 1.692315838277778e-05, "loss": 0.2483, "step": 164 }, { "epoch": 0.2807316035729477, "grad_norm": 1.0989869821812306, "learning_rate": 1.6883212001731956e-05, "loss": 0.1782, "step": 165 }, { "epoch": 0.28243300723096554, "grad_norm": 0.529248875869627, "learning_rate": 1.6843055791809623e-05, "loss": 0.0617, "step": 166 }, { "epoch": 0.2841344108889834, "grad_norm": 0.912576853390358, "learning_rate": 1.680269097713876e-05, "loss": 0.1415, "step": 167 }, { "epoch": 0.2858358145470013, "grad_norm": 0.9909341183986241, "learning_rate": 1.6762118788206488e-05, "loss": 0.1234, "step": 168 }, { "epoch": 0.28753721820501915, "grad_norm": 1.2344432350943482, "learning_rate": 1.6721340461821555e-05, "loss": 0.1964, "step": 169 }, { "epoch": 0.289238621863037, "grad_norm": 1.3663387049811897, "learning_rate": 1.6680357241076632e-05, "loss": 0.1883, "step": 170 }, { "epoch": 0.29094002552105486, "grad_norm": 1.059631703359694, "learning_rate": 1.6639170375310422e-05, "loss": 0.1116, "step": 171 }, { "epoch": 0.2926414291790727, "grad_norm": 1.0508255410491976, "learning_rate": 1.6597781120069584e-05, "loss": 0.1662, "step": 172 }, { "epoch": 0.2943428328370906, "grad_norm": 1.0537186165554597, "learning_rate": 1.655619073707043e-05, "loss": 0.1497, "step": 173 }, { "epoch": 0.29604423649510847, "grad_norm": 0.7500117116797194, "learning_rate": 1.6514400494160498e-05, "loss": 0.1014, "step": 174 }, { "epoch": 0.2977456401531263, "grad_norm": 0.9365762295606946, "learning_rate": 1.6472411665279872e-05, "loss": 0.1692, "step": 175 }, { "epoch": 0.2994470438111442, "grad_norm": 0.9936157472994437, "learning_rate": 1.643022553042237e-05, "loss": 0.1329, "step": 176 }, { "epoch": 0.3011484474691621, "grad_norm": 0.9502764427701426, "learning_rate": 1.6387843375596513e-05, "loss": 0.1497, "step": 177 }, { "epoch": 0.30284985112717994, "grad_norm": 0.9034816217862629, "learning_rate": 1.634526649278632e-05, "loss": 0.1341, "step": 178 }, { "epoch": 0.3045512547851978, "grad_norm": 1.2235420334078282, "learning_rate": 1.630249617991194e-05, "loss": 0.1691, "step": 179 }, { "epoch": 0.30625265844321564, "grad_norm": 1.070819167390794, "learning_rate": 1.6259533740790055e-05, "loss": 0.1893, "step": 180 }, { "epoch": 0.3079540621012335, "grad_norm": 1.0149277093510711, "learning_rate": 1.6216380485094164e-05, "loss": 0.1581, "step": 181 }, { "epoch": 0.3096554657592514, "grad_norm": 1.2727184403130332, "learning_rate": 1.617303772831465e-05, "loss": 0.1918, "step": 182 }, { "epoch": 0.31135686941726926, "grad_norm": 1.0175891234730905, "learning_rate": 1.6129506791718665e-05, "loss": 0.1503, "step": 183 }, { "epoch": 0.3130582730752871, "grad_norm": 0.8269197562005047, "learning_rate": 1.6085789002309873e-05, "loss": 0.1324, "step": 184 }, { "epoch": 0.31475967673330496, "grad_norm": 1.1533751660178018, "learning_rate": 1.6041885692787985e-05, "loss": 0.1574, "step": 185 }, { "epoch": 0.31646108039132287, "grad_norm": 0.8735591976051513, "learning_rate": 1.599779820150813e-05, "loss": 0.1139, "step": 186 }, { "epoch": 0.3181624840493407, "grad_norm": 6.2499538759448034, "learning_rate": 1.5953527872440063e-05, "loss": 0.2183, "step": 187 }, { "epoch": 0.3198638877073586, "grad_norm": 0.904041081802588, "learning_rate": 1.5909076055127202e-05, "loss": 0.1619, "step": 188 }, { "epoch": 0.3215652913653764, "grad_norm": 1.4719450867461397, "learning_rate": 1.5864444104645473e-05, "loss": 0.1781, "step": 189 }, { "epoch": 0.3232666950233943, "grad_norm": 1.2964548761824104, "learning_rate": 1.581963338156201e-05, "loss": 0.2205, "step": 190 }, { "epoch": 0.3249680986814122, "grad_norm": 0.9295125169199553, "learning_rate": 1.5774645251893673e-05, "loss": 0.1244, "step": 191 }, { "epoch": 0.32666950233943004, "grad_norm": 1.0204273767596037, "learning_rate": 1.5729481087065423e-05, "loss": 0.1404, "step": 192 }, { "epoch": 0.3283709059974479, "grad_norm": 0.8337505371214884, "learning_rate": 1.5684142263868493e-05, "loss": 0.1473, "step": 193 }, { "epoch": 0.33007230965546575, "grad_norm": 1.0928052339478989, "learning_rate": 1.5638630164418435e-05, "loss": 0.137, "step": 194 }, { "epoch": 0.3317737133134836, "grad_norm": 1.6830022787415706, "learning_rate": 1.5592946176112973e-05, "loss": 0.1891, "step": 195 }, { "epoch": 0.3334751169715015, "grad_norm": 1.0495686844810206, "learning_rate": 1.554709169158972e-05, "loss": 0.184, "step": 196 }, { "epoch": 0.33517652062951936, "grad_norm": 1.011837190532919, "learning_rate": 1.550106810868373e-05, "loss": 0.1265, "step": 197 }, { "epoch": 0.3368779242875372, "grad_norm": 1.1498721398868388, "learning_rate": 1.5454876830384868e-05, "loss": 0.1531, "step": 198 }, { "epoch": 0.33857932794555506, "grad_norm": 1.2864468646292577, "learning_rate": 1.540851926479505e-05, "loss": 0.1889, "step": 199 }, { "epoch": 0.340280731603573, "grad_norm": 1.0553252849977297, "learning_rate": 1.536199682508533e-05, "loss": 0.1345, "step": 200 }, { "epoch": 0.3419821352615908, "grad_norm": 0.889196583250441, "learning_rate": 1.531531092945279e-05, "loss": 0.1102, "step": 201 }, { "epoch": 0.3436835389196087, "grad_norm": 1.1994000234252027, "learning_rate": 1.526846300107734e-05, "loss": 0.2369, "step": 202 }, { "epoch": 0.34538494257762653, "grad_norm": 0.9518805403077071, "learning_rate": 1.5221454468078336e-05, "loss": 0.1712, "step": 203 }, { "epoch": 0.3470863462356444, "grad_norm": 1.2937076434972201, "learning_rate": 1.5174286763470995e-05, "loss": 0.1848, "step": 204 }, { "epoch": 0.3487877498936623, "grad_norm": 0.7845825707725936, "learning_rate": 1.5126961325122773e-05, "loss": 0.1097, "step": 205 }, { "epoch": 0.35048915355168014, "grad_norm": 1.0471712678416134, "learning_rate": 1.5079479595709493e-05, "loss": 0.2015, "step": 206 }, { "epoch": 0.352190557209698, "grad_norm": 0.6389298881241116, "learning_rate": 1.5031843022671377e-05, "loss": 0.0863, "step": 207 }, { "epoch": 0.35389196086771585, "grad_norm": 0.9730117762878676, "learning_rate": 1.4984053058168936e-05, "loss": 0.1233, "step": 208 }, { "epoch": 0.35559336452573376, "grad_norm": 1.0321283116382443, "learning_rate": 1.4936111159038677e-05, "loss": 0.1655, "step": 209 }, { "epoch": 0.3572947681837516, "grad_norm": 1.1161778201265327, "learning_rate": 1.4888018786748713e-05, "loss": 0.18, "step": 210 }, { "epoch": 0.35899617184176946, "grad_norm": 1.240719081282556, "learning_rate": 1.4839777407354194e-05, "loss": 0.2124, "step": 211 }, { "epoch": 0.3606975754997873, "grad_norm": 0.9976402494534861, "learning_rate": 1.4791388491452637e-05, "loss": 0.1894, "step": 212 }, { "epoch": 0.36239897915780517, "grad_norm": 1.1346780693826046, "learning_rate": 1.4742853514139076e-05, "loss": 0.181, "step": 213 }, { "epoch": 0.3641003828158231, "grad_norm": 1.082220133582736, "learning_rate": 1.4694173954961105e-05, "loss": 0.1781, "step": 214 }, { "epoch": 0.36580178647384093, "grad_norm": 0.9799894414710469, "learning_rate": 1.4645351297873774e-05, "loss": 0.1446, "step": 215 }, { "epoch": 0.3675031901318588, "grad_norm": 0.9322926757789775, "learning_rate": 1.4596387031194354e-05, "loss": 0.1458, "step": 216 }, { "epoch": 0.36920459378987663, "grad_norm": 0.9669419010934909, "learning_rate": 1.4547282647556964e-05, "loss": 0.1854, "step": 217 }, { "epoch": 0.3709059974478945, "grad_norm": 0.8073117793398128, "learning_rate": 1.449803964386706e-05, "loss": 0.1436, "step": 218 }, { "epoch": 0.3726074011059124, "grad_norm": 0.7928236938106081, "learning_rate": 1.4448659521255823e-05, "loss": 0.134, "step": 219 }, { "epoch": 0.37430880476393025, "grad_norm": 1.1023563168423864, "learning_rate": 1.4399143785034388e-05, "loss": 0.1806, "step": 220 }, { "epoch": 0.3760102084219481, "grad_norm": 0.8661885088173654, "learning_rate": 1.4349493944647953e-05, "loss": 0.1252, "step": 221 }, { "epoch": 0.37771161207996595, "grad_norm": 1.297973502891389, "learning_rate": 1.4299711513629759e-05, "loss": 0.1964, "step": 222 }, { "epoch": 0.37941301573798386, "grad_norm": 1.1335119776385323, "learning_rate": 1.4249798009554979e-05, "loss": 0.1982, "step": 223 }, { "epoch": 0.3811144193960017, "grad_norm": 1.1726247095139843, "learning_rate": 1.419975495399442e-05, "loss": 0.199, "step": 224 }, { "epoch": 0.38281582305401957, "grad_norm": 0.8891933996157408, "learning_rate": 1.4149583872468165e-05, "loss": 0.1292, "step": 225 }, { "epoch": 0.3845172267120374, "grad_norm": 0.7603931038731861, "learning_rate": 1.4099286294399051e-05, "loss": 0.0952, "step": 226 }, { "epoch": 0.38621863037005527, "grad_norm": 1.1672152863793608, "learning_rate": 1.404886375306607e-05, "loss": 0.1611, "step": 227 }, { "epoch": 0.3879200340280732, "grad_norm": 0.8620430776020629, "learning_rate": 1.3998317785557597e-05, "loss": 0.1348, "step": 228 }, { "epoch": 0.38962143768609103, "grad_norm": 0.9552569121402584, "learning_rate": 1.3947649932724563e-05, "loss": 0.1726, "step": 229 }, { "epoch": 0.3913228413441089, "grad_norm": 0.6526188073680776, "learning_rate": 1.3896861739133456e-05, "loss": 0.0828, "step": 230 }, { "epoch": 0.39302424500212674, "grad_norm": 1.1261254351408145, "learning_rate": 1.384595475301926e-05, "loss": 0.1666, "step": 231 }, { "epoch": 0.39472564866014465, "grad_norm": 1.0213490251792958, "learning_rate": 1.3794930526238246e-05, "loss": 0.1238, "step": 232 }, { "epoch": 0.3964270523181625, "grad_norm": 1.0422843362362886, "learning_rate": 1.3743790614220664e-05, "loss": 0.1542, "step": 233 }, { "epoch": 0.39812845597618035, "grad_norm": 1.1712283445792941, "learning_rate": 1.3692536575923334e-05, "loss": 0.1619, "step": 234 }, { "epoch": 0.3998298596341982, "grad_norm": 0.9722112293600863, "learning_rate": 1.3641169973782117e-05, "loss": 0.1487, "step": 235 }, { "epoch": 0.40153126329221606, "grad_norm": 0.9748390707529867, "learning_rate": 1.3589692373664288e-05, "loss": 0.1281, "step": 236 }, { "epoch": 0.40323266695023396, "grad_norm": 1.5657080809883168, "learning_rate": 1.3538105344820798e-05, "loss": 0.1728, "step": 237 }, { "epoch": 0.4049340706082518, "grad_norm": 0.8408747728353582, "learning_rate": 1.3486410459838448e-05, "loss": 0.1316, "step": 238 }, { "epoch": 0.40663547426626967, "grad_norm": 0.9029876737747633, "learning_rate": 1.343460929459193e-05, "loss": 0.157, "step": 239 }, { "epoch": 0.4083368779242875, "grad_norm": 1.1908098440690353, "learning_rate": 1.3382703428195812e-05, "loss": 0.2164, "step": 240 }, { "epoch": 0.4100382815823054, "grad_norm": 0.7354672073569528, "learning_rate": 1.3330694442956376e-05, "loss": 0.1052, "step": 241 }, { "epoch": 0.4117396852403233, "grad_norm": 0.7769967136278536, "learning_rate": 1.3278583924323405e-05, "loss": 0.1007, "step": 242 }, { "epoch": 0.41344108889834114, "grad_norm": 0.7667344440078264, "learning_rate": 1.3226373460841835e-05, "loss": 0.1383, "step": 243 }, { "epoch": 0.415142492556359, "grad_norm": 0.795735262361974, "learning_rate": 1.3174064644103334e-05, "loss": 0.1151, "step": 244 }, { "epoch": 0.41684389621437684, "grad_norm": 0.6474341608732114, "learning_rate": 1.3121659068697797e-05, "loss": 0.0632, "step": 245 }, { "epoch": 0.41854529987239475, "grad_norm": 1.2790933846234822, "learning_rate": 1.306915833216471e-05, "loss": 0.1294, "step": 246 }, { "epoch": 0.4202467035304126, "grad_norm": 1.0004086926301767, "learning_rate": 1.3016564034944473e-05, "loss": 0.1272, "step": 247 }, { "epoch": 0.42194810718843045, "grad_norm": 1.2932792230934473, "learning_rate": 1.29638777803296e-05, "loss": 0.201, "step": 248 }, { "epoch": 0.4236495108464483, "grad_norm": 0.8319761692099532, "learning_rate": 1.2911101174415861e-05, "loss": 0.1297, "step": 249 }, { "epoch": 0.42535091450446616, "grad_norm": 0.9446815368539271, "learning_rate": 1.2858235826053294e-05, "loss": 0.1336, "step": 250 }, { "epoch": 0.42705231816248407, "grad_norm": 1.016975563258099, "learning_rate": 1.2805283346797179e-05, "loss": 0.1411, "step": 251 }, { "epoch": 0.4287537218205019, "grad_norm": 0.732577169895616, "learning_rate": 1.2752245350858905e-05, "loss": 0.1278, "step": 252 }, { "epoch": 0.4304551254785198, "grad_norm": 0.6594182477673913, "learning_rate": 1.2699123455056777e-05, "loss": 0.0643, "step": 253 }, { "epoch": 0.4321565291365376, "grad_norm": 1.3288334602081366, "learning_rate": 1.26459192787667e-05, "loss": 0.1937, "step": 254 }, { "epoch": 0.43385793279455553, "grad_norm": 0.9662932023952568, "learning_rate": 1.2592634443872842e-05, "loss": 0.1635, "step": 255 }, { "epoch": 0.4355593364525734, "grad_norm": 1.0490078641576344, "learning_rate": 1.2539270574718172e-05, "loss": 0.1725, "step": 256 }, { "epoch": 0.43726074011059124, "grad_norm": 1.024114729528168, "learning_rate": 1.2485829298054952e-05, "loss": 0.1616, "step": 257 }, { "epoch": 0.4389621437686091, "grad_norm": 1.032340498139421, "learning_rate": 1.2432312242995158e-05, "loss": 0.2006, "step": 258 }, { "epoch": 0.44066354742662694, "grad_norm": 0.9662483951159501, "learning_rate": 1.2378721040960788e-05, "loss": 0.2017, "step": 259 }, { "epoch": 0.44236495108464485, "grad_norm": 1.7452442995933304, "learning_rate": 1.232505732563416e-05, "loss": 0.2189, "step": 260 }, { "epoch": 0.4440663547426627, "grad_norm": 1.4701622612115262, "learning_rate": 1.2271322732908091e-05, "loss": 0.1274, "step": 261 }, { "epoch": 0.44576775840068056, "grad_norm": 1.3198760014504118, "learning_rate": 1.2217518900836045e-05, "loss": 0.1733, "step": 262 }, { "epoch": 0.4474691620586984, "grad_norm": 0.7648660349856883, "learning_rate": 1.2163647469582181e-05, "loss": 0.0914, "step": 263 }, { "epoch": 0.44917056571671626, "grad_norm": 1.007435841972258, "learning_rate": 1.210971008137136e-05, "loss": 0.1805, "step": 264 }, { "epoch": 0.45087196937473417, "grad_norm": 1.215388205761567, "learning_rate": 1.2055708380439089e-05, "loss": 0.2073, "step": 265 }, { "epoch": 0.452573373032752, "grad_norm": 1.1544616644141075, "learning_rate": 1.2001644012981392e-05, "loss": 0.1398, "step": 266 }, { "epoch": 0.4542747766907699, "grad_norm": 1.0533254551023599, "learning_rate": 1.1947518627104637e-05, "loss": 0.1951, "step": 267 }, { "epoch": 0.45597618034878773, "grad_norm": 0.9812187726267535, "learning_rate": 1.1893333872775275e-05, "loss": 0.1742, "step": 268 }, { "epoch": 0.45767758400680564, "grad_norm": 0.7325445025398932, "learning_rate": 1.1839091401769559e-05, "loss": 0.1051, "step": 269 }, { "epoch": 0.4593789876648235, "grad_norm": 0.8301737461261519, "learning_rate": 1.1784792867623179e-05, "loss": 0.1138, "step": 270 }, { "epoch": 0.46108039132284134, "grad_norm": 0.9822341756525031, "learning_rate": 1.1730439925580876e-05, "loss": 0.1822, "step": 271 }, { "epoch": 0.4627817949808592, "grad_norm": 1.1111507637182583, "learning_rate": 1.1676034232545963e-05, "loss": 0.1669, "step": 272 }, { "epoch": 0.46448319863887705, "grad_norm": 1.1574890172808852, "learning_rate": 1.1621577447029816e-05, "loss": 0.1194, "step": 273 }, { "epoch": 0.46618460229689496, "grad_norm": 2.0493950053963674, "learning_rate": 1.1567071229101332e-05, "loss": 0.1955, "step": 274 }, { "epoch": 0.4678860059549128, "grad_norm": 0.9605842671373481, "learning_rate": 1.1512517240336304e-05, "loss": 0.1463, "step": 275 }, { "epoch": 0.46958740961293066, "grad_norm": 0.9613890557759487, "learning_rate": 1.1457917143766786e-05, "loss": 0.1348, "step": 276 }, { "epoch": 0.4712888132709485, "grad_norm": 0.8592623226268763, "learning_rate": 1.1403272603830384e-05, "loss": 0.122, "step": 277 }, { "epoch": 0.4729902169289664, "grad_norm": 0.823977210047947, "learning_rate": 1.1348585286319529e-05, "loss": 0.0959, "step": 278 }, { "epoch": 0.4746916205869843, "grad_norm": 1.2394663199353901, "learning_rate": 1.1293856858330678e-05, "loss": 0.1515, "step": 279 }, { "epoch": 0.47639302424500213, "grad_norm": 0.9415416594734269, "learning_rate": 1.1239088988213522e-05, "loss": 0.1402, "step": 280 }, { "epoch": 0.47809442790302, "grad_norm": 0.6692152879666028, "learning_rate": 1.11842833455201e-05, "loss": 0.1038, "step": 281 }, { "epoch": 0.47979583156103783, "grad_norm": 1.279963114211638, "learning_rate": 1.1129441600953916e-05, "loss": 0.179, "step": 282 }, { "epoch": 0.48149723521905574, "grad_norm": 1.0561646858074096, "learning_rate": 1.1074565426319014e-05, "loss": 0.1793, "step": 283 }, { "epoch": 0.4831986388770736, "grad_norm": 0.9873861580062439, "learning_rate": 1.101965649446901e-05, "loss": 0.1266, "step": 284 }, { "epoch": 0.48490004253509145, "grad_norm": 1.3992617370873837, "learning_rate": 1.0964716479256094e-05, "loss": 0.1779, "step": 285 }, { "epoch": 0.4866014461931093, "grad_norm": 0.6047306625037067, "learning_rate": 1.0909747055480004e-05, "loss": 0.0748, "step": 286 }, { "epoch": 0.4883028498511272, "grad_norm": 0.9243553903907195, "learning_rate": 1.0854749898836974e-05, "loss": 0.0768, "step": 287 }, { "epoch": 0.49000425350914506, "grad_norm": 0.981134680334624, "learning_rate": 1.0799726685868648e-05, "loss": 0.1082, "step": 288 }, { "epoch": 0.4917056571671629, "grad_norm": 0.8947071035708567, "learning_rate": 1.0744679093910987e-05, "loss": 0.1516, "step": 289 }, { "epoch": 0.49340706082518077, "grad_norm": 0.9633671611183833, "learning_rate": 1.0689608801043107e-05, "loss": 0.1431, "step": 290 }, { "epoch": 0.4951084644831986, "grad_norm": 0.9783028097642653, "learning_rate": 1.063451748603616e-05, "loss": 0.1725, "step": 291 }, { "epoch": 0.4968098681412165, "grad_norm": 1.1011125326638287, "learning_rate": 1.0579406828302124e-05, "loss": 0.1559, "step": 292 }, { "epoch": 0.4985112717992344, "grad_norm": 1.3678905512011608, "learning_rate": 1.0524278507842637e-05, "loss": 0.2389, "step": 293 }, { "epoch": 0.5002126754572522, "grad_norm": 0.9590558986038028, "learning_rate": 1.0469134205197762e-05, "loss": 0.167, "step": 294 }, { "epoch": 0.5019140791152701, "grad_norm": 0.7852144914992837, "learning_rate": 1.0413975601394765e-05, "loss": 0.14, "step": 295 }, { "epoch": 0.5036154827732879, "grad_norm": 0.9312639100976492, "learning_rate": 1.0358804377896876e-05, "loss": 0.1787, "step": 296 }, { "epoch": 0.5053168864313058, "grad_norm": 1.1537236217667737, "learning_rate": 1.0303622216552022e-05, "loss": 0.1578, "step": 297 }, { "epoch": 0.5070182900893236, "grad_norm": 0.6075562366273578, "learning_rate": 1.0248430799541564e-05, "loss": 0.0764, "step": 298 }, { "epoch": 0.5087196937473416, "grad_norm": 1.1242541214557573, "learning_rate": 1.019323180932901e-05, "loss": 0.1921, "step": 299 }, { "epoch": 0.5104210974053595, "grad_norm": 1.4234304726043698, "learning_rate": 1.013802692860873e-05, "loss": 0.1666, "step": 300 }, { "epoch": 0.5121225010633773, "grad_norm": 1.2757943782273478, "learning_rate": 1.0082817840254667e-05, "loss": 0.1887, "step": 301 }, { "epoch": 0.5138239047213952, "grad_norm": 0.996005178297661, "learning_rate": 1.0027606227269026e-05, "loss": 0.1747, "step": 302 }, { "epoch": 0.515525308379413, "grad_norm": 0.7387713580838607, "learning_rate": 9.972393772730975e-06, "loss": 0.1085, "step": 303 }, { "epoch": 0.5172267120374309, "grad_norm": 1.2995451243064031, "learning_rate": 9.917182159745335e-06, "loss": 0.1821, "step": 304 }, { "epoch": 0.5189281156954487, "grad_norm": 1.0921442436054, "learning_rate": 9.861973071391272e-06, "loss": 0.1875, "step": 305 }, { "epoch": 0.5206295193534666, "grad_norm": 0.8460606336221084, "learning_rate": 9.806768190670994e-06, "loss": 0.128, "step": 306 }, { "epoch": 0.5223309230114844, "grad_norm": 1.3555278122778684, "learning_rate": 9.751569200458438e-06, "loss": 0.2287, "step": 307 }, { "epoch": 0.5240323266695024, "grad_norm": 0.8944437738286967, "learning_rate": 9.69637778344798e-06, "loss": 0.1808, "step": 308 }, { "epoch": 0.5257337303275202, "grad_norm": 1.0653760514010453, "learning_rate": 9.641195622103126e-06, "loss": 0.1212, "step": 309 }, { "epoch": 0.5274351339855381, "grad_norm": 0.9102431997252817, "learning_rate": 9.586024398605238e-06, "loss": 0.1433, "step": 310 }, { "epoch": 0.529136537643556, "grad_norm": 0.6764029808372342, "learning_rate": 9.530865794802243e-06, "loss": 0.0951, "step": 311 }, { "epoch": 0.5308379413015738, "grad_norm": 0.9520815336747558, "learning_rate": 9.475721492157365e-06, "loss": 0.1077, "step": 312 }, { "epoch": 0.5325393449595917, "grad_norm": 1.063523212464188, "learning_rate": 9.420593171697876e-06, "loss": 0.2007, "step": 313 }, { "epoch": 0.5342407486176095, "grad_norm": 1.0407079644610695, "learning_rate": 9.365482513963844e-06, "loss": 0.1712, "step": 314 }, { "epoch": 0.5359421522756274, "grad_norm": 0.8825609862775818, "learning_rate": 9.310391198956896e-06, "loss": 0.1273, "step": 315 }, { "epoch": 0.5376435559336452, "grad_norm": 1.938917918187333, "learning_rate": 9.255320906089017e-06, "loss": 0.0991, "step": 316 }, { "epoch": 0.5393449595916632, "grad_norm": 0.7107490419953799, "learning_rate": 9.200273314131356e-06, "loss": 0.0971, "step": 317 }, { "epoch": 0.541046363249681, "grad_norm": 0.8846209277092395, "learning_rate": 9.145250101163032e-06, "loss": 0.113, "step": 318 }, { "epoch": 0.5427477669076989, "grad_norm": 1.4146454732178861, "learning_rate": 9.090252944520002e-06, "loss": 0.2643, "step": 319 }, { "epoch": 0.5444491705657167, "grad_norm": 0.8100753678201862, "learning_rate": 9.035283520743911e-06, "loss": 0.1318, "step": 320 }, { "epoch": 0.5461505742237346, "grad_norm": 0.8667951679113716, "learning_rate": 8.980343505530988e-06, "loss": 0.1278, "step": 321 }, { "epoch": 0.5478519778817524, "grad_norm": 1.147668283457312, "learning_rate": 8.925434573680986e-06, "loss": 0.1609, "step": 322 }, { "epoch": 0.5495533815397703, "grad_norm": 1.131156467858024, "learning_rate": 8.870558399046086e-06, "loss": 0.146, "step": 323 }, { "epoch": 0.5512547851977881, "grad_norm": 0.86377239081248, "learning_rate": 8.815716654479903e-06, "loss": 0.1308, "step": 324 }, { "epoch": 0.552956188855806, "grad_norm": 0.9720608614924628, "learning_rate": 8.76091101178648e-06, "loss": 0.165, "step": 325 }, { "epoch": 0.5546575925138238, "grad_norm": 0.8608166282424758, "learning_rate": 8.706143141669324e-06, "loss": 0.1218, "step": 326 }, { "epoch": 0.5563589961718418, "grad_norm": 1.1970393819234055, "learning_rate": 8.651414713680474e-06, "loss": 0.1962, "step": 327 }, { "epoch": 0.5580603998298597, "grad_norm": 0.8612813600703227, "learning_rate": 8.59672739616962e-06, "loss": 0.1417, "step": 328 }, { "epoch": 0.5597618034878775, "grad_norm": 1.0102595912727714, "learning_rate": 8.542082856233216e-06, "loss": 0.1564, "step": 329 }, { "epoch": 0.5614632071458954, "grad_norm": 0.9406498527873578, "learning_rate": 8.487482759663696e-06, "loss": 0.149, "step": 330 }, { "epoch": 0.5631646108039132, "grad_norm": 0.8125094662451422, "learning_rate": 8.43292877089867e-06, "loss": 0.1179, "step": 331 }, { "epoch": 0.5648660144619311, "grad_norm": 1.021325838301208, "learning_rate": 8.378422552970185e-06, "loss": 0.1969, "step": 332 }, { "epoch": 0.5665674181199489, "grad_norm": 0.8810448268824426, "learning_rate": 8.32396576745404e-06, "loss": 0.1531, "step": 333 }, { "epoch": 0.5682688217779668, "grad_norm": 0.8255230324993776, "learning_rate": 8.269560074419126e-06, "loss": 0.1082, "step": 334 }, { "epoch": 0.5699702254359846, "grad_norm": 1.133323698725157, "learning_rate": 8.215207132376824e-06, "loss": 0.1874, "step": 335 }, { "epoch": 0.5716716290940026, "grad_norm": 0.9575951327422774, "learning_rate": 8.160908598230448e-06, "loss": 0.1554, "step": 336 }, { "epoch": 0.5733730327520205, "grad_norm": 0.7406219285140028, "learning_rate": 8.10666612722473e-06, "loss": 0.121, "step": 337 }, { "epoch": 0.5750744364100383, "grad_norm": 0.7152136920800151, "learning_rate": 8.052481372895363e-06, "loss": 0.1093, "step": 338 }, { "epoch": 0.5767758400680562, "grad_norm": 1.0053517443579167, "learning_rate": 7.998355987018606e-06, "loss": 0.193, "step": 339 }, { "epoch": 0.578477243726074, "grad_norm": 0.761575926299739, "learning_rate": 7.944291619560914e-06, "loss": 0.0975, "step": 340 }, { "epoch": 0.5801786473840919, "grad_norm": 0.7751700853107011, "learning_rate": 7.890289918628644e-06, "loss": 0.1028, "step": 341 }, { "epoch": 0.5818800510421097, "grad_norm": 0.8869639117177723, "learning_rate": 7.836352530417824e-06, "loss": 0.1134, "step": 342 }, { "epoch": 0.5835814547001276, "grad_norm": 1.0462667085672495, "learning_rate": 7.782481099163958e-06, "loss": 0.1548, "step": 343 }, { "epoch": 0.5852828583581454, "grad_norm": 0.7001102008399384, "learning_rate": 7.728677267091912e-06, "loss": 0.11, "step": 344 }, { "epoch": 0.5869842620161634, "grad_norm": 0.7186481834763698, "learning_rate": 7.674942674365847e-06, "loss": 0.1133, "step": 345 }, { "epoch": 0.5886856656741812, "grad_norm": 0.6682605278304701, "learning_rate": 7.621278959039217e-06, "loss": 0.093, "step": 346 }, { "epoch": 0.5903870693321991, "grad_norm": 0.9377055159259335, "learning_rate": 7.567687757004843e-06, "loss": 0.0935, "step": 347 }, { "epoch": 0.5920884729902169, "grad_norm": 0.9075218346520041, "learning_rate": 7.514170701945047e-06, "loss": 0.1305, "step": 348 }, { "epoch": 0.5937898766482348, "grad_norm": 1.2321152146728744, "learning_rate": 7.460729425281831e-06, "loss": 0.1567, "step": 349 }, { "epoch": 0.5954912803062526, "grad_norm": 0.6819589386563149, "learning_rate": 7.407365556127162e-06, "loss": 0.1071, "step": 350 }, { "epoch": 0.5971926839642705, "grad_norm": 0.8602007969363553, "learning_rate": 7.354080721233303e-06, "loss": 0.0992, "step": 351 }, { "epoch": 0.5988940876222884, "grad_norm": 0.8186635068032451, "learning_rate": 7.300876544943227e-06, "loss": 0.1026, "step": 352 }, { "epoch": 0.6005954912803062, "grad_norm": 0.9440660658297884, "learning_rate": 7.247754649141097e-06, "loss": 0.1397, "step": 353 }, { "epoch": 0.6022968949383242, "grad_norm": 0.7406658935971615, "learning_rate": 7.194716653202826e-06, "loss": 0.1235, "step": 354 }, { "epoch": 0.603998298596342, "grad_norm": 1.023190929228503, "learning_rate": 7.1417641739467104e-06, "loss": 0.1499, "step": 355 }, { "epoch": 0.6056997022543599, "grad_norm": 0.9452102068231549, "learning_rate": 7.088898825584139e-06, "loss": 0.1302, "step": 356 }, { "epoch": 0.6074011059123777, "grad_norm": 0.7457983213357819, "learning_rate": 7.036122219670398e-06, "loss": 0.1384, "step": 357 }, { "epoch": 0.6091025095703956, "grad_norm": 0.7495503225450667, "learning_rate": 6.9834359650555305e-06, "loss": 0.1231, "step": 358 }, { "epoch": 0.6108039132284134, "grad_norm": 0.7997916558501216, "learning_rate": 6.930841667835295e-06, "loss": 0.1194, "step": 359 }, { "epoch": 0.6125053168864313, "grad_norm": 0.9510354173960561, "learning_rate": 6.878340931302208e-06, "loss": 0.1495, "step": 360 }, { "epoch": 0.6142067205444491, "grad_norm": 0.9065606885758422, "learning_rate": 6.825935355896669e-06, "loss": 0.1441, "step": 361 }, { "epoch": 0.615908124202467, "grad_norm": 1.2836448147049502, "learning_rate": 6.773626539158171e-06, "loss": 0.2146, "step": 362 }, { "epoch": 0.617609527860485, "grad_norm": 0.9624335481976285, "learning_rate": 6.721416075676601e-06, "loss": 0.1606, "step": 363 }, { "epoch": 0.6193109315185028, "grad_norm": 0.6357303231994696, "learning_rate": 6.669305557043626e-06, "loss": 0.0998, "step": 364 }, { "epoch": 0.6210123351765207, "grad_norm": 1.3683425531744842, "learning_rate": 6.617296571804191e-06, "loss": 0.2092, "step": 365 }, { "epoch": 0.6227137388345385, "grad_norm": 0.9206216658476813, "learning_rate": 6.565390705408072e-06, "loss": 0.1049, "step": 366 }, { "epoch": 0.6244151424925564, "grad_norm": 1.0820482105025058, "learning_rate": 6.513589540161556e-06, "loss": 0.1302, "step": 367 }, { "epoch": 0.6261165461505742, "grad_norm": 0.7483207917292378, "learning_rate": 6.461894655179204e-06, "loss": 0.1249, "step": 368 }, { "epoch": 0.6278179498085921, "grad_norm": 1.1325083196229184, "learning_rate": 6.410307626335717e-06, "loss": 0.159, "step": 369 }, { "epoch": 0.6295193534666099, "grad_norm": 1.243950543231958, "learning_rate": 6.358830026217887e-06, "loss": 0.179, "step": 370 }, { "epoch": 0.6312207571246278, "grad_norm": 0.9764890716406548, "learning_rate": 6.30746342407667e-06, "loss": 0.1553, "step": 371 }, { "epoch": 0.6329221607826457, "grad_norm": 0.876268059085391, "learning_rate": 6.256209385779341e-06, "loss": 0.1273, "step": 372 }, { "epoch": 0.6346235644406636, "grad_norm": 1.0844520376132052, "learning_rate": 6.205069473761756e-06, "loss": 0.1335, "step": 373 }, { "epoch": 0.6363249680986814, "grad_norm": 1.0020799137598357, "learning_rate": 6.154045246980742e-06, "loss": 0.1193, "step": 374 }, { "epoch": 0.6380263717566993, "grad_norm": 0.855728002090308, "learning_rate": 6.1031382608665456e-06, "loss": 0.1108, "step": 375 }, { "epoch": 0.6397277754147171, "grad_norm": 0.8773888809471199, "learning_rate": 6.052350067275441e-06, "loss": 0.1282, "step": 376 }, { "epoch": 0.641429179072735, "grad_norm": 0.9006481684565109, "learning_rate": 6.001682214442406e-06, "loss": 0.1301, "step": 377 }, { "epoch": 0.6431305827307529, "grad_norm": 0.9584790584065768, "learning_rate": 5.951136246933933e-06, "loss": 0.1487, "step": 378 }, { "epoch": 0.6448319863887707, "grad_norm": 1.104605217113324, "learning_rate": 5.900713705600951e-06, "loss": 0.1759, "step": 379 }, { "epoch": 0.6465333900467886, "grad_norm": 0.9879344636473479, "learning_rate": 5.850416127531841e-06, "loss": 0.1377, "step": 380 }, { "epoch": 0.6482347937048064, "grad_norm": 1.1463063310140142, "learning_rate": 5.800245046005585e-06, "loss": 0.1488, "step": 381 }, { "epoch": 0.6499361973628244, "grad_norm": 0.8327925347275126, "learning_rate": 5.750201990445024e-06, "loss": 0.1441, "step": 382 }, { "epoch": 0.6516376010208422, "grad_norm": 0.789671883690968, "learning_rate": 5.70028848637024e-06, "loss": 0.1302, "step": 383 }, { "epoch": 0.6533390046788601, "grad_norm": 0.9198435092469637, "learning_rate": 5.650506055352052e-06, "loss": 0.1164, "step": 384 }, { "epoch": 0.6550404083368779, "grad_norm": 0.9179626567799303, "learning_rate": 5.600856214965613e-06, "loss": 0.1362, "step": 385 }, { "epoch": 0.6567418119948958, "grad_norm": 0.8767201265948665, "learning_rate": 5.551340478744176e-06, "loss": 0.154, "step": 386 }, { "epoch": 0.6584432156529136, "grad_norm": 0.9544128683831228, "learning_rate": 5.501960356132945e-06, "loss": 0.156, "step": 387 }, { "epoch": 0.6601446193109315, "grad_norm": 0.9549458997168718, "learning_rate": 5.4527173524430395e-06, "loss": 0.1645, "step": 388 }, { "epoch": 0.6618460229689493, "grad_norm": 0.8850775843370307, "learning_rate": 5.403612968805649e-06, "loss": 0.0994, "step": 389 }, { "epoch": 0.6635474266269672, "grad_norm": 1.1024037997608411, "learning_rate": 5.354648702126229e-06, "loss": 0.1951, "step": 390 }, { "epoch": 0.6652488302849852, "grad_norm": 1.2094664207353945, "learning_rate": 5.305826045038899e-06, "loss": 0.1328, "step": 391 }, { "epoch": 0.666950233943003, "grad_norm": 0.9462460124134728, "learning_rate": 5.257146485860927e-06, "loss": 0.1769, "step": 392 }, { "epoch": 0.6686516376010209, "grad_norm": 0.8786684281110547, "learning_rate": 5.208611508547367e-06, "loss": 0.146, "step": 393 }, { "epoch": 0.6703530412590387, "grad_norm": 0.7842667507919817, "learning_rate": 5.160222592645808e-06, "loss": 0.117, "step": 394 }, { "epoch": 0.6720544449170566, "grad_norm": 1.1357067632204891, "learning_rate": 5.111981213251293e-06, "loss": 0.1792, "step": 395 }, { "epoch": 0.6737558485750744, "grad_norm": 0.9388409193539972, "learning_rate": 5.063888840961325e-06, "loss": 0.1562, "step": 396 }, { "epoch": 0.6754572522330923, "grad_norm": 0.801123367757538, "learning_rate": 5.015946941831064e-06, "loss": 0.1296, "step": 397 }, { "epoch": 0.6771586558911101, "grad_norm": 0.8535813470137829, "learning_rate": 4.968156977328626e-06, "loss": 0.1484, "step": 398 }, { "epoch": 0.678860059549128, "grad_norm": 1.2248435999982104, "learning_rate": 4.920520404290512e-06, "loss": 0.1464, "step": 399 }, { "epoch": 0.680561463207146, "grad_norm": 0.7210259524074228, "learning_rate": 4.87303867487723e-06, "loss": 0.1224, "step": 400 }, { "epoch": 0.6822628668651638, "grad_norm": 0.7005598922136381, "learning_rate": 4.825713236529005e-06, "loss": 0.0804, "step": 401 }, { "epoch": 0.6839642705231816, "grad_norm": 0.7084992648825957, "learning_rate": 4.778545531921668e-06, "loss": 0.1118, "step": 402 }, { "epoch": 0.6856656741811995, "grad_norm": 0.7988193456386986, "learning_rate": 4.731536998922657e-06, "loss": 0.1481, "step": 403 }, { "epoch": 0.6873670778392174, "grad_norm": 0.5955639940984955, "learning_rate": 4.684689070547216e-06, "loss": 0.0999, "step": 404 }, { "epoch": 0.6890684814972352, "grad_norm": 0.5771691889562411, "learning_rate": 4.638003174914675e-06, "loss": 0.0875, "step": 405 }, { "epoch": 0.6907698851552531, "grad_norm": 0.9925814202496841, "learning_rate": 4.591480735204953e-06, "loss": 0.1349, "step": 406 }, { "epoch": 0.6924712888132709, "grad_norm": 0.8994659904668703, "learning_rate": 4.545123169615134e-06, "loss": 0.1548, "step": 407 }, { "epoch": 0.6941726924712888, "grad_norm": 0.9580142998259727, "learning_rate": 4.49893189131627e-06, "loss": 0.1587, "step": 408 }, { "epoch": 0.6958740961293067, "grad_norm": 0.6788465105557742, "learning_rate": 4.45290830841028e-06, "loss": 0.0926, "step": 409 }, { "epoch": 0.6975754997873246, "grad_norm": 0.9024039513933189, "learning_rate": 4.407053823887033e-06, "loss": 0.1529, "step": 410 }, { "epoch": 0.6992769034453424, "grad_norm": 0.8820937053345296, "learning_rate": 4.361369835581569e-06, "loss": 0.1462, "step": 411 }, { "epoch": 0.7009783071033603, "grad_norm": 0.895339461723458, "learning_rate": 4.315857736131508e-06, "loss": 0.122, "step": 412 }, { "epoch": 0.7026797107613781, "grad_norm": 0.9930787706041434, "learning_rate": 4.2705189129345814e-06, "loss": 0.1714, "step": 413 }, { "epoch": 0.704381114419396, "grad_norm": 0.7941569999620287, "learning_rate": 4.225354748106328e-06, "loss": 0.1183, "step": 414 }, { "epoch": 0.7060825180774138, "grad_norm": 1.045533327522596, "learning_rate": 4.180366618437996e-06, "loss": 0.1748, "step": 415 }, { "epoch": 0.7077839217354317, "grad_norm": 0.8163342078381849, "learning_rate": 4.13555589535453e-06, "loss": 0.1101, "step": 416 }, { "epoch": 0.7094853253934496, "grad_norm": 0.6685227131497675, "learning_rate": 4.0909239448727985e-06, "loss": 0.1116, "step": 417 }, { "epoch": 0.7111867290514675, "grad_norm": 1.042711187603995, "learning_rate": 4.046472127559937e-06, "loss": 0.1476, "step": 418 }, { "epoch": 0.7128881327094854, "grad_norm": 0.7990571644153376, "learning_rate": 4.002201798491875e-06, "loss": 0.1241, "step": 419 }, { "epoch": 0.7145895363675032, "grad_norm": 0.8607452302823387, "learning_rate": 3.958114307212018e-06, "loss": 0.1479, "step": 420 }, { "epoch": 0.7162909400255211, "grad_norm": 0.9017446318408956, "learning_rate": 3.91421099769013e-06, "loss": 0.1281, "step": 421 }, { "epoch": 0.7179923436835389, "grad_norm": 0.8608148447694013, "learning_rate": 3.870493208281337e-06, "loss": 0.1392, "step": 422 }, { "epoch": 0.7196937473415568, "grad_norm": 0.9732198553868028, "learning_rate": 3.826962271685351e-06, "loss": 0.1443, "step": 423 }, { "epoch": 0.7213951509995746, "grad_norm": 0.7026930248963273, "learning_rate": 3.7836195149058386e-06, "loss": 0.1159, "step": 424 }, { "epoch": 0.7230965546575925, "grad_norm": 1.0616878292696266, "learning_rate": 3.7404662592099483e-06, "loss": 0.178, "step": 425 }, { "epoch": 0.7247979583156103, "grad_norm": 0.879922946105996, "learning_rate": 3.697503820088063e-06, "loss": 0.1345, "step": 426 }, { "epoch": 0.7264993619736282, "grad_norm": 0.8118424189657202, "learning_rate": 3.654733507213678e-06, "loss": 0.1107, "step": 427 }, { "epoch": 0.7282007656316462, "grad_norm": 1.0124821315311636, "learning_rate": 3.61215662440349e-06, "loss": 0.1597, "step": 428 }, { "epoch": 0.729902169289664, "grad_norm": 0.584812477581454, "learning_rate": 3.5697744695776326e-06, "loss": 0.0792, "step": 429 }, { "epoch": 0.7316035729476819, "grad_norm": 1.0457097189478148, "learning_rate": 3.5275883347201336e-06, "loss": 0.1489, "step": 430 }, { "epoch": 0.7333049766056997, "grad_norm": 0.9729869947589316, "learning_rate": 3.4855995058395066e-06, "loss": 0.1275, "step": 431 }, { "epoch": 0.7350063802637176, "grad_norm": 0.8868662034897864, "learning_rate": 3.443809262929575e-06, "loss": 0.168, "step": 432 }, { "epoch": 0.7367077839217354, "grad_norm": 0.9125187516121847, "learning_rate": 3.4022188799304214e-06, "loss": 0.1623, "step": 433 }, { "epoch": 0.7384091875797533, "grad_norm": 0.8317592980564871, "learning_rate": 3.36082962468958e-06, "loss": 0.1231, "step": 434 }, { "epoch": 0.7401105912377711, "grad_norm": 0.7739839576951205, "learning_rate": 3.3196427589233725e-06, "loss": 0.12, "step": 435 }, { "epoch": 0.741811994895789, "grad_norm": 0.797509591834182, "learning_rate": 3.2786595381784512e-06, "loss": 0.1299, "step": 436 }, { "epoch": 0.7435133985538069, "grad_norm": 1.2352332695500554, "learning_rate": 3.2378812117935154e-06, "loss": 0.2158, "step": 437 }, { "epoch": 0.7452148022118248, "grad_norm": 0.9970358273478818, "learning_rate": 3.1973090228612404e-06, "loss": 0.1889, "step": 438 }, { "epoch": 0.7469162058698426, "grad_norm": 0.9175281342866968, "learning_rate": 3.15694420819038e-06, "loss": 0.1451, "step": 439 }, { "epoch": 0.7486176095278605, "grad_norm": 0.7771607469867583, "learning_rate": 3.116787998268046e-06, "loss": 0.1273, "step": 440 }, { "epoch": 0.7503190131858783, "grad_norm": 1.1386198810950086, "learning_rate": 3.076841617222228e-06, "loss": 0.1755, "step": 441 }, { "epoch": 0.7520204168438962, "grad_norm": 0.7884645481853416, "learning_rate": 3.0371062827844434e-06, "loss": 0.137, "step": 442 }, { "epoch": 0.753721820501914, "grad_norm": 1.1651926590842914, "learning_rate": 2.997583206252647e-06, "loss": 0.1388, "step": 443 }, { "epoch": 0.7554232241599319, "grad_norm": 1.1937560745179017, "learning_rate": 2.958273592454285e-06, "loss": 0.1742, "step": 444 }, { "epoch": 0.7571246278179498, "grad_norm": 1.0502635521989399, "learning_rate": 2.9191786397095778e-06, "loss": 0.1498, "step": 445 }, { "epoch": 0.7588260314759677, "grad_norm": 0.7584831550160862, "learning_rate": 2.880299539794975e-06, "loss": 0.1086, "step": 446 }, { "epoch": 0.7605274351339856, "grad_norm": 1.1225417606344352, "learning_rate": 2.841637477906851e-06, "loss": 0.1934, "step": 447 }, { "epoch": 0.7622288387920034, "grad_norm": 0.9497099717053716, "learning_rate": 2.803193632625346e-06, "loss": 0.164, "step": 448 }, { "epoch": 0.7639302424500213, "grad_norm": 0.6837798268193367, "learning_rate": 2.7649691758784603e-06, "loss": 0.1114, "step": 449 }, { "epoch": 0.7656316461080391, "grad_norm": 0.6950024172256357, "learning_rate": 2.7269652729063233e-06, "loss": 0.0977, "step": 450 }, { "epoch": 0.767333049766057, "grad_norm": 0.9354904788771492, "learning_rate": 2.689183082225659e-06, "loss": 0.1591, "step": 451 }, { "epoch": 0.7690344534240748, "grad_norm": 1.2402009468321933, "learning_rate": 2.65162375559449e-06, "loss": 0.1772, "step": 452 }, { "epoch": 0.7707358570820927, "grad_norm": 0.7465094532185766, "learning_rate": 2.614288437977014e-06, "loss": 0.1195, "step": 453 }, { "epoch": 0.7724372607401105, "grad_norm": 1.0226055457032026, "learning_rate": 2.5771782675087078e-06, "loss": 0.2099, "step": 454 }, { "epoch": 0.7741386643981285, "grad_norm": 0.9551942400194852, "learning_rate": 2.5402943754616182e-06, "loss": 0.173, "step": 455 }, { "epoch": 0.7758400680561464, "grad_norm": 0.6594348635504637, "learning_rate": 2.5036378862099e-06, "loss": 0.0883, "step": 456 }, { "epoch": 0.7775414717141642, "grad_norm": 0.7901774411413286, "learning_rate": 2.467209917195513e-06, "loss": 0.1237, "step": 457 }, { "epoch": 0.7792428753721821, "grad_norm": 0.8910357359774559, "learning_rate": 2.4310115788941855e-06, "loss": 0.1289, "step": 458 }, { "epoch": 0.7809442790301999, "grad_norm": 1.136218990072521, "learning_rate": 2.3950439747815357e-06, "loss": 0.1519, "step": 459 }, { "epoch": 0.7826456826882178, "grad_norm": 0.8023078317049139, "learning_rate": 2.359308201299454e-06, "loss": 0.1147, "step": 460 }, { "epoch": 0.7843470863462356, "grad_norm": 0.7751079437436228, "learning_rate": 2.3238053478226665e-06, "loss": 0.1497, "step": 461 }, { "epoch": 0.7860484900042535, "grad_norm": 0.6377332784824975, "learning_rate": 2.2885364966255372e-06, "loss": 0.1229, "step": 462 }, { "epoch": 0.7877498936622713, "grad_norm": 1.2952811713921435, "learning_rate": 2.2535027228490582e-06, "loss": 0.1986, "step": 463 }, { "epoch": 0.7894512973202893, "grad_norm": 0.9212470224777787, "learning_rate": 2.2187050944680942e-06, "loss": 0.178, "step": 464 }, { "epoch": 0.7911527009783071, "grad_norm": 1.093266320340803, "learning_rate": 2.18414467225882e-06, "loss": 0.1485, "step": 465 }, { "epoch": 0.792854104636325, "grad_norm": 1.2237974652268393, "learning_rate": 2.1498225097663695e-06, "loss": 0.1855, "step": 466 }, { "epoch": 0.7945555082943428, "grad_norm": 1.010538661263038, "learning_rate": 2.115739653272747e-06, "loss": 0.1374, "step": 467 }, { "epoch": 0.7962569119523607, "grad_norm": 0.8405255367131824, "learning_rate": 2.0818971417649013e-06, "loss": 0.121, "step": 468 }, { "epoch": 0.7979583156103786, "grad_norm": 1.609522304609243, "learning_rate": 2.048296006903081e-06, "loss": 0.196, "step": 469 }, { "epoch": 0.7996597192683964, "grad_norm": 0.8809611996542851, "learning_rate": 2.0149372729893646e-06, "loss": 0.0851, "step": 470 }, { "epoch": 0.8013611229264143, "grad_norm": 1.0627552307732382, "learning_rate": 1.981821956936448e-06, "loss": 0.1401, "step": 471 }, { "epoch": 0.8030625265844321, "grad_norm": 1.005097101735857, "learning_rate": 1.9489510682366363e-06, "loss": 0.1487, "step": 472 }, { "epoch": 0.8047639302424501, "grad_norm": 0.6705089050955295, "learning_rate": 1.916325608931079e-06, "loss": 0.0876, "step": 473 }, { "epoch": 0.8064653339004679, "grad_norm": 0.8633874336922923, "learning_rate": 1.8839465735792095e-06, "loss": 0.1301, "step": 474 }, { "epoch": 0.8081667375584858, "grad_norm": 1.1059783421414893, "learning_rate": 1.8518149492284477e-06, "loss": 0.1877, "step": 475 }, { "epoch": 0.8098681412165036, "grad_norm": 0.8023451929643662, "learning_rate": 1.8199317153840933e-06, "loss": 0.1397, "step": 476 }, { "epoch": 0.8115695448745215, "grad_norm": 1.2934630086997319, "learning_rate": 1.7882978439794708e-06, "loss": 0.1874, "step": 477 }, { "epoch": 0.8132709485325393, "grad_norm": 0.8838382576359304, "learning_rate": 1.756914299346304e-06, "loss": 0.1401, "step": 478 }, { "epoch": 0.8149723521905572, "grad_norm": 0.7756837276148775, "learning_rate": 1.7257820381853197e-06, "loss": 0.1112, "step": 479 }, { "epoch": 0.816673755848575, "grad_norm": 0.7565281231601331, "learning_rate": 1.6949020095370816e-06, "loss": 0.1233, "step": 480 }, { "epoch": 0.8183751595065929, "grad_norm": 0.853948523291977, "learning_rate": 1.6642751547530512e-06, "loss": 0.1039, "step": 481 }, { "epoch": 0.8200765631646108, "grad_norm": 0.8352361826998009, "learning_rate": 1.6339024074669107e-06, "loss": 0.1534, "step": 482 }, { "epoch": 0.8217779668226287, "grad_norm": 0.8688740479788196, "learning_rate": 1.6037846935660807e-06, "loss": 0.1406, "step": 483 }, { "epoch": 0.8234793704806466, "grad_norm": 0.8005917780121297, "learning_rate": 1.5739229311635152e-06, "loss": 0.1378, "step": 484 }, { "epoch": 0.8251807741386644, "grad_norm": 0.7689221194951362, "learning_rate": 1.5443180305696948e-06, "loss": 0.1257, "step": 485 }, { "epoch": 0.8268821777966823, "grad_norm": 0.7665161442309016, "learning_rate": 1.5149708942648922e-06, "loss": 0.1179, "step": 486 }, { "epoch": 0.8285835814547001, "grad_norm": 1.169882211334333, "learning_rate": 1.4858824168716524e-06, "loss": 0.1262, "step": 487 }, { "epoch": 0.830284985112718, "grad_norm": 1.0029808688868465, "learning_rate": 1.4570534851275241e-06, "loss": 0.1709, "step": 488 }, { "epoch": 0.8319863887707358, "grad_norm": 0.9018685205076322, "learning_rate": 1.4284849778580279e-06, "loss": 0.1275, "step": 489 }, { "epoch": 0.8336877924287537, "grad_norm": 0.6540418058871127, "learning_rate": 1.4001777659498584e-06, "loss": 0.0756, "step": 490 }, { "epoch": 0.8353891960867715, "grad_norm": 1.5133534763569585, "learning_rate": 1.3721327123243533e-06, "loss": 0.1723, "step": 491 }, { "epoch": 0.8370905997447895, "grad_norm": 0.8207160486641186, "learning_rate": 1.3443506719111666e-06, "loss": 0.0973, "step": 492 }, { "epoch": 0.8387920034028074, "grad_norm": 0.8940821425158962, "learning_rate": 1.3168324916222296e-06, "loss": 0.1626, "step": 493 }, { "epoch": 0.8404934070608252, "grad_norm": 0.9443999012934854, "learning_rate": 1.28957901032591e-06, "loss": 0.1581, "step": 494 }, { "epoch": 0.8421948107188431, "grad_norm": 0.7851253215751375, "learning_rate": 1.2625910588214608e-06, "loss": 0.1166, "step": 495 }, { "epoch": 0.8438962143768609, "grad_norm": 0.888618596471637, "learning_rate": 1.2358694598136755e-06, "loss": 0.1159, "step": 496 }, { "epoch": 0.8455976180348788, "grad_norm": 1.4198001597035892, "learning_rate": 1.2094150278878303e-06, "loss": 0.166, "step": 497 }, { "epoch": 0.8472990216928966, "grad_norm": 0.5631587010169332, "learning_rate": 1.1832285694848255e-06, "loss": 0.0981, "step": 498 }, { "epoch": 0.8490004253509145, "grad_norm": 0.8202300136945526, "learning_rate": 1.1573108828766255e-06, "loss": 0.1413, "step": 499 }, { "epoch": 0.8507018290089323, "grad_norm": 0.9023011380495052, "learning_rate": 1.1316627581419137e-06, "loss": 0.1455, "step": 500 }, { "epoch": 0.8524032326669503, "grad_norm": 1.235848904880368, "learning_rate": 1.1062849771420025e-06, "loss": 0.2096, "step": 501 }, { "epoch": 0.8541046363249681, "grad_norm": 1.0464082421195386, "learning_rate": 1.0811783134970132e-06, "loss": 0.1646, "step": 502 }, { "epoch": 0.855806039982986, "grad_norm": 1.0819062403332675, "learning_rate": 1.0563435325622762e-06, "loss": 0.1738, "step": 503 }, { "epoch": 0.8575074436410038, "grad_norm": 1.473560386930757, "learning_rate": 1.0317813914050157e-06, "loss": 0.1692, "step": 504 }, { "epoch": 0.8592088472990217, "grad_norm": 0.978349330529313, "learning_rate": 1.007492638781259e-06, "loss": 0.1513, "step": 505 }, { "epoch": 0.8609102509570395, "grad_norm": 1.179643658189341, "learning_rate": 9.834780151130196e-07, "loss": 0.166, "step": 506 }, { "epoch": 0.8626116546150574, "grad_norm": 0.766029475252831, "learning_rate": 9.597382524657173e-07, "loss": 0.1134, "step": 507 }, { "epoch": 0.8643130582730753, "grad_norm": 0.9024327918183763, "learning_rate": 9.362740745258736e-07, "loss": 0.1149, "step": 508 }, { "epoch": 0.8660144619310931, "grad_norm": 0.7739449751707675, "learning_rate": 9.13086196579035e-07, "loss": 0.1597, "step": 509 }, { "epoch": 0.8677158655891111, "grad_norm": 0.9425165834042365, "learning_rate": 8.901753254879885e-07, "loss": 0.1085, "step": 510 }, { "epoch": 0.8694172692471289, "grad_norm": 0.7591763618142894, "learning_rate": 8.67542159671192e-07, "loss": 0.0942, "step": 511 }, { "epoch": 0.8711186729051468, "grad_norm": 1.025073614077726, "learning_rate": 8.451873890814988e-07, "loss": 0.1793, "step": 512 }, { "epoch": 0.8728200765631646, "grad_norm": 0.7371355385468613, "learning_rate": 8.231116951851204e-07, "loss": 0.096, "step": 513 }, { "epoch": 0.8745214802211825, "grad_norm": 1.101658847251625, "learning_rate": 8.013157509408509e-07, "loss": 0.1348, "step": 514 }, { "epoch": 0.8762228838792003, "grad_norm": 1.2039979968776104, "learning_rate": 7.79800220779554e-07, "loss": 0.2153, "step": 515 }, { "epoch": 0.8779242875372182, "grad_norm": 1.006732552857591, "learning_rate": 7.585657605839059e-07, "loss": 0.1703, "step": 516 }, { "epoch": 0.879625691195236, "grad_norm": 1.0145016790131924, "learning_rate": 7.376130176684082e-07, "loss": 0.1912, "step": 517 }, { "epoch": 0.8813270948532539, "grad_norm": 0.6883624712674228, "learning_rate": 7.169426307596428e-07, "loss": 0.1084, "step": 518 }, { "epoch": 0.8830284985112719, "grad_norm": 0.6860856488272892, "learning_rate": 6.965552299768186e-07, "loss": 0.1126, "step": 519 }, { "epoch": 0.8847299021692897, "grad_norm": 0.6619502831120293, "learning_rate": 6.764514368125419e-07, "loss": 0.1024, "step": 520 }, { "epoch": 0.8864313058273076, "grad_norm": 0.7410446301641695, "learning_rate": 6.566318641138902e-07, "loss": 0.1221, "step": 521 }, { "epoch": 0.8881327094853254, "grad_norm": 0.49451324336653424, "learning_rate": 6.370971160637129e-07, "loss": 0.0695, "step": 522 }, { "epoch": 0.8898341131433433, "grad_norm": 1.0914793541244312, "learning_rate": 6.178477881622325e-07, "loss": 0.1097, "step": 523 }, { "epoch": 0.8915355168013611, "grad_norm": 0.9323787575824379, "learning_rate": 5.98884467208869e-07, "loss": 0.1511, "step": 524 }, { "epoch": 0.893236920459379, "grad_norm": 0.9363542835251712, "learning_rate": 5.802077312843723e-07, "loss": 0.1318, "step": 525 }, { "epoch": 0.8949383241173968, "grad_norm": 0.603259894149764, "learning_rate": 5.618181497331865e-07, "loss": 0.0853, "step": 526 }, { "epoch": 0.8966397277754147, "grad_norm": 0.7303483060977899, "learning_rate": 5.437162831460962e-07, "loss": 0.1045, "step": 527 }, { "epoch": 0.8983411314334325, "grad_norm": 0.8018716290849719, "learning_rate": 5.259026833431468e-07, "loss": 0.0994, "step": 528 }, { "epoch": 0.9000425350914505, "grad_norm": 0.6742736755775497, "learning_rate": 5.083778933568073e-07, "loss": 0.1319, "step": 529 }, { "epoch": 0.9017439387494683, "grad_norm": 1.1269218361928497, "learning_rate": 4.911424474154314e-07, "loss": 0.179, "step": 530 }, { "epoch": 0.9034453424074862, "grad_norm": 0.8025742885030408, "learning_rate": 4.741968709269573e-07, "loss": 0.1092, "step": 531 }, { "epoch": 0.905146746065504, "grad_norm": 0.9305265429776364, "learning_rate": 4.575416804629085e-07, "loss": 0.1735, "step": 532 }, { "epoch": 0.9068481497235219, "grad_norm": 0.6809160303515213, "learning_rate": 4.411773837426303e-07, "loss": 0.1104, "step": 533 }, { "epoch": 0.9085495533815398, "grad_norm": 0.9045295991595489, "learning_rate": 4.2510447961782055e-07, "loss": 0.1734, "step": 534 }, { "epoch": 0.9102509570395576, "grad_norm": 0.8166470828418425, "learning_rate": 4.093234580573202e-07, "loss": 0.1523, "step": 535 }, { "epoch": 0.9119523606975755, "grad_norm": 0.708095759490807, "learning_rate": 3.938348001321812e-07, "loss": 0.0868, "step": 536 }, { "epoch": 0.9136537643555933, "grad_norm": 0.6245594804666802, "learning_rate": 3.786389780009958e-07, "loss": 0.0807, "step": 537 }, { "epoch": 0.9153551680136113, "grad_norm": 0.7746580581966721, "learning_rate": 3.637364548955047e-07, "loss": 0.0765, "step": 538 }, { "epoch": 0.9170565716716291, "grad_norm": 0.8160416161103592, "learning_rate": 3.491276851064784e-07, "loss": 0.1216, "step": 539 }, { "epoch": 0.918757975329647, "grad_norm": 0.8270184476140136, "learning_rate": 3.3481311396986626e-07, "loss": 0.1134, "step": 540 }, { "epoch": 0.9204593789876648, "grad_norm": 1.0313796709120198, "learning_rate": 3.2079317785322363e-07, "loss": 0.1766, "step": 541 }, { "epoch": 0.9221607826456827, "grad_norm": 1.0201847755126947, "learning_rate": 3.0706830414240164e-07, "loss": 0.1729, "step": 542 }, { "epoch": 0.9238621863037005, "grad_norm": 0.7776265936946829, "learning_rate": 2.9363891122853097e-07, "loss": 0.1241, "step": 543 }, { "epoch": 0.9255635899617184, "grad_norm": 0.7554593104785499, "learning_rate": 2.805054084952552e-07, "loss": 0.1108, "step": 544 }, { "epoch": 0.9272649936197362, "grad_norm": 0.9610672965393776, "learning_rate": 2.6766819630626216e-07, "loss": 0.1401, "step": 545 }, { "epoch": 0.9289663972777541, "grad_norm": 0.6998221281429967, "learning_rate": 2.5512766599306903e-07, "loss": 0.0957, "step": 546 }, { "epoch": 0.9306678009357721, "grad_norm": 0.8390384737658598, "learning_rate": 2.4288419984310086e-07, "loss": 0.1168, "step": 547 }, { "epoch": 0.9323692045937899, "grad_norm": 0.9236463852965615, "learning_rate": 2.3093817108803318e-07, "loss": 0.1634, "step": 548 }, { "epoch": 0.9340706082518078, "grad_norm": 0.7447775141776533, "learning_rate": 2.1928994389241454e-07, "loss": 0.1004, "step": 549 }, { "epoch": 0.9357720119098256, "grad_norm": 0.7255330710827995, "learning_rate": 2.0793987334256637e-07, "loss": 0.1272, "step": 550 }, { "epoch": 0.9374734155678435, "grad_norm": 0.9390672914885052, "learning_rate": 1.968883054357562e-07, "loss": 0.1273, "step": 551 }, { "epoch": 0.9391748192258613, "grad_norm": 0.9272533734619594, "learning_rate": 1.861355770696549e-07, "loss": 0.1338, "step": 552 }, { "epoch": 0.9408762228838792, "grad_norm": 0.8915590435767178, "learning_rate": 1.7568201603205827e-07, "loss": 0.1478, "step": 553 }, { "epoch": 0.942577626541897, "grad_norm": 0.833580814718019, "learning_rate": 1.6552794099090718e-07, "loss": 0.1641, "step": 554 }, { "epoch": 0.9442790301999149, "grad_norm": 0.7262763652837203, "learning_rate": 1.5567366148455887e-07, "loss": 0.0812, "step": 555 }, { "epoch": 0.9459804338579328, "grad_norm": 0.6130128705786809, "learning_rate": 1.4611947791236314e-07, "loss": 0.084, "step": 556 }, { "epoch": 0.9476818375159507, "grad_norm": 0.9063134327194557, "learning_rate": 1.3686568152549539e-07, "loss": 0.1301, "step": 557 }, { "epoch": 0.9493832411739686, "grad_norm": 1.1026165821069918, "learning_rate": 1.2791255441809037e-07, "loss": 0.1414, "step": 558 }, { "epoch": 0.9510846448319864, "grad_norm": 0.9387139772419816, "learning_rate": 1.1926036951862563e-07, "loss": 0.1326, "step": 559 }, { "epoch": 0.9527860484900043, "grad_norm": 0.5842678864666834, "learning_rate": 1.109093905816172e-07, "loss": 0.102, "step": 560 }, { "epoch": 0.9544874521480221, "grad_norm": 0.9322995560475983, "learning_rate": 1.0285987217957038e-07, "loss": 0.171, "step": 561 }, { "epoch": 0.95618885580604, "grad_norm": 0.8354465318723641, "learning_rate": 9.511205969522263e-08, "loss": 0.1443, "step": 562 }, { "epoch": 0.9578902594640578, "grad_norm": 0.9920641234095123, "learning_rate": 8.76661893140629e-08, "loss": 0.1504, "step": 563 }, { "epoch": 0.9595916631220757, "grad_norm": 0.9511533047381777, "learning_rate": 8.052248801712958e-08, "loss": 0.151, "step": 564 }, { "epoch": 0.9612930667800936, "grad_norm": 0.9354186186695939, "learning_rate": 7.36811735740961e-08, "loss": 0.1229, "step": 565 }, { "epoch": 0.9629944704381115, "grad_norm": 0.6362057824024612, "learning_rate": 6.714245453662504e-08, "loss": 0.091, "step": 566 }, { "epoch": 0.9646958740961293, "grad_norm": 1.0083389751537988, "learning_rate": 6.090653023201997e-08, "loss": 0.1456, "step": 567 }, { "epoch": 0.9663972777541472, "grad_norm": 0.8907631394850228, "learning_rate": 5.497359075714026e-08, "loss": 0.1594, "step": 568 }, { "epoch": 0.968098681412165, "grad_norm": 0.7531697026641528, "learning_rate": 4.934381697261015e-08, "loss": 0.1197, "step": 569 }, { "epoch": 0.9698000850701829, "grad_norm": 0.8062893099107721, "learning_rate": 4.401738049730653e-08, "loss": 0.0974, "step": 570 }, { "epoch": 0.9715014887282007, "grad_norm": 1.2377725717443462, "learning_rate": 3.899444370312533e-08, "loss": 0.2178, "step": 571 }, { "epoch": 0.9732028923862186, "grad_norm": 0.9551200579988982, "learning_rate": 3.4275159710032146e-08, "loss": 0.1566, "step": 572 }, { "epoch": 0.9749042960442365, "grad_norm": 0.8038517484152471, "learning_rate": 2.9859672381392644e-08, "loss": 0.1219, "step": 573 }, { "epoch": 0.9766056997022544, "grad_norm": 0.8743135576113755, "learning_rate": 2.574811631959273e-08, "loss": 0.1568, "step": 574 }, { "epoch": 0.9783071033602723, "grad_norm": 0.8522700286101278, "learning_rate": 2.1940616861929608e-08, "loss": 0.1487, "step": 575 }, { "epoch": 0.9800085070182901, "grad_norm": 0.8224250512901891, "learning_rate": 1.8437290076792624e-08, "loss": 0.1498, "step": 576 }, { "epoch": 0.981709910676308, "grad_norm": 0.8848861332502466, "learning_rate": 1.5238242760126088e-08, "loss": 0.1504, "step": 577 }, { "epoch": 0.9834113143343258, "grad_norm": 1.1465336786229596, "learning_rate": 1.234357243217188e-08, "loss": 0.2006, "step": 578 }, { "epoch": 0.9851127179923437, "grad_norm": 0.8142904720294101, "learning_rate": 9.753367334499608e-09, "loss": 0.1086, "step": 579 }, { "epoch": 0.9868141216503615, "grad_norm": 1.0733470427784377, "learning_rate": 7.467706427312093e-09, "loss": 0.1226, "step": 580 }, { "epoch": 0.9885155253083794, "grad_norm": 0.7419763368783449, "learning_rate": 5.486659387043958e-09, "loss": 0.072, "step": 581 }, { "epoch": 0.9902169289663972, "grad_norm": 0.9725691812544829, "learning_rate": 3.810286604232216e-09, "loss": 0.1285, "step": 582 }, { "epoch": 0.9919183326244151, "grad_norm": 1.006297166436942, "learning_rate": 2.4386391816777488e-09, "loss": 0.1627, "step": 583 }, { "epoch": 0.993619736282433, "grad_norm": 2.3873068728832902, "learning_rate": 1.3717589328898773e-09, "loss": 0.102, "step": 584 }, { "epoch": 0.9953211399404509, "grad_norm": 1.0284942567315343, "learning_rate": 6.096783808062778e-10, "loss": 0.1859, "step": 585 }, { "epoch": 0.9970225435984688, "grad_norm": 0.8620294782566316, "learning_rate": 1.524207568059932e-10, "loss": 0.1134, "step": 586 }, { "epoch": 0.9987239472564866, "grad_norm": 0.9608067188231881, "learning_rate": 0.0, "loss": 0.1686, "step": 587 }, { "epoch": 0.9987239472564866, "step": 587, "total_flos": 521029723553792.0, "train_loss": 0.19605895173712118, "train_runtime": 4311.1152, "train_samples_per_second": 17.449, "train_steps_per_second": 0.136 } ], "logging_steps": 1.0, "max_steps": 587, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 521029723553792.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }