{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9933642609665863, "eval_steps": 500, "global_step": 34000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011744670855599272, "grad_norm": 0.025151433423161507, "learning_rate": 0.0001997648165569144, "loss": 1.2331, "step": 100 }, { "epoch": 0.023489341711198545, "grad_norm": 0.028844915330410004, "learning_rate": 0.0001992944496707432, "loss": 1.1461, "step": 200 }, { "epoch": 0.03523401256679781, "grad_norm": 0.0280243381857872, "learning_rate": 0.00019882408278457198, "loss": 1.1523, "step": 300 }, { "epoch": 0.04697868342239709, "grad_norm": 0.029297035187482834, "learning_rate": 0.00019835371589840077, "loss": 1.1273, "step": 400 }, { "epoch": 0.05872335427799636, "grad_norm": 0.02906208299100399, "learning_rate": 0.00019788334901222956, "loss": 1.1149, "step": 500 }, { "epoch": 0.07046802513359562, "grad_norm": 0.027980709448456764, "learning_rate": 0.00019741298212605835, "loss": 1.1108, "step": 600 }, { "epoch": 0.08221269598919491, "grad_norm": 0.03009636141359806, "learning_rate": 0.00019694261523988714, "loss": 1.1041, "step": 700 }, { "epoch": 0.09395736684479418, "grad_norm": 0.02974865213036537, "learning_rate": 0.00019647224835371593, "loss": 1.1088, "step": 800 }, { "epoch": 0.10570203770039345, "grad_norm": 0.03112063929438591, "learning_rate": 0.0001960018814675447, "loss": 1.1052, "step": 900 }, { "epoch": 0.11744670855599272, "grad_norm": 0.03213803470134735, "learning_rate": 0.00019553151458137348, "loss": 1.1037, "step": 1000 }, { "epoch": 0.12919137941159198, "grad_norm": 0.029907317832112312, "learning_rate": 0.00019506114769520227, "loss": 1.0927, "step": 1100 }, { "epoch": 0.14093605026719125, "grad_norm": 0.03029988706111908, "learning_rate": 0.00019459078080903106, "loss": 1.0926, "step": 1200 }, { "epoch": 0.15268072112279055, "grad_norm": 0.03094552271068096, "learning_rate": 0.00019412041392285985, "loss": 1.0977, "step": 1300 }, { "epoch": 0.16442539197838982, "grad_norm": 0.031248079612851143, "learning_rate": 0.00019365004703668864, "loss": 1.0914, "step": 1400 }, { "epoch": 0.1761700628339891, "grad_norm": 0.03235971927642822, "learning_rate": 0.00019317968015051742, "loss": 1.1114, "step": 1500 }, { "epoch": 0.18791473368958836, "grad_norm": 0.031699199229478836, "learning_rate": 0.0001927093132643462, "loss": 1.0883, "step": 1600 }, { "epoch": 0.19965940454518763, "grad_norm": 0.032460007816553116, "learning_rate": 0.00019223894637817498, "loss": 1.0925, "step": 1700 }, { "epoch": 0.2114040754007869, "grad_norm": 0.03195160627365112, "learning_rate": 0.00019176857949200376, "loss": 1.0849, "step": 1800 }, { "epoch": 0.22314874625638617, "grad_norm": 0.031754713505506516, "learning_rate": 0.00019129821260583255, "loss": 1.0872, "step": 1900 }, { "epoch": 0.23489341711198544, "grad_norm": 0.0315755270421505, "learning_rate": 0.00019082784571966134, "loss": 1.0899, "step": 2000 }, { "epoch": 0.2466380879675847, "grad_norm": 0.03236432373523712, "learning_rate": 0.00019035747883349013, "loss": 1.0846, "step": 2100 }, { "epoch": 0.25838275882318396, "grad_norm": 0.034620147198438644, "learning_rate": 0.00018988711194731892, "loss": 1.0813, "step": 2200 }, { "epoch": 0.2701274296787832, "grad_norm": 0.03313825652003288, "learning_rate": 0.0001894167450611477, "loss": 1.0829, "step": 2300 }, { "epoch": 0.2818721005343825, "grad_norm": 0.033103689551353455, "learning_rate": 0.0001889463781749765, "loss": 1.0889, "step": 2400 }, { "epoch": 0.2936167713899818, "grad_norm": 0.0330510288476944, "learning_rate": 0.0001884760112888053, "loss": 1.0792, "step": 2500 }, { "epoch": 0.3053614422455811, "grad_norm": 0.03478708490729332, "learning_rate": 0.00018800564440263405, "loss": 1.0819, "step": 2600 }, { "epoch": 0.31710611310118036, "grad_norm": 0.033343035727739334, "learning_rate": 0.00018753527751646284, "loss": 1.0774, "step": 2700 }, { "epoch": 0.32885078395677964, "grad_norm": 0.034466274082660675, "learning_rate": 0.00018706491063029163, "loss": 1.0814, "step": 2800 }, { "epoch": 0.3405954548123789, "grad_norm": 0.034216560423374176, "learning_rate": 0.00018659454374412042, "loss": 1.0823, "step": 2900 }, { "epoch": 0.3523401256679782, "grad_norm": 0.033088088035583496, "learning_rate": 0.0001861241768579492, "loss": 1.0745, "step": 3000 }, { "epoch": 0.36408479652357745, "grad_norm": 0.03337638080120087, "learning_rate": 0.000185653809971778, "loss": 1.0876, "step": 3100 }, { "epoch": 0.3758294673791767, "grad_norm": 0.03440937399864197, "learning_rate": 0.00018518344308560678, "loss": 1.0708, "step": 3200 }, { "epoch": 0.387574138234776, "grad_norm": 0.03419345244765282, "learning_rate": 0.00018471307619943557, "loss": 1.0849, "step": 3300 }, { "epoch": 0.39931880909037526, "grad_norm": 0.03453630208969116, "learning_rate": 0.00018424270931326436, "loss": 1.0716, "step": 3400 }, { "epoch": 0.41106347994597453, "grad_norm": 0.033510930836200714, "learning_rate": 0.00018377234242709312, "loss": 1.0708, "step": 3500 }, { "epoch": 0.4228081508015738, "grad_norm": 0.03349142521619797, "learning_rate": 0.00018330197554092191, "loss": 1.0813, "step": 3600 }, { "epoch": 0.43455282165717307, "grad_norm": 0.0351400151848793, "learning_rate": 0.0001828316086547507, "loss": 1.0835, "step": 3700 }, { "epoch": 0.44629749251277234, "grad_norm": 0.034197255969047546, "learning_rate": 0.0001823612417685795, "loss": 1.0762, "step": 3800 }, { "epoch": 0.4580421633683716, "grad_norm": 0.03609395772218704, "learning_rate": 0.00018189087488240828, "loss": 1.0665, "step": 3900 }, { "epoch": 0.4697868342239709, "grad_norm": 0.034626953303813934, "learning_rate": 0.00018142050799623707, "loss": 1.073, "step": 4000 }, { "epoch": 0.48153150507957015, "grad_norm": 0.03524584695696831, "learning_rate": 0.00018095014111006586, "loss": 1.0655, "step": 4100 }, { "epoch": 0.4932761759351694, "grad_norm": 0.0350763201713562, "learning_rate": 0.00018047977422389465, "loss": 1.0733, "step": 4200 }, { "epoch": 0.5050208467907686, "grad_norm": 0.0361902192234993, "learning_rate": 0.0001800094073377234, "loss": 1.0837, "step": 4300 }, { "epoch": 0.5167655176463679, "grad_norm": 0.03573866933584213, "learning_rate": 0.0001795390404515522, "loss": 1.0807, "step": 4400 }, { "epoch": 0.5285101885019672, "grad_norm": 0.03497539460659027, "learning_rate": 0.000179068673565381, "loss": 1.0599, "step": 4500 }, { "epoch": 0.5402548593575665, "grad_norm": 0.034839775413274765, "learning_rate": 0.00017859830667920978, "loss": 1.0732, "step": 4600 }, { "epoch": 0.5519995302131657, "grad_norm": 0.035876356065273285, "learning_rate": 0.0001781279397930386, "loss": 1.0674, "step": 4700 }, { "epoch": 0.563744201068765, "grad_norm": 0.034889355301856995, "learning_rate": 0.00017765757290686738, "loss": 1.0701, "step": 4800 }, { "epoch": 0.5754888719243643, "grad_norm": 0.03712477535009384, "learning_rate": 0.00017718720602069617, "loss": 1.0532, "step": 4900 }, { "epoch": 0.5872335427799636, "grad_norm": 0.036021534353494644, "learning_rate": 0.00017671683913452493, "loss": 1.0734, "step": 5000 }, { "epoch": 0.5989782136355629, "grad_norm": 0.03627597168087959, "learning_rate": 0.00017624647224835372, "loss": 1.0704, "step": 5100 }, { "epoch": 0.6107228844911622, "grad_norm": 0.03433089703321457, "learning_rate": 0.0001757761053621825, "loss": 1.0741, "step": 5200 }, { "epoch": 0.6224675553467615, "grad_norm": 0.03627678006887436, "learning_rate": 0.0001753057384760113, "loss": 1.0626, "step": 5300 }, { "epoch": 0.6342122262023607, "grad_norm": 0.035330165177583694, "learning_rate": 0.0001748353715898401, "loss": 1.0597, "step": 5400 }, { "epoch": 0.64595689705796, "grad_norm": 0.03662644326686859, "learning_rate": 0.00017436500470366888, "loss": 1.0712, "step": 5500 }, { "epoch": 0.6577015679135593, "grad_norm": 0.03487861156463623, "learning_rate": 0.00017389463781749767, "loss": 1.0526, "step": 5600 }, { "epoch": 0.6694462387691585, "grad_norm": 0.036091409623622894, "learning_rate": 0.00017342427093132646, "loss": 1.0641, "step": 5700 }, { "epoch": 0.6811909096247578, "grad_norm": 0.037575751543045044, "learning_rate": 0.00017295390404515525, "loss": 1.0593, "step": 5800 }, { "epoch": 0.6929355804803571, "grad_norm": 0.03606625273823738, "learning_rate": 0.000172483537158984, "loss": 1.067, "step": 5900 }, { "epoch": 0.7046802513359564, "grad_norm": 0.03537227585911751, "learning_rate": 0.0001720131702728128, "loss": 1.0635, "step": 6000 }, { "epoch": 0.7164249221915556, "grad_norm": 0.03722114861011505, "learning_rate": 0.0001715428033866416, "loss": 1.0704, "step": 6100 }, { "epoch": 0.7281695930471549, "grad_norm": 0.03697160631418228, "learning_rate": 0.00017107243650047038, "loss": 1.0523, "step": 6200 }, { "epoch": 0.7399142639027542, "grad_norm": 0.03575093299150467, "learning_rate": 0.00017060206961429917, "loss": 1.0629, "step": 6300 }, { "epoch": 0.7516589347583534, "grad_norm": 0.0371549054980278, "learning_rate": 0.00017013170272812795, "loss": 1.068, "step": 6400 }, { "epoch": 0.7634036056139527, "grad_norm": 0.03723159059882164, "learning_rate": 0.00016966133584195674, "loss": 1.0515, "step": 6500 }, { "epoch": 0.775148276469552, "grad_norm": 0.03721994906663895, "learning_rate": 0.00016919096895578553, "loss": 1.0628, "step": 6600 }, { "epoch": 0.7868929473251512, "grad_norm": 0.03621937334537506, "learning_rate": 0.00016872060206961432, "loss": 1.0691, "step": 6700 }, { "epoch": 0.7986376181807505, "grad_norm": 0.03620177507400513, "learning_rate": 0.00016825023518344308, "loss": 1.0455, "step": 6800 }, { "epoch": 0.8103822890363498, "grad_norm": 0.037204090505838394, "learning_rate": 0.00016777986829727187, "loss": 1.072, "step": 6900 }, { "epoch": 0.8221269598919491, "grad_norm": 0.03673955425620079, "learning_rate": 0.00016730950141110066, "loss": 1.0649, "step": 7000 }, { "epoch": 0.8338716307475483, "grad_norm": 0.036923281848430634, "learning_rate": 0.00016683913452492945, "loss": 1.0661, "step": 7100 }, { "epoch": 0.8456163016031476, "grad_norm": 0.03734573721885681, "learning_rate": 0.00016636876763875824, "loss": 1.0636, "step": 7200 }, { "epoch": 0.8573609724587469, "grad_norm": 0.03644491732120514, "learning_rate": 0.00016589840075258703, "loss": 1.0558, "step": 7300 }, { "epoch": 0.8691056433143461, "grad_norm": 0.03660232573747635, "learning_rate": 0.00016542803386641582, "loss": 1.0671, "step": 7400 }, { "epoch": 0.8808503141699454, "grad_norm": 0.036776404827833176, "learning_rate": 0.0001649576669802446, "loss": 1.0643, "step": 7500 }, { "epoch": 0.8925949850255447, "grad_norm": 0.03714260086417198, "learning_rate": 0.00016448730009407337, "loss": 1.0645, "step": 7600 }, { "epoch": 0.904339655881144, "grad_norm": 0.03652457147836685, "learning_rate": 0.00016401693320790216, "loss": 1.0541, "step": 7700 }, { "epoch": 0.9160843267367432, "grad_norm": 0.03829098492860794, "learning_rate": 0.00016354656632173095, "loss": 1.0585, "step": 7800 }, { "epoch": 0.9278289975923425, "grad_norm": 0.036905597895383835, "learning_rate": 0.00016307619943555974, "loss": 1.0563, "step": 7900 }, { "epoch": 0.9395736684479418, "grad_norm": 0.036890897899866104, "learning_rate": 0.00016260583254938853, "loss": 1.059, "step": 8000 }, { "epoch": 0.951318339303541, "grad_norm": 0.03704727813601494, "learning_rate": 0.00016213546566321731, "loss": 1.0567, "step": 8100 }, { "epoch": 0.9630630101591403, "grad_norm": 0.03992870822548866, "learning_rate": 0.0001616650987770461, "loss": 1.0675, "step": 8200 }, { "epoch": 0.9748076810147396, "grad_norm": 0.03689022362232208, "learning_rate": 0.0001611947318908749, "loss": 1.0592, "step": 8300 }, { "epoch": 0.9865523518703389, "grad_norm": 0.03615827485918999, "learning_rate": 0.00016072436500470368, "loss": 1.0404, "step": 8400 }, { "epoch": 0.9982970227259381, "grad_norm": 0.03881993889808655, "learning_rate": 0.00016025399811853244, "loss": 1.0506, "step": 8500 }, { "epoch": 1.0100416935815373, "grad_norm": 0.03911367058753967, "learning_rate": 0.00015978363123236123, "loss": 1.0508, "step": 8600 }, { "epoch": 1.0217863644371366, "grad_norm": 0.03765745460987091, "learning_rate": 0.00015931326434619002, "loss": 1.0432, "step": 8700 }, { "epoch": 1.0335310352927358, "grad_norm": 0.038481660187244415, "learning_rate": 0.0001588428974600188, "loss": 1.0493, "step": 8800 }, { "epoch": 1.045275706148335, "grad_norm": 0.03929019346833229, "learning_rate": 0.0001583725305738476, "loss": 1.0584, "step": 8900 }, { "epoch": 1.0570203770039344, "grad_norm": 0.03770457208156586, "learning_rate": 0.0001579021636876764, "loss": 1.0497, "step": 9000 }, { "epoch": 1.0687650478595336, "grad_norm": 0.037688400596380234, "learning_rate": 0.00015743179680150518, "loss": 1.0432, "step": 9100 }, { "epoch": 1.080509718715133, "grad_norm": 0.0388583168387413, "learning_rate": 0.00015696142991533397, "loss": 1.0406, "step": 9200 }, { "epoch": 1.0922543895707322, "grad_norm": 0.0381295308470726, "learning_rate": 0.00015649106302916276, "loss": 1.0343, "step": 9300 }, { "epoch": 1.1039990604263314, "grad_norm": 0.03957001864910126, "learning_rate": 0.00015602069614299155, "loss": 1.0445, "step": 9400 }, { "epoch": 1.1157437312819307, "grad_norm": 0.03933073207736015, "learning_rate": 0.00015555032925682034, "loss": 1.0498, "step": 9500 }, { "epoch": 1.12748840213753, "grad_norm": 0.03816806897521019, "learning_rate": 0.00015507996237064912, "loss": 1.0441, "step": 9600 }, { "epoch": 1.1392330729931293, "grad_norm": 0.038057826459407806, "learning_rate": 0.0001546095954844779, "loss": 1.0404, "step": 9700 }, { "epoch": 1.1509777438487285, "grad_norm": 0.03927973657846451, "learning_rate": 0.0001541392285983067, "loss": 1.0435, "step": 9800 }, { "epoch": 1.1627224147043278, "grad_norm": 0.039503954350948334, "learning_rate": 0.0001536688617121355, "loss": 1.0592, "step": 9900 }, { "epoch": 1.174467085559927, "grad_norm": 0.04005419462919235, "learning_rate": 0.00015319849482596428, "loss": 1.0445, "step": 10000 }, { "epoch": 1.1862117564155263, "grad_norm": 0.03986848145723343, "learning_rate": 0.00015272812793979304, "loss": 1.0569, "step": 10100 }, { "epoch": 1.1979564272711256, "grad_norm": 0.03886035457253456, "learning_rate": 0.00015225776105362183, "loss": 1.0473, "step": 10200 }, { "epoch": 1.2097010981267249, "grad_norm": 0.03941928222775459, "learning_rate": 0.00015178739416745062, "loss": 1.0399, "step": 10300 }, { "epoch": 1.2214457689823242, "grad_norm": 0.04138774052262306, "learning_rate": 0.0001513170272812794, "loss": 1.0387, "step": 10400 }, { "epoch": 1.2331904398379234, "grad_norm": 0.040227312594652176, "learning_rate": 0.0001508466603951082, "loss": 1.0592, "step": 10500 }, { "epoch": 1.2449351106935227, "grad_norm": 0.03977705165743828, "learning_rate": 0.000150376293508937, "loss": 1.047, "step": 10600 }, { "epoch": 1.256679781549122, "grad_norm": 0.040624938905239105, "learning_rate": 0.00014990592662276578, "loss": 1.0528, "step": 10700 }, { "epoch": 1.2684244524047212, "grad_norm": 0.03948456794023514, "learning_rate": 0.00014943555973659457, "loss": 1.0413, "step": 10800 }, { "epoch": 1.2801691232603205, "grad_norm": 0.0396355502307415, "learning_rate": 0.00014896519285042333, "loss": 1.0403, "step": 10900 }, { "epoch": 1.2919137941159198, "grad_norm": 0.04015343636274338, "learning_rate": 0.00014849482596425212, "loss": 1.057, "step": 11000 }, { "epoch": 1.303658464971519, "grad_norm": 0.03963370993733406, "learning_rate": 0.0001480244590780809, "loss": 1.0474, "step": 11100 }, { "epoch": 1.3154031358271183, "grad_norm": 0.03991986811161041, "learning_rate": 0.0001475540921919097, "loss": 1.0333, "step": 11200 }, { "epoch": 1.3271478066827176, "grad_norm": 0.040591537952423096, "learning_rate": 0.00014708372530573848, "loss": 1.0401, "step": 11300 }, { "epoch": 1.3388924775383169, "grad_norm": 0.039190664887428284, "learning_rate": 0.00014661335841956727, "loss": 1.0467, "step": 11400 }, { "epoch": 1.3506371483939161, "grad_norm": 0.03903213143348694, "learning_rate": 0.00014614299153339606, "loss": 1.0557, "step": 11500 }, { "epoch": 1.3623818192495154, "grad_norm": 0.03976823762059212, "learning_rate": 0.00014567262464722485, "loss": 1.055, "step": 11600 }, { "epoch": 1.3741264901051147, "grad_norm": 0.04024571180343628, "learning_rate": 0.00014520225776105364, "loss": 1.0375, "step": 11700 }, { "epoch": 1.385871160960714, "grad_norm": 0.040485769510269165, "learning_rate": 0.0001447318908748824, "loss": 1.0407, "step": 11800 }, { "epoch": 1.3976158318163132, "grad_norm": 0.04040844738483429, "learning_rate": 0.0001442615239887112, "loss": 1.0395, "step": 11900 }, { "epoch": 1.4093605026719125, "grad_norm": 0.039215583354234695, "learning_rate": 0.00014379115710253998, "loss": 1.0417, "step": 12000 }, { "epoch": 1.4211051735275118, "grad_norm": 0.040224071592092514, "learning_rate": 0.00014332079021636877, "loss": 1.0342, "step": 12100 }, { "epoch": 1.432849844383111, "grad_norm": 0.03919661417603493, "learning_rate": 0.00014285042333019756, "loss": 1.0396, "step": 12200 }, { "epoch": 1.4445945152387103, "grad_norm": 0.0384608618915081, "learning_rate": 0.00014238005644402635, "loss": 1.0467, "step": 12300 }, { "epoch": 1.4563391860943098, "grad_norm": 0.039908237755298615, "learning_rate": 0.00014190968955785514, "loss": 1.0539, "step": 12400 }, { "epoch": 1.468083856949909, "grad_norm": 0.0399697907269001, "learning_rate": 0.00014143932267168393, "loss": 1.0445, "step": 12500 }, { "epoch": 1.4798285278055083, "grad_norm": 0.04020760953426361, "learning_rate": 0.0001409689557855127, "loss": 1.0456, "step": 12600 }, { "epoch": 1.4915731986611076, "grad_norm": 0.04049207270145416, "learning_rate": 0.00014049858889934148, "loss": 1.0402, "step": 12700 }, { "epoch": 1.5033178695167067, "grad_norm": 0.0405813567340374, "learning_rate": 0.00014002822201317027, "loss": 1.0327, "step": 12800 }, { "epoch": 1.515062540372306, "grad_norm": 0.04098460450768471, "learning_rate": 0.00013955785512699906, "loss": 1.0426, "step": 12900 }, { "epoch": 1.5268072112279052, "grad_norm": 0.03912067785859108, "learning_rate": 0.00013908748824082785, "loss": 1.0398, "step": 13000 }, { "epoch": 1.5385518820835045, "grad_norm": 0.040918510407209396, "learning_rate": 0.00013861712135465663, "loss": 1.0511, "step": 13100 }, { "epoch": 1.5502965529391037, "grad_norm": 0.039643533527851105, "learning_rate": 0.00013814675446848542, "loss": 1.0408, "step": 13200 }, { "epoch": 1.562041223794703, "grad_norm": 0.04073023423552513, "learning_rate": 0.0001376763875823142, "loss": 1.0465, "step": 13300 }, { "epoch": 1.5737858946503023, "grad_norm": 0.03993350267410278, "learning_rate": 0.000137206020696143, "loss": 1.0481, "step": 13400 }, { "epoch": 1.5855305655059015, "grad_norm": 0.0393822006881237, "learning_rate": 0.00013673565380997176, "loss": 1.0378, "step": 13500 }, { "epoch": 1.5972752363615008, "grad_norm": 0.041657913476228714, "learning_rate": 0.00013626528692380055, "loss": 1.0305, "step": 13600 }, { "epoch": 1.6090199072171, "grad_norm": 0.0409579873085022, "learning_rate": 0.00013579492003762934, "loss": 1.0433, "step": 13700 }, { "epoch": 1.6207645780726994, "grad_norm": 0.039673928171396255, "learning_rate": 0.00013532455315145813, "loss": 1.0286, "step": 13800 }, { "epoch": 1.6325092489282986, "grad_norm": 0.04071459546685219, "learning_rate": 0.00013485418626528692, "loss": 1.0413, "step": 13900 }, { "epoch": 1.644253919783898, "grad_norm": 0.04058365523815155, "learning_rate": 0.00013438381937911574, "loss": 1.0465, "step": 14000 }, { "epoch": 1.6559985906394972, "grad_norm": 0.04230272024869919, "learning_rate": 0.00013391345249294453, "loss": 1.0304, "step": 14100 }, { "epoch": 1.6677432614950964, "grad_norm": 0.04120560362935066, "learning_rate": 0.0001334430856067733, "loss": 1.0459, "step": 14200 }, { "epoch": 1.6794879323506957, "grad_norm": 0.0403909832239151, "learning_rate": 0.00013297271872060208, "loss": 1.0402, "step": 14300 }, { "epoch": 1.691232603206295, "grad_norm": 0.04099250212311745, "learning_rate": 0.00013250235183443087, "loss": 1.0477, "step": 14400 }, { "epoch": 1.7029772740618943, "grad_norm": 0.039906516671180725, "learning_rate": 0.00013203198494825965, "loss": 1.057, "step": 14500 }, { "epoch": 1.7147219449174935, "grad_norm": 0.04008522629737854, "learning_rate": 0.00013156161806208844, "loss": 1.0433, "step": 14600 }, { "epoch": 1.7264666157730928, "grad_norm": 0.0416274331510067, "learning_rate": 0.00013109125117591723, "loss": 1.0405, "step": 14700 }, { "epoch": 1.738211286628692, "grad_norm": 0.04142718017101288, "learning_rate": 0.00013062088428974602, "loss": 1.0349, "step": 14800 }, { "epoch": 1.7499559574842916, "grad_norm": 0.0407978855073452, "learning_rate": 0.0001301505174035748, "loss": 1.0415, "step": 14900 }, { "epoch": 1.7617006283398908, "grad_norm": 0.03977083042263985, "learning_rate": 0.0001296801505174036, "loss": 1.0522, "step": 15000 }, { "epoch": 1.77344529919549, "grad_norm": 0.04186280444264412, "learning_rate": 0.00012920978363123236, "loss": 1.0515, "step": 15100 }, { "epoch": 1.7851899700510894, "grad_norm": 0.04049232602119446, "learning_rate": 0.00012873941674506115, "loss": 1.0517, "step": 15200 }, { "epoch": 1.7969346409066886, "grad_norm": 0.039164550602436066, "learning_rate": 0.00012826904985888994, "loss": 1.0403, "step": 15300 }, { "epoch": 1.808679311762288, "grad_norm": 0.04166054725646973, "learning_rate": 0.00012779868297271873, "loss": 1.0469, "step": 15400 }, { "epoch": 1.8204239826178872, "grad_norm": 0.0396597720682621, "learning_rate": 0.00012732831608654752, "loss": 1.0433, "step": 15500 }, { "epoch": 1.8321686534734865, "grad_norm": 0.041060902178287506, "learning_rate": 0.0001268579492003763, "loss": 1.0298, "step": 15600 }, { "epoch": 1.8439133243290857, "grad_norm": 0.04100984334945679, "learning_rate": 0.0001263875823142051, "loss": 1.0423, "step": 15700 }, { "epoch": 1.855657995184685, "grad_norm": 0.03933743014931679, "learning_rate": 0.00012591721542803389, "loss": 1.0459, "step": 15800 }, { "epoch": 1.8674026660402843, "grad_norm": 0.04171588271856308, "learning_rate": 0.00012544684854186265, "loss": 1.0411, "step": 15900 }, { "epoch": 1.8791473368958835, "grad_norm": 0.04075104370713234, "learning_rate": 0.00012497648165569144, "loss": 1.0521, "step": 16000 }, { "epoch": 1.8908920077514828, "grad_norm": 0.04037100449204445, "learning_rate": 0.00012450611476952023, "loss": 1.0482, "step": 16100 }, { "epoch": 1.902636678607082, "grad_norm": 0.04113980382680893, "learning_rate": 0.00012403574788334901, "loss": 1.0335, "step": 16200 }, { "epoch": 1.9143813494626813, "grad_norm": 0.04102000594139099, "learning_rate": 0.0001235653809971778, "loss": 1.0462, "step": 16300 }, { "epoch": 1.9261260203182806, "grad_norm": 0.041277866810560226, "learning_rate": 0.0001230950141110066, "loss": 1.0472, "step": 16400 }, { "epoch": 1.93787069117388, "grad_norm": 0.040296610444784164, "learning_rate": 0.00012262464722483538, "loss": 1.0455, "step": 16500 }, { "epoch": 1.9496153620294792, "grad_norm": 0.04030190408229828, "learning_rate": 0.00012215428033866417, "loss": 1.036, "step": 16600 }, { "epoch": 1.9613600328850784, "grad_norm": 0.04076563939452171, "learning_rate": 0.00012168391345249295, "loss": 1.0355, "step": 16700 }, { "epoch": 1.9731047037406777, "grad_norm": 0.042260345071554184, "learning_rate": 0.00012121354656632174, "loss": 1.0318, "step": 16800 }, { "epoch": 1.984849374596277, "grad_norm": 0.04094604775309563, "learning_rate": 0.00012074317968015052, "loss": 1.0272, "step": 16900 }, { "epoch": 1.9965940454518762, "grad_norm": 0.04106820747256279, "learning_rate": 0.0001202728127939793, "loss": 1.0248, "step": 17000 }, { "epoch": 2.0083387163074753, "grad_norm": 0.041894011199474335, "learning_rate": 0.00011980244590780809, "loss": 1.0415, "step": 17100 }, { "epoch": 2.0200833871630746, "grad_norm": 0.04177311062812805, "learning_rate": 0.00011933207902163688, "loss": 1.034, "step": 17200 }, { "epoch": 2.031828058018674, "grad_norm": 0.04308000206947327, "learning_rate": 0.00011886171213546567, "loss": 1.0164, "step": 17300 }, { "epoch": 2.043572728874273, "grad_norm": 0.04087553173303604, "learning_rate": 0.00011839134524929444, "loss": 1.0271, "step": 17400 }, { "epoch": 2.0553173997298724, "grad_norm": 0.04184258356690407, "learning_rate": 0.00011792097836312323, "loss": 1.0214, "step": 17500 }, { "epoch": 2.0670620705854716, "grad_norm": 0.0424019880592823, "learning_rate": 0.00011745061147695202, "loss": 1.041, "step": 17600 }, { "epoch": 2.078806741441071, "grad_norm": 0.04137306660413742, "learning_rate": 0.00011698024459078081, "loss": 1.0182, "step": 17700 }, { "epoch": 2.09055141229667, "grad_norm": 0.04244280233979225, "learning_rate": 0.0001165098777046096, "loss": 1.0269, "step": 17800 }, { "epoch": 2.1022960831522695, "grad_norm": 0.04111913591623306, "learning_rate": 0.00011603951081843838, "loss": 1.0335, "step": 17900 }, { "epoch": 2.1140407540078687, "grad_norm": 0.0424952507019043, "learning_rate": 0.00011556914393226716, "loss": 1.0266, "step": 18000 }, { "epoch": 2.125785424863468, "grad_norm": 0.0427490659058094, "learning_rate": 0.00011509877704609595, "loss": 1.0274, "step": 18100 }, { "epoch": 2.1375300957190673, "grad_norm": 0.04336949810385704, "learning_rate": 0.00011462841015992474, "loss": 1.0268, "step": 18200 }, { "epoch": 2.1492747665746665, "grad_norm": 0.04262473061680794, "learning_rate": 0.00011415804327375352, "loss": 1.0214, "step": 18300 }, { "epoch": 2.161019437430266, "grad_norm": 0.04471023753285408, "learning_rate": 0.00011368767638758231, "loss": 1.0295, "step": 18400 }, { "epoch": 2.172764108285865, "grad_norm": 0.044171739369630814, "learning_rate": 0.0001132173095014111, "loss": 1.0296, "step": 18500 }, { "epoch": 2.1845087791414644, "grad_norm": 0.04301764816045761, "learning_rate": 0.0001127469426152399, "loss": 1.0258, "step": 18600 }, { "epoch": 2.196253449997064, "grad_norm": 0.04325546696782112, "learning_rate": 0.00011227657572906869, "loss": 1.0288, "step": 18700 }, { "epoch": 2.207998120852663, "grad_norm": 0.04137617349624634, "learning_rate": 0.00011180620884289748, "loss": 1.0385, "step": 18800 }, { "epoch": 2.2197427917082626, "grad_norm": 0.043259892612695694, "learning_rate": 0.00011133584195672627, "loss": 1.0264, "step": 18900 }, { "epoch": 2.2314874625638614, "grad_norm": 0.044129449874162674, "learning_rate": 0.00011086547507055504, "loss": 1.0313, "step": 19000 }, { "epoch": 2.243232133419461, "grad_norm": 0.04146253690123558, "learning_rate": 0.00011039510818438383, "loss": 1.0304, "step": 19100 }, { "epoch": 2.25497680427506, "grad_norm": 0.042836517095565796, "learning_rate": 0.00010992474129821262, "loss": 1.0281, "step": 19200 }, { "epoch": 2.2667214751306597, "grad_norm": 0.04558749496936798, "learning_rate": 0.00010945437441204141, "loss": 1.03, "step": 19300 }, { "epoch": 2.2784661459862585, "grad_norm": 0.04284907504916191, "learning_rate": 0.00010898400752587018, "loss": 1.0469, "step": 19400 }, { "epoch": 2.2902108168418582, "grad_norm": 0.04364863410592079, "learning_rate": 0.00010851364063969897, "loss": 1.0295, "step": 19500 }, { "epoch": 2.301955487697457, "grad_norm": 0.042074691504240036, "learning_rate": 0.00010804327375352776, "loss": 1.0458, "step": 19600 }, { "epoch": 2.3137001585530568, "grad_norm": 0.043050698935985565, "learning_rate": 0.00010757290686735655, "loss": 1.0438, "step": 19700 }, { "epoch": 2.3254448294086556, "grad_norm": 0.04526820033788681, "learning_rate": 0.00010710253998118534, "loss": 1.0283, "step": 19800 }, { "epoch": 2.3371895002642553, "grad_norm": 0.04481109231710434, "learning_rate": 0.00010663217309501412, "loss": 1.0205, "step": 19900 }, { "epoch": 2.348934171119854, "grad_norm": 0.04517560824751854, "learning_rate": 0.0001061618062088429, "loss": 1.0179, "step": 20000 }, { "epoch": 2.3607962886840097, "grad_norm": 0.045082226395606995, "learning_rate": 0.0001056914393226717, "loss": 1.0406, "step": 20100 }, { "epoch": 2.372540959539609, "grad_norm": 0.042884670197963715, "learning_rate": 0.00010522107243650048, "loss": 1.0221, "step": 20200 }, { "epoch": 2.3842856303952082, "grad_norm": 0.04309197515249252, "learning_rate": 0.00010475070555032926, "loss": 1.0287, "step": 20300 }, { "epoch": 2.3960303012508075, "grad_norm": 0.04434290900826454, "learning_rate": 0.00010428033866415805, "loss": 1.0269, "step": 20400 }, { "epoch": 2.407774972106407, "grad_norm": 0.044556260108947754, "learning_rate": 0.00010380997177798684, "loss": 1.0454, "step": 20500 }, { "epoch": 2.419519642962006, "grad_norm": 0.043353039771318436, "learning_rate": 0.00010333960489181563, "loss": 1.0401, "step": 20600 }, { "epoch": 2.4312643138176053, "grad_norm": 0.045345306396484375, "learning_rate": 0.0001028692380056444, "loss": 1.0172, "step": 20700 }, { "epoch": 2.4430089846732046, "grad_norm": 0.043075308203697205, "learning_rate": 0.00010239887111947319, "loss": 1.0413, "step": 20800 }, { "epoch": 2.454753655528804, "grad_norm": 0.044308003038167953, "learning_rate": 0.00010192850423330198, "loss": 1.0292, "step": 20900 }, { "epoch": 2.466498326384403, "grad_norm": 0.04506301134824753, "learning_rate": 0.00010145813734713077, "loss": 1.0382, "step": 21000 }, { "epoch": 2.4782429972400024, "grad_norm": 0.04396146163344383, "learning_rate": 0.00010098777046095956, "loss": 1.0337, "step": 21100 }, { "epoch": 2.4899876680956017, "grad_norm": 0.044499751180410385, "learning_rate": 0.00010051740357478833, "loss": 1.0407, "step": 21200 }, { "epoch": 2.501732338951201, "grad_norm": 0.042769189924001694, "learning_rate": 0.00010004703668861712, "loss": 1.031, "step": 21300 }, { "epoch": 2.5134770098068, "grad_norm": 0.0427669994533062, "learning_rate": 9.957666980244591e-05, "loss": 1.0193, "step": 21400 }, { "epoch": 2.5252216806623995, "grad_norm": 0.04454643651843071, "learning_rate": 9.91063029162747e-05, "loss": 1.0256, "step": 21500 }, { "epoch": 2.5369663515179988, "grad_norm": 0.042179521173238754, "learning_rate": 9.863593603010348e-05, "loss": 1.0293, "step": 21600 }, { "epoch": 2.548711022373598, "grad_norm": 0.04245784506201744, "learning_rate": 9.816556914393227e-05, "loss": 1.0138, "step": 21700 }, { "epoch": 2.5604556932291973, "grad_norm": 0.04282999783754349, "learning_rate": 9.769520225776106e-05, "loss": 1.0216, "step": 21800 }, { "epoch": 2.5722003640847966, "grad_norm": 0.04309820756316185, "learning_rate": 9.722483537158984e-05, "loss": 1.0436, "step": 21900 }, { "epoch": 2.583945034940396, "grad_norm": 0.0428336001932621, "learning_rate": 9.675446848541862e-05, "loss": 1.0327, "step": 22000 }, { "epoch": 2.595689705795995, "grad_norm": 0.0433744341135025, "learning_rate": 9.628410159924742e-05, "loss": 1.0248, "step": 22100 }, { "epoch": 2.6074343766515944, "grad_norm": 0.0440264493227005, "learning_rate": 9.581373471307621e-05, "loss": 1.0276, "step": 22200 }, { "epoch": 2.6191790475071937, "grad_norm": 0.04474351555109024, "learning_rate": 9.5343367826905e-05, "loss": 1.0392, "step": 22300 }, { "epoch": 2.630923718362793, "grad_norm": 0.04529641568660736, "learning_rate": 9.487300094073378e-05, "loss": 1.0436, "step": 22400 }, { "epoch": 2.642668389218392, "grad_norm": 0.04383498802781105, "learning_rate": 9.440263405456257e-05, "loss": 1.0271, "step": 22500 }, { "epoch": 2.6544130600739915, "grad_norm": 0.04416006803512573, "learning_rate": 9.393226716839135e-05, "loss": 1.035, "step": 22600 }, { "epoch": 2.6661577309295907, "grad_norm": 0.0439978651702404, "learning_rate": 9.346190028222014e-05, "loss": 1.0242, "step": 22700 }, { "epoch": 2.67790240178519, "grad_norm": 0.043451737612485886, "learning_rate": 9.299153339604892e-05, "loss": 1.0266, "step": 22800 }, { "epoch": 2.6896470726407893, "grad_norm": 0.043133124709129333, "learning_rate": 9.252116650987771e-05, "loss": 1.0313, "step": 22900 }, { "epoch": 2.7013917434963886, "grad_norm": 0.04365682601928711, "learning_rate": 9.20507996237065e-05, "loss": 1.0164, "step": 23000 }, { "epoch": 2.713136414351988, "grad_norm": 0.045253172516822815, "learning_rate": 9.158043273753529e-05, "loss": 1.0234, "step": 23100 }, { "epoch": 2.724881085207587, "grad_norm": 0.04371510446071625, "learning_rate": 9.111006585136406e-05, "loss": 1.0262, "step": 23200 }, { "epoch": 2.7366257560631864, "grad_norm": 0.04663108289241791, "learning_rate": 9.063969896519285e-05, "loss": 1.027, "step": 23300 }, { "epoch": 2.7483704269187856, "grad_norm": 0.043591178953647614, "learning_rate": 9.016933207902164e-05, "loss": 1.0362, "step": 23400 }, { "epoch": 2.760115097774385, "grad_norm": 0.04423443600535393, "learning_rate": 8.969896519285043e-05, "loss": 1.0262, "step": 23500 }, { "epoch": 2.771859768629984, "grad_norm": 0.045264832675457, "learning_rate": 8.922859830667922e-05, "loss": 1.0227, "step": 23600 }, { "epoch": 2.7836044394855834, "grad_norm": 0.04213082045316696, "learning_rate": 8.8758231420508e-05, "loss": 1.0161, "step": 23700 }, { "epoch": 2.7953491103411827, "grad_norm": 0.04401146247982979, "learning_rate": 8.828786453433678e-05, "loss": 1.0269, "step": 23800 }, { "epoch": 2.807093781196782, "grad_norm": 0.043770622462034225, "learning_rate": 8.781749764816557e-05, "loss": 1.0429, "step": 23900 }, { "epoch": 2.8188384520523813, "grad_norm": 0.04466963931918144, "learning_rate": 8.734713076199436e-05, "loss": 1.0309, "step": 24000 }, { "epoch": 2.8305831229079805, "grad_norm": 0.042598120868206024, "learning_rate": 8.687676387582314e-05, "loss": 1.0321, "step": 24100 }, { "epoch": 2.84232779376358, "grad_norm": 0.04534047096967697, "learning_rate": 8.640639698965193e-05, "loss": 1.0335, "step": 24200 }, { "epoch": 2.854072464619179, "grad_norm": 0.044474124908447266, "learning_rate": 8.593603010348071e-05, "loss": 1.0303, "step": 24300 }, { "epoch": 2.8658171354747783, "grad_norm": 0.04398440942168236, "learning_rate": 8.546566321730952e-05, "loss": 1.0228, "step": 24400 }, { "epoch": 2.8775618063303776, "grad_norm": 0.043202903121709824, "learning_rate": 8.499529633113829e-05, "loss": 1.0299, "step": 24500 }, { "epoch": 2.889306477185977, "grad_norm": 0.04326211288571358, "learning_rate": 8.452492944496708e-05, "loss": 1.0324, "step": 24600 }, { "epoch": 2.901051148041576, "grad_norm": 0.044613469392061234, "learning_rate": 8.405456255879587e-05, "loss": 1.0269, "step": 24700 }, { "epoch": 2.9127958188971754, "grad_norm": 0.04421741142868996, "learning_rate": 8.358419567262466e-05, "loss": 1.0193, "step": 24800 }, { "epoch": 2.9245404897527747, "grad_norm": 0.044949114322662354, "learning_rate": 8.311382878645344e-05, "loss": 1.0296, "step": 24900 }, { "epoch": 2.936285160608374, "grad_norm": 0.044719964265823364, "learning_rate": 8.264346190028222e-05, "loss": 1.031, "step": 25000 }, { "epoch": 2.9480298314639732, "grad_norm": 0.04360034689307213, "learning_rate": 8.217309501411101e-05, "loss": 1.0163, "step": 25100 }, { "epoch": 2.9597745023195725, "grad_norm": 0.0441979356110096, "learning_rate": 8.17027281279398e-05, "loss": 1.0325, "step": 25200 }, { "epoch": 2.971519173175172, "grad_norm": 0.044677652418613434, "learning_rate": 8.123236124176858e-05, "loss": 1.0286, "step": 25300 }, { "epoch": 2.983263844030771, "grad_norm": 0.042885322123765945, "learning_rate": 8.076199435559737e-05, "loss": 1.0365, "step": 25400 }, { "epoch": 2.9950085148863703, "grad_norm": 0.042082566767930984, "learning_rate": 8.029162746942616e-05, "loss": 1.0351, "step": 25500 }, { "epoch": 3.0067531857419696, "grad_norm": 0.04490746557712555, "learning_rate": 7.982126058325495e-05, "loss": 1.0112, "step": 25600 }, { "epoch": 3.018497856597569, "grad_norm": 0.048318084329366684, "learning_rate": 7.935089369708372e-05, "loss": 1.0144, "step": 25700 }, { "epoch": 3.030242527453168, "grad_norm": 0.04372231662273407, "learning_rate": 7.888052681091251e-05, "loss": 1.0081, "step": 25800 }, { "epoch": 3.0419871983087674, "grad_norm": 0.04528006911277771, "learning_rate": 7.84101599247413e-05, "loss": 1.0215, "step": 25900 }, { "epoch": 3.0537318691643667, "grad_norm": 0.04795797914266586, "learning_rate": 7.793979303857009e-05, "loss": 1.0226, "step": 26000 }, { "epoch": 3.065476540019966, "grad_norm": 0.04441961273550987, "learning_rate": 7.746942615239888e-05, "loss": 1.0298, "step": 26100 }, { "epoch": 3.077221210875565, "grad_norm": 0.044861868023872375, "learning_rate": 7.699905926622765e-05, "loss": 1.0158, "step": 26200 }, { "epoch": 3.0889658817311645, "grad_norm": 0.04549916088581085, "learning_rate": 7.652869238005644e-05, "loss": 1.0107, "step": 26300 }, { "epoch": 3.1007105525867638, "grad_norm": 0.04485148563981056, "learning_rate": 7.605832549388523e-05, "loss": 1.0295, "step": 26400 }, { "epoch": 3.112455223442363, "grad_norm": 0.0463709756731987, "learning_rate": 7.558795860771402e-05, "loss": 1.0237, "step": 26500 }, { "epoch": 3.1241998942979623, "grad_norm": 0.04507851600646973, "learning_rate": 7.51175917215428e-05, "loss": 1.0257, "step": 26600 }, { "epoch": 3.1359445651535616, "grad_norm": 0.04443085938692093, "learning_rate": 7.46472248353716e-05, "loss": 1.0161, "step": 26700 }, { "epoch": 3.147689236009161, "grad_norm": 0.04493951424956322, "learning_rate": 7.417685794920039e-05, "loss": 1.0103, "step": 26800 }, { "epoch": 3.15943390686476, "grad_norm": 0.04466501250863075, "learning_rate": 7.370649106302918e-05, "loss": 1.0184, "step": 26900 }, { "epoch": 3.1711785777203594, "grad_norm": 0.04674587398767471, "learning_rate": 7.323612417685795e-05, "loss": 1.0234, "step": 27000 }, { "epoch": 3.1829232485759587, "grad_norm": 0.04568205028772354, "learning_rate": 7.276575729068674e-05, "loss": 1.0177, "step": 27100 }, { "epoch": 3.194667919431558, "grad_norm": 0.04736079275608063, "learning_rate": 7.229539040451553e-05, "loss": 1.0238, "step": 27200 }, { "epoch": 3.206412590287157, "grad_norm": 0.04510754346847534, "learning_rate": 7.182502351834432e-05, "loss": 1.0239, "step": 27300 }, { "epoch": 3.2181572611427565, "grad_norm": 0.04676396772265434, "learning_rate": 7.13546566321731e-05, "loss": 1.0226, "step": 27400 }, { "epoch": 3.2299019319983557, "grad_norm": 0.04639539122581482, "learning_rate": 7.088428974600188e-05, "loss": 1.0304, "step": 27500 }, { "epoch": 3.241646602853955, "grad_norm": 0.046673484146595, "learning_rate": 7.041392285983067e-05, "loss": 1.029, "step": 27600 }, { "epoch": 3.2533912737095543, "grad_norm": 0.04434806853532791, "learning_rate": 6.994355597365946e-05, "loss": 1.0231, "step": 27700 }, { "epoch": 3.2651359445651535, "grad_norm": 0.046948377043008804, "learning_rate": 6.947318908748824e-05, "loss": 1.0212, "step": 27800 }, { "epoch": 3.276880615420753, "grad_norm": 0.045691922307014465, "learning_rate": 6.900282220131703e-05, "loss": 1.0228, "step": 27900 }, { "epoch": 3.288625286276352, "grad_norm": 0.04534591734409332, "learning_rate": 6.853245531514582e-05, "loss": 1.0311, "step": 28000 }, { "epoch": 3.3003699571319514, "grad_norm": 0.045218247920274734, "learning_rate": 6.80620884289746e-05, "loss": 1.0145, "step": 28100 }, { "epoch": 3.3121146279875506, "grad_norm": 0.046320728957653046, "learning_rate": 6.75917215428034e-05, "loss": 1.0184, "step": 28200 }, { "epoch": 3.32385929884315, "grad_norm": 0.04595513269305229, "learning_rate": 6.712135465663217e-05, "loss": 1.0181, "step": 28300 }, { "epoch": 3.335603969698749, "grad_norm": 0.04726444184780121, "learning_rate": 6.665098777046096e-05, "loss": 1.019, "step": 28400 }, { "epoch": 3.3473486405543484, "grad_norm": 0.0476396419107914, "learning_rate": 6.618062088428975e-05, "loss": 1.0109, "step": 28500 }, { "epoch": 3.3590933114099477, "grad_norm": 0.04619845747947693, "learning_rate": 6.571025399811854e-05, "loss": 1.0322, "step": 28600 }, { "epoch": 3.370837982265547, "grad_norm": 0.04548267647624016, "learning_rate": 6.523988711194731e-05, "loss": 1.0088, "step": 28700 }, { "epoch": 3.3825826531211463, "grad_norm": 0.04472291097044945, "learning_rate": 6.47695202257761e-05, "loss": 1.0371, "step": 28800 }, { "epoch": 3.3943273239767455, "grad_norm": 0.04602396488189697, "learning_rate": 6.429915333960489e-05, "loss": 1.0246, "step": 28900 }, { "epoch": 3.406071994832345, "grad_norm": 0.0454532653093338, "learning_rate": 6.382878645343368e-05, "loss": 1.0254, "step": 29000 }, { "epoch": 3.417816665687944, "grad_norm": 0.04494043067097664, "learning_rate": 6.335841956726247e-05, "loss": 1.0193, "step": 29100 }, { "epoch": 3.4295613365435433, "grad_norm": 0.045226361602544785, "learning_rate": 6.288805268109126e-05, "loss": 1.0075, "step": 29200 }, { "epoch": 3.4413060073991426, "grad_norm": 0.04578743502497673, "learning_rate": 6.241768579492005e-05, "loss": 1.0175, "step": 29300 }, { "epoch": 3.453050678254742, "grad_norm": 0.04741055890917778, "learning_rate": 6.194731890874884e-05, "loss": 1.025, "step": 29400 }, { "epoch": 3.464795349110341, "grad_norm": 0.046121254563331604, "learning_rate": 6.147695202257761e-05, "loss": 1.0122, "step": 29500 }, { "epoch": 3.4765400199659404, "grad_norm": 0.04572110250592232, "learning_rate": 6.10065851364064e-05, "loss": 1.0226, "step": 29600 }, { "epoch": 3.4882846908215397, "grad_norm": 0.04542776942253113, "learning_rate": 6.053621825023519e-05, "loss": 1.0139, "step": 29700 }, { "epoch": 3.500029361677139, "grad_norm": 0.04612453654408455, "learning_rate": 6.006585136406397e-05, "loss": 1.0137, "step": 29800 }, { "epoch": 3.5117740325327382, "grad_norm": 0.04599248990416527, "learning_rate": 5.959548447789276e-05, "loss": 1.0254, "step": 29900 }, { "epoch": 3.5235187033883375, "grad_norm": 0.047213006764650345, "learning_rate": 5.9125117591721544e-05, "loss": 1.0202, "step": 30000 }, { "epoch": 3.5352633742439368, "grad_norm": 0.04772693291306496, "learning_rate": 5.865475070555033e-05, "loss": 1.0158, "step": 30100 }, { "epoch": 3.547008045099536, "grad_norm": 0.04780668392777443, "learning_rate": 5.8184383819379116e-05, "loss": 1.0122, "step": 30200 }, { "epoch": 3.5587527159551353, "grad_norm": 0.04593056067824364, "learning_rate": 5.7714016933207905e-05, "loss": 1.0196, "step": 30300 }, { "epoch": 3.5704973868107346, "grad_norm": 0.046468161046504974, "learning_rate": 5.724365004703669e-05, "loss": 1.016, "step": 30400 }, { "epoch": 3.582242057666334, "grad_norm": 0.046613674610853195, "learning_rate": 5.6773283160865476e-05, "loss": 1.028, "step": 30500 }, { "epoch": 3.593986728521933, "grad_norm": 0.0453767292201519, "learning_rate": 5.6302916274694265e-05, "loss": 1.0179, "step": 30600 }, { "epoch": 3.6057313993775324, "grad_norm": 0.0448760949075222, "learning_rate": 5.583254938852305e-05, "loss": 1.0107, "step": 30700 }, { "epoch": 3.6174760702331317, "grad_norm": 0.04624709486961365, "learning_rate": 5.5362182502351837e-05, "loss": 1.0191, "step": 30800 }, { "epoch": 3.629220741088731, "grad_norm": 0.04776296019554138, "learning_rate": 5.489181561618062e-05, "loss": 1.0145, "step": 30900 }, { "epoch": 3.64096541194433, "grad_norm": 0.044639695435762405, "learning_rate": 5.442144873000941e-05, "loss": 1.0286, "step": 31000 }, { "epoch": 3.6527100827999295, "grad_norm": 0.04474237933754921, "learning_rate": 5.395108184383819e-05, "loss": 1.0202, "step": 31100 }, { "epoch": 3.6644547536555288, "grad_norm": 0.045259665697813034, "learning_rate": 5.348071495766698e-05, "loss": 1.0034, "step": 31200 }, { "epoch": 3.676199424511128, "grad_norm": 0.04631993547081947, "learning_rate": 5.3010348071495775e-05, "loss": 1.026, "step": 31300 }, { "epoch": 3.6879440953667273, "grad_norm": 0.04611456021666527, "learning_rate": 5.253998118532456e-05, "loss": 1.0084, "step": 31400 }, { "epoch": 3.6996887662223266, "grad_norm": 0.04521900787949562, "learning_rate": 5.206961429915335e-05, "loss": 1.019, "step": 31500 }, { "epoch": 3.711433437077926, "grad_norm": 0.04682457074522972, "learning_rate": 5.1599247412982136e-05, "loss": 1.029, "step": 31600 }, { "epoch": 3.723178107933525, "grad_norm": 0.04528072476387024, "learning_rate": 5.112888052681092e-05, "loss": 1.0381, "step": 31700 }, { "epoch": 3.7349227787891244, "grad_norm": 0.044861361384391785, "learning_rate": 5.065851364063971e-05, "loss": 1.0215, "step": 31800 }, { "epoch": 3.7466674496447236, "grad_norm": 0.04482056945562363, "learning_rate": 5.018814675446849e-05, "loss": 1.0113, "step": 31900 }, { "epoch": 3.758412120500323, "grad_norm": 0.046249981969594955, "learning_rate": 4.971777986829728e-05, "loss": 1.0162, "step": 32000 }, { "epoch": 3.7702155147102, "grad_norm": 0.04573667049407959, "learning_rate": 4.924741298212606e-05, "loss": 1.0207, "step": 32100 }, { "epoch": 3.7819601855657994, "grad_norm": 0.045440200716257095, "learning_rate": 4.877704609595485e-05, "loss": 1.0204, "step": 32200 }, { "epoch": 3.7937048564213987, "grad_norm": 0.043568406254053116, "learning_rate": 4.830667920978363e-05, "loss": 1.0218, "step": 32300 }, { "epoch": 3.805449527276998, "grad_norm": 0.04695621505379677, "learning_rate": 4.783631232361242e-05, "loss": 1.016, "step": 32400 }, { "epoch": 3.817194198132597, "grad_norm": 0.04511117562651634, "learning_rate": 4.7365945437441204e-05, "loss": 1.018, "step": 32500 }, { "epoch": 3.8289388689881965, "grad_norm": 0.0471058115363121, "learning_rate": 4.689557855126999e-05, "loss": 1.0265, "step": 32600 }, { "epoch": 3.8406835398437957, "grad_norm": 0.04763401299715042, "learning_rate": 4.6425211665098775e-05, "loss": 1.0207, "step": 32700 }, { "epoch": 3.852428210699395, "grad_norm": 0.04863814637064934, "learning_rate": 4.5954844778927564e-05, "loss": 1.0176, "step": 32800 }, { "epoch": 3.8641728815549943, "grad_norm": 0.04665295407176018, "learning_rate": 4.5484477892756347e-05, "loss": 1.0143, "step": 32900 }, { "epoch": 3.8759175524105935, "grad_norm": 0.04519110545516014, "learning_rate": 4.501411100658514e-05, "loss": 1.0206, "step": 33000 }, { "epoch": 3.887662223266193, "grad_norm": 0.04543546214699745, "learning_rate": 4.4543744120413925e-05, "loss": 1.0204, "step": 33100 }, { "epoch": 3.899406894121792, "grad_norm": 0.04484422877430916, "learning_rate": 4.4073377234242714e-05, "loss": 1.0166, "step": 33200 }, { "epoch": 3.9111515649773914, "grad_norm": 0.04460673779249191, "learning_rate": 4.3603010348071496e-05, "loss": 1.0116, "step": 33300 }, { "epoch": 3.9228962358329906, "grad_norm": 0.04780727997422218, "learning_rate": 4.3132643461900285e-05, "loss": 1.0127, "step": 33400 }, { "epoch": 3.93464090668859, "grad_norm": 0.045138854533433914, "learning_rate": 4.2662276575729074e-05, "loss": 1.0173, "step": 33500 }, { "epoch": 3.946385577544189, "grad_norm": 0.046860042959451675, "learning_rate": 4.219190968955786e-05, "loss": 1.0164, "step": 33600 }, { "epoch": 3.9581302483997884, "grad_norm": 0.04478363320231438, "learning_rate": 4.1721542803386646e-05, "loss": 1.0339, "step": 33700 }, { "epoch": 3.9698749192553877, "grad_norm": 0.04575762897729874, "learning_rate": 4.125117591721543e-05, "loss": 1.01, "step": 33800 }, { "epoch": 3.981619590110987, "grad_norm": 0.04523037001490593, "learning_rate": 4.078080903104422e-05, "loss": 1.0145, "step": 33900 }, { "epoch": 3.9933642609665863, "grad_norm": 0.04593048244714737, "learning_rate": 4.0310442144873e-05, "loss": 1.0131, "step": 34000 } ], "logging_steps": 100, "max_steps": 42570, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.302351602279881e+20, "train_batch_size": 9, "trial_name": null, "trial_params": null }