diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10043 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 6252, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0023992322456813818, + "grad_norm": 98.49387406847768, + "learning_rate": 3.9936102236421723e-07, + "loss": 3.4946, + "mean_token_accuracy": 0.4661314785480499, + "step": 5 + }, + { + "epoch": 0.0047984644913627635, + "grad_norm": 99.1562953039722, + "learning_rate": 7.987220447284345e-07, + "loss": 3.3942, + "mean_token_accuracy": 0.4688503980636597, + "step": 10 + }, + { + "epoch": 0.007197696737044146, + "grad_norm": 103.59800633052092, + "learning_rate": 1.1980830670926517e-06, + "loss": 2.8201, + "mean_token_accuracy": 0.5040203750133514, + "step": 15 + }, + { + "epoch": 0.009596928982725527, + "grad_norm": 41.28134143567553, + "learning_rate": 1.597444089456869e-06, + "loss": 2.2437, + "mean_token_accuracy": 0.5471958994865418, + "step": 20 + }, + { + "epoch": 0.01199616122840691, + "grad_norm": 7.008155501822939, + "learning_rate": 1.9968051118210863e-06, + "loss": 1.9265, + "mean_token_accuracy": 0.5786412835121155, + "step": 25 + }, + { + "epoch": 0.014395393474088292, + "grad_norm": 3.7304961794538944, + "learning_rate": 2.3961661341853035e-06, + "loss": 1.6877, + "mean_token_accuracy": 0.6062699258327484, + "step": 30 + }, + { + "epoch": 0.016794625719769675, + "grad_norm": 3.161125047418359, + "learning_rate": 2.7955271565495207e-06, + "loss": 1.5141, + "mean_token_accuracy": 0.6290818691253662, + "step": 35 + }, + { + "epoch": 0.019193857965451054, + "grad_norm": 3.2545444446941714, + "learning_rate": 3.194888178913738e-06, + "loss": 1.3254, + "mean_token_accuracy": 0.6603264272212982, + "step": 40 + }, + { + "epoch": 0.021593090211132437, + "grad_norm": 2.5316219364038686, + "learning_rate": 3.5942492012779555e-06, + "loss": 1.1635, + "mean_token_accuracy": 0.692139345407486, + "step": 45 + }, + { + "epoch": 0.02399232245681382, + "grad_norm": 2.401613446779219, + "learning_rate": 3.993610223642173e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.7093232333660126, + "step": 50 + }, + { + "epoch": 0.026391554702495202, + "grad_norm": 3.26330685077421, + "learning_rate": 4.39297124600639e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7275775074958801, + "step": 55 + }, + { + "epoch": 0.028790786948176585, + "grad_norm": 2.618739831701825, + "learning_rate": 4.792332268370607e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7401251554489136, + "step": 60 + }, + { + "epoch": 0.031190019193857964, + "grad_norm": 2.398659345240197, + "learning_rate": 5.191693290734825e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7501980125904083, + "step": 65 + }, + { + "epoch": 0.03358925143953935, + "grad_norm": 1.9361958724739154, + "learning_rate": 5.591054313099041e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.760456132888794, + "step": 70 + }, + { + "epoch": 0.03598848368522073, + "grad_norm": 1.888737531555722, + "learning_rate": 5.990415335463259e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7615372240543365, + "step": 75 + }, + { + "epoch": 0.03838771593090211, + "grad_norm": 1.7614713374922835, + "learning_rate": 6.389776357827476e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7687996089458465, + "step": 80 + }, + { + "epoch": 0.040786948176583494, + "grad_norm": 1.5424851045060726, + "learning_rate": 6.789137380191693e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7746296703815461, + "step": 85 + }, + { + "epoch": 0.04318618042226487, + "grad_norm": 1.4479842150627094, + "learning_rate": 7.188498402555911e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7788853943347931, + "step": 90 + }, + { + "epoch": 0.04558541266794626, + "grad_norm": 1.573368555060965, + "learning_rate": 7.5878594249201285e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7856944262981415, + "step": 95 + }, + { + "epoch": 0.04798464491362764, + "grad_norm": 1.4141414526158402, + "learning_rate": 7.987220447284345e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.783971655368805, + "step": 100 + }, + { + "epoch": 0.05038387715930902, + "grad_norm": 1.9824310652518375, + "learning_rate": 8.386581469648563e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7859264612197876, + "step": 105 + }, + { + "epoch": 0.052783109404990404, + "grad_norm": 1.9827753371234091, + "learning_rate": 8.78594249201278e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7866788566112518, + "step": 110 + }, + { + "epoch": 0.05518234165067178, + "grad_norm": 1.2728746495222643, + "learning_rate": 9.185303514376996e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7887039840221405, + "step": 115 + }, + { + "epoch": 0.05758157389635317, + "grad_norm": 1.5593829761863378, + "learning_rate": 9.584664536741214e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7892837047576904, + "step": 120 + }, + { + "epoch": 0.05998080614203455, + "grad_norm": 1.33665011351666, + "learning_rate": 9.984025559105432e-06, + "loss": 0.6705, + "mean_token_accuracy": 0.79561066031456, + "step": 125 + }, + { + "epoch": 0.06238003838771593, + "grad_norm": 1.650657003715394, + "learning_rate": 1.038338658146965e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.7983006715774537, + "step": 130 + }, + { + "epoch": 0.0647792706333973, + "grad_norm": 1.2501851092297778, + "learning_rate": 1.0782747603833867e-05, + "loss": 0.6706, + "mean_token_accuracy": 0.7948801577091217, + "step": 135 + }, + { + "epoch": 0.0671785028790787, + "grad_norm": 1.489237643326096, + "learning_rate": 1.1182108626198083e-05, + "loss": 0.669, + "mean_token_accuracy": 0.7941152632236481, + "step": 140 + }, + { + "epoch": 0.06957773512476008, + "grad_norm": 1.1801378516581407, + "learning_rate": 1.1581469648562302e-05, + "loss": 0.6692, + "mean_token_accuracy": 0.7941246449947357, + "step": 145 + }, + { + "epoch": 0.07197696737044146, + "grad_norm": 1.3234370393319912, + "learning_rate": 1.1980830670926518e-05, + "loss": 0.6601, + "mean_token_accuracy": 0.7972006022930145, + "step": 150 + }, + { + "epoch": 0.07437619961612284, + "grad_norm": 1.3266843826076535, + "learning_rate": 1.2380191693290735e-05, + "loss": 0.6431, + "mean_token_accuracy": 0.8009279251098633, + "step": 155 + }, + { + "epoch": 0.07677543186180422, + "grad_norm": 1.171676614531739, + "learning_rate": 1.2779552715654951e-05, + "loss": 0.6441, + "mean_token_accuracy": 0.8012164771556854, + "step": 160 + }, + { + "epoch": 0.07917466410748561, + "grad_norm": 1.191195633173334, + "learning_rate": 1.3178913738019169e-05, + "loss": 0.6313, + "mean_token_accuracy": 0.8047502994537353, + "step": 165 + }, + { + "epoch": 0.08157389635316699, + "grad_norm": 1.2319321923794087, + "learning_rate": 1.3578274760383387e-05, + "loss": 0.6375, + "mean_token_accuracy": 0.8019321501255036, + "step": 170 + }, + { + "epoch": 0.08397312859884837, + "grad_norm": 1.4534520664061426, + "learning_rate": 1.3977635782747606e-05, + "loss": 0.6436, + "mean_token_accuracy": 0.8006344079971314, + "step": 175 + }, + { + "epoch": 0.08637236084452975, + "grad_norm": 1.2484396717014798, + "learning_rate": 1.4376996805111822e-05, + "loss": 0.6298, + "mean_token_accuracy": 0.8044987142086029, + "step": 180 + }, + { + "epoch": 0.08877159309021113, + "grad_norm": 1.401816255311233, + "learning_rate": 1.477635782747604e-05, + "loss": 0.6412, + "mean_token_accuracy": 0.8011281907558441, + "step": 185 + }, + { + "epoch": 0.09117082533589252, + "grad_norm": 1.3511684768747914, + "learning_rate": 1.5175718849840257e-05, + "loss": 0.621, + "mean_token_accuracy": 0.8077153265476227, + "step": 190 + }, + { + "epoch": 0.0935700575815739, + "grad_norm": 1.108573103056433, + "learning_rate": 1.5575079872204475e-05, + "loss": 0.6172, + "mean_token_accuracy": 0.8076364099979401, + "step": 195 + }, + { + "epoch": 0.09596928982725528, + "grad_norm": 1.3243400421217433, + "learning_rate": 1.597444089456869e-05, + "loss": 0.6385, + "mean_token_accuracy": 0.802015745639801, + "step": 200 + }, + { + "epoch": 0.09836852207293666, + "grad_norm": 1.5147183355913882, + "learning_rate": 1.6373801916932906e-05, + "loss": 0.6273, + "mean_token_accuracy": 0.8052714228630066, + "step": 205 + }, + { + "epoch": 0.10076775431861804, + "grad_norm": 1.0814564338138692, + "learning_rate": 1.6773162939297126e-05, + "loss": 0.6159, + "mean_token_accuracy": 0.8079419076442719, + "step": 210 + }, + { + "epoch": 0.10316698656429943, + "grad_norm": 1.1275338205488943, + "learning_rate": 1.7172523961661345e-05, + "loss": 0.6347, + "mean_token_accuracy": 0.801912808418274, + "step": 215 + }, + { + "epoch": 0.10556621880998081, + "grad_norm": 1.0329542633437603, + "learning_rate": 1.757188498402556e-05, + "loss": 0.6151, + "mean_token_accuracy": 0.8077832996845246, + "step": 220 + }, + { + "epoch": 0.10796545105566219, + "grad_norm": 1.0527318791464246, + "learning_rate": 1.7971246006389777e-05, + "loss": 0.5926, + "mean_token_accuracy": 0.8138438284397125, + "step": 225 + }, + { + "epoch": 0.11036468330134357, + "grad_norm": 1.0376015582992664, + "learning_rate": 1.8370607028753993e-05, + "loss": 0.6067, + "mean_token_accuracy": 0.8101162552833557, + "step": 230 + }, + { + "epoch": 0.11276391554702495, + "grad_norm": 1.2251681217622086, + "learning_rate": 1.8769968051118212e-05, + "loss": 0.6252, + "mean_token_accuracy": 0.8043080806732178, + "step": 235 + }, + { + "epoch": 0.11516314779270634, + "grad_norm": 1.1885968023406797, + "learning_rate": 1.9169329073482428e-05, + "loss": 0.6126, + "mean_token_accuracy": 0.8087013304233551, + "step": 240 + }, + { + "epoch": 0.11756238003838772, + "grad_norm": 1.2212514526533986, + "learning_rate": 1.9568690095846644e-05, + "loss": 0.6055, + "mean_token_accuracy": 0.8101045370101929, + "step": 245 + }, + { + "epoch": 0.1199616122840691, + "grad_norm": 1.1976425853454404, + "learning_rate": 1.9968051118210863e-05, + "loss": 0.6204, + "mean_token_accuracy": 0.8052823603153229, + "step": 250 + }, + { + "epoch": 0.12236084452975048, + "grad_norm": 1.0624500234563632, + "learning_rate": 2.0367412140575082e-05, + "loss": 0.6026, + "mean_token_accuracy": 0.8114176630973816, + "step": 255 + }, + { + "epoch": 0.12476007677543186, + "grad_norm": 1.3455443254267732, + "learning_rate": 2.07667731629393e-05, + "loss": 0.6042, + "mean_token_accuracy": 0.8102307736873626, + "step": 260 + }, + { + "epoch": 0.12715930902111325, + "grad_norm": 1.1145770451465071, + "learning_rate": 2.1166134185303514e-05, + "loss": 0.6104, + "mean_token_accuracy": 0.8091982364654541, + "step": 265 + }, + { + "epoch": 0.1295585412667946, + "grad_norm": 0.9619063430955215, + "learning_rate": 2.1565495207667734e-05, + "loss": 0.6051, + "mean_token_accuracy": 0.8094959020614624, + "step": 270 + }, + { + "epoch": 0.131957773512476, + "grad_norm": 1.0999050169577322, + "learning_rate": 2.196485623003195e-05, + "loss": 0.6129, + "mean_token_accuracy": 0.807866895198822, + "step": 275 + }, + { + "epoch": 0.1343570057581574, + "grad_norm": 1.383699374554589, + "learning_rate": 2.2364217252396165e-05, + "loss": 0.6185, + "mean_token_accuracy": 0.8059761583805084, + "step": 280 + }, + { + "epoch": 0.13675623800383876, + "grad_norm": 0.8522287990957228, + "learning_rate": 2.2763578274760385e-05, + "loss": 0.6093, + "mean_token_accuracy": 0.8090029060840607, + "step": 285 + }, + { + "epoch": 0.13915547024952016, + "grad_norm": 0.9617614265565335, + "learning_rate": 2.3162939297124604e-05, + "loss": 0.6076, + "mean_token_accuracy": 0.8090137720108033, + "step": 290 + }, + { + "epoch": 0.14155470249520152, + "grad_norm": 1.0820740523297105, + "learning_rate": 2.356230031948882e-05, + "loss": 0.6108, + "mean_token_accuracy": 0.8079468786716462, + "step": 295 + }, + { + "epoch": 0.14395393474088292, + "grad_norm": 1.208085906040005, + "learning_rate": 2.3961661341853036e-05, + "loss": 0.6137, + "mean_token_accuracy": 0.8066973686218262, + "step": 300 + }, + { + "epoch": 0.1463531669865643, + "grad_norm": 1.2867186093418053, + "learning_rate": 2.4361022364217255e-05, + "loss": 0.6165, + "mean_token_accuracy": 0.806743037700653, + "step": 305 + }, + { + "epoch": 0.14875239923224567, + "grad_norm": 1.2335001938431094, + "learning_rate": 2.476038338658147e-05, + "loss": 0.6184, + "mean_token_accuracy": 0.8057464599609375, + "step": 310 + }, + { + "epoch": 0.15115163147792707, + "grad_norm": 1.1353843569347062, + "learning_rate": 2.515974440894569e-05, + "loss": 0.5888, + "mean_token_accuracy": 0.8137832462787629, + "step": 315 + }, + { + "epoch": 0.15355086372360843, + "grad_norm": 0.9520234538262042, + "learning_rate": 2.5559105431309903e-05, + "loss": 0.6065, + "mean_token_accuracy": 0.8100951492786408, + "step": 320 + }, + { + "epoch": 0.15595009596928983, + "grad_norm": 1.1027272126900591, + "learning_rate": 2.5958466453674125e-05, + "loss": 0.6046, + "mean_token_accuracy": 0.8104814648628235, + "step": 325 + }, + { + "epoch": 0.15834932821497122, + "grad_norm": 0.9494045507221671, + "learning_rate": 2.6357827476038338e-05, + "loss": 0.6021, + "mean_token_accuracy": 0.8098224759101867, + "step": 330 + }, + { + "epoch": 0.16074856046065258, + "grad_norm": 1.064270767277033, + "learning_rate": 2.6757188498402557e-05, + "loss": 0.6028, + "mean_token_accuracy": 0.8100285053253173, + "step": 335 + }, + { + "epoch": 0.16314779270633398, + "grad_norm": 0.9515730626704475, + "learning_rate": 2.7156549520766773e-05, + "loss": 0.6034, + "mean_token_accuracy": 0.809994375705719, + "step": 340 + }, + { + "epoch": 0.16554702495201534, + "grad_norm": 1.1169315271788858, + "learning_rate": 2.7555910543130992e-05, + "loss": 0.617, + "mean_token_accuracy": 0.8061847150325775, + "step": 345 + }, + { + "epoch": 0.16794625719769674, + "grad_norm": 1.0117750051838819, + "learning_rate": 2.7955271565495212e-05, + "loss": 0.5877, + "mean_token_accuracy": 0.8143430769443512, + "step": 350 + }, + { + "epoch": 0.17034548944337813, + "grad_norm": 1.011758660828968, + "learning_rate": 2.8354632587859424e-05, + "loss": 0.5902, + "mean_token_accuracy": 0.813703978061676, + "step": 355 + }, + { + "epoch": 0.1727447216890595, + "grad_norm": 0.991649818964084, + "learning_rate": 2.8753993610223644e-05, + "loss": 0.5934, + "mean_token_accuracy": 0.8137360095977784, + "step": 360 + }, + { + "epoch": 0.1751439539347409, + "grad_norm": 0.9886966147061703, + "learning_rate": 2.915335463258786e-05, + "loss": 0.5835, + "mean_token_accuracy": 0.8160845398902893, + "step": 365 + }, + { + "epoch": 0.17754318618042225, + "grad_norm": 0.9333152812140807, + "learning_rate": 2.955271565495208e-05, + "loss": 0.5912, + "mean_token_accuracy": 0.8131695687770844, + "step": 370 + }, + { + "epoch": 0.17994241842610365, + "grad_norm": 0.8650635350560877, + "learning_rate": 2.9952076677316295e-05, + "loss": 0.5897, + "mean_token_accuracy": 0.8132461369037628, + "step": 375 + }, + { + "epoch": 0.18234165067178504, + "grad_norm": 1.0526197764669132, + "learning_rate": 3.0351437699680514e-05, + "loss": 0.5929, + "mean_token_accuracy": 0.8123046815395355, + "step": 380 + }, + { + "epoch": 0.1847408829174664, + "grad_norm": 0.8702811072498366, + "learning_rate": 3.075079872204473e-05, + "loss": 0.5795, + "mean_token_accuracy": 0.815815019607544, + "step": 385 + }, + { + "epoch": 0.1871401151631478, + "grad_norm": 1.0530301566850875, + "learning_rate": 3.115015974440895e-05, + "loss": 0.5886, + "mean_token_accuracy": 0.8134219169616699, + "step": 390 + }, + { + "epoch": 0.18953934740882916, + "grad_norm": 1.0645206955111999, + "learning_rate": 3.154952076677317e-05, + "loss": 0.6064, + "mean_token_accuracy": 0.8076598584651947, + "step": 395 + }, + { + "epoch": 0.19193857965451055, + "grad_norm": 0.928351802093768, + "learning_rate": 3.194888178913738e-05, + "loss": 0.5986, + "mean_token_accuracy": 0.8101225137710572, + "step": 400 + }, + { + "epoch": 0.19433781190019195, + "grad_norm": 0.8939754967147308, + "learning_rate": 3.23482428115016e-05, + "loss": 0.5881, + "mean_token_accuracy": 0.8137291073799133, + "step": 405 + }, + { + "epoch": 0.1967370441458733, + "grad_norm": 0.8998458172125146, + "learning_rate": 3.274760383386581e-05, + "loss": 0.5801, + "mean_token_accuracy": 0.815822857618332, + "step": 410 + }, + { + "epoch": 0.1991362763915547, + "grad_norm": 0.9045669856998589, + "learning_rate": 3.314696485623003e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.810960841178894, + "step": 415 + }, + { + "epoch": 0.20153550863723607, + "grad_norm": 1.2366817121637053, + "learning_rate": 3.354632587859425e-05, + "loss": 0.6007, + "mean_token_accuracy": 0.8106834948062897, + "step": 420 + }, + { + "epoch": 0.20393474088291746, + "grad_norm": 1.1241688540561303, + "learning_rate": 3.394568690095847e-05, + "loss": 0.5837, + "mean_token_accuracy": 0.8151392281055451, + "step": 425 + }, + { + "epoch": 0.20633397312859886, + "grad_norm": 0.8871443151549863, + "learning_rate": 3.434504792332269e-05, + "loss": 0.5939, + "mean_token_accuracy": 0.8126257479190826, + "step": 430 + }, + { + "epoch": 0.20873320537428022, + "grad_norm": 0.8424395176147732, + "learning_rate": 3.47444089456869e-05, + "loss": 0.5711, + "mean_token_accuracy": 0.8188099205493927, + "step": 435 + }, + { + "epoch": 0.21113243761996162, + "grad_norm": 0.7827214371353761, + "learning_rate": 3.514376996805112e-05, + "loss": 0.577, + "mean_token_accuracy": 0.8164414882659912, + "step": 440 + }, + { + "epoch": 0.21353166986564298, + "grad_norm": 0.9316075347810953, + "learning_rate": 3.5543130990415334e-05, + "loss": 0.5863, + "mean_token_accuracy": 0.8147446632385253, + "step": 445 + }, + { + "epoch": 0.21593090211132437, + "grad_norm": 1.0158313500744018, + "learning_rate": 3.5942492012779554e-05, + "loss": 0.5894, + "mean_token_accuracy": 0.8138821184635162, + "step": 450 + }, + { + "epoch": 0.21833013435700577, + "grad_norm": 0.8530663228237867, + "learning_rate": 3.6341853035143766e-05, + "loss": 0.5796, + "mean_token_accuracy": 0.8160447537899017, + "step": 455 + }, + { + "epoch": 0.22072936660268713, + "grad_norm": 0.8480787301010322, + "learning_rate": 3.6741214057507985e-05, + "loss": 0.5842, + "mean_token_accuracy": 0.8153697609901428, + "step": 460 + }, + { + "epoch": 0.22312859884836853, + "grad_norm": 0.7935009865546706, + "learning_rate": 3.714057507987221e-05, + "loss": 0.5867, + "mean_token_accuracy": 0.8143532335758209, + "step": 465 + }, + { + "epoch": 0.2255278310940499, + "grad_norm": 0.9176259936940574, + "learning_rate": 3.7539936102236424e-05, + "loss": 0.5954, + "mean_token_accuracy": 0.8111553907394409, + "step": 470 + }, + { + "epoch": 0.22792706333973128, + "grad_norm": 0.9937759416482129, + "learning_rate": 3.793929712460064e-05, + "loss": 0.5721, + "mean_token_accuracy": 0.8177870512008667, + "step": 475 + }, + { + "epoch": 0.23032629558541268, + "grad_norm": 0.9396616647288063, + "learning_rate": 3.8338658146964856e-05, + "loss": 0.5639, + "mean_token_accuracy": 0.8213005423545837, + "step": 480 + }, + { + "epoch": 0.23272552783109404, + "grad_norm": 0.9673726655600524, + "learning_rate": 3.8738019169329075e-05, + "loss": 0.5745, + "mean_token_accuracy": 0.8171282410621643, + "step": 485 + }, + { + "epoch": 0.23512476007677544, + "grad_norm": 1.397789609885595, + "learning_rate": 3.913738019169329e-05, + "loss": 0.5965, + "mean_token_accuracy": 0.8120984196662903, + "step": 490 + }, + { + "epoch": 0.2375239923224568, + "grad_norm": 1.4129747649477833, + "learning_rate": 3.953674121405751e-05, + "loss": 0.579, + "mean_token_accuracy": 0.8163260221481323, + "step": 495 + }, + { + "epoch": 0.2399232245681382, + "grad_norm": 0.912512990318671, + "learning_rate": 3.9936102236421726e-05, + "loss": 0.6037, + "mean_token_accuracy": 0.8085700690746307, + "step": 500 + }, + { + "epoch": 0.2423224568138196, + "grad_norm": 1.1301806240169505, + "learning_rate": 4.0335463258785946e-05, + "loss": 0.5844, + "mean_token_accuracy": 0.8150687873363495, + "step": 505 + }, + { + "epoch": 0.24472168905950095, + "grad_norm": 0.7765974643502626, + "learning_rate": 4.0734824281150165e-05, + "loss": 0.5959, + "mean_token_accuracy": 0.811828863620758, + "step": 510 + }, + { + "epoch": 0.24712092130518235, + "grad_norm": 1.3548826892769203, + "learning_rate": 4.113418530351438e-05, + "loss": 0.5868, + "mean_token_accuracy": 0.8138845384120941, + "step": 515 + }, + { + "epoch": 0.2495201535508637, + "grad_norm": 1.1163949052296775, + "learning_rate": 4.15335463258786e-05, + "loss": 0.5879, + "mean_token_accuracy": 0.81446653008461, + "step": 520 + }, + { + "epoch": 0.2519193857965451, + "grad_norm": 0.9714485046081821, + "learning_rate": 4.193290734824281e-05, + "loss": 0.5883, + "mean_token_accuracy": 0.8142540156841278, + "step": 525 + }, + { + "epoch": 0.2543186180422265, + "grad_norm": 0.7557409936221269, + "learning_rate": 4.233226837060703e-05, + "loss": 0.5851, + "mean_token_accuracy": 0.8139719665050507, + "step": 530 + }, + { + "epoch": 0.2567178502879079, + "grad_norm": 0.7496854833800273, + "learning_rate": 4.273162939297125e-05, + "loss": 0.5761, + "mean_token_accuracy": 0.8175175011157989, + "step": 535 + }, + { + "epoch": 0.2591170825335892, + "grad_norm": 0.7888537580262366, + "learning_rate": 4.313099041533547e-05, + "loss": 0.5797, + "mean_token_accuracy": 0.8156923830509186, + "step": 540 + }, + { + "epoch": 0.2615163147792706, + "grad_norm": 0.785622281172855, + "learning_rate": 4.3530351437699686e-05, + "loss": 0.5782, + "mean_token_accuracy": 0.8164088368415833, + "step": 545 + }, + { + "epoch": 0.263915547024952, + "grad_norm": 0.852186691849217, + "learning_rate": 4.39297124600639e-05, + "loss": 0.6022, + "mean_token_accuracy": 0.8093033075332642, + "step": 550 + }, + { + "epoch": 0.2663147792706334, + "grad_norm": 0.7784512087734883, + "learning_rate": 4.432907348242812e-05, + "loss": 0.5823, + "mean_token_accuracy": 0.8160103797912598, + "step": 555 + }, + { + "epoch": 0.2687140115163148, + "grad_norm": 0.8320565617519823, + "learning_rate": 4.472843450479233e-05, + "loss": 0.5939, + "mean_token_accuracy": 0.8122792363166809, + "step": 560 + }, + { + "epoch": 0.27111324376199614, + "grad_norm": 0.8877223112366365, + "learning_rate": 4.512779552715655e-05, + "loss": 0.594, + "mean_token_accuracy": 0.8112381875514985, + "step": 565 + }, + { + "epoch": 0.27351247600767753, + "grad_norm": 0.8868733480541511, + "learning_rate": 4.552715654952077e-05, + "loss": 0.591, + "mean_token_accuracy": 0.8121663808822632, + "step": 570 + }, + { + "epoch": 0.2759117082533589, + "grad_norm": 1.2963706262591022, + "learning_rate": 4.592651757188499e-05, + "loss": 0.6031, + "mean_token_accuracy": 0.8092474520206452, + "step": 575 + }, + { + "epoch": 0.2783109404990403, + "grad_norm": 1.310894898713945, + "learning_rate": 4.632587859424921e-05, + "loss": 0.5782, + "mean_token_accuracy": 0.8162494540214539, + "step": 580 + }, + { + "epoch": 0.2807101727447217, + "grad_norm": 0.8988385867516254, + "learning_rate": 4.672523961661342e-05, + "loss": 0.585, + "mean_token_accuracy": 0.8153525233268738, + "step": 585 + }, + { + "epoch": 0.28310940499040305, + "grad_norm": 0.751447563005443, + "learning_rate": 4.712460063897764e-05, + "loss": 0.5795, + "mean_token_accuracy": 0.816132253408432, + "step": 590 + }, + { + "epoch": 0.28550863723608444, + "grad_norm": 0.6807754316677679, + "learning_rate": 4.752396166134185e-05, + "loss": 0.5661, + "mean_token_accuracy": 0.8204606473445892, + "step": 595 + }, + { + "epoch": 0.28790786948176583, + "grad_norm": 0.8236607801744058, + "learning_rate": 4.792332268370607e-05, + "loss": 0.5864, + "mean_token_accuracy": 0.8139038801193237, + "step": 600 + }, + { + "epoch": 0.2903071017274472, + "grad_norm": 1.0126301458862197, + "learning_rate": 4.832268370607029e-05, + "loss": 0.5753, + "mean_token_accuracy": 0.8181136310100555, + "step": 605 + }, + { + "epoch": 0.2927063339731286, + "grad_norm": 0.8019914579553031, + "learning_rate": 4.872204472843451e-05, + "loss": 0.5542, + "mean_token_accuracy": 0.8230095028877258, + "step": 610 + }, + { + "epoch": 0.29510556621880996, + "grad_norm": 0.7062463323534514, + "learning_rate": 4.912140575079873e-05, + "loss": 0.5741, + "mean_token_accuracy": 0.8183253645896912, + "step": 615 + }, + { + "epoch": 0.29750479846449135, + "grad_norm": 0.6855738394736139, + "learning_rate": 4.952076677316294e-05, + "loss": 0.5805, + "mean_token_accuracy": 0.8156778931617736, + "step": 620 + }, + { + "epoch": 0.29990403071017274, + "grad_norm": 0.6936388947928334, + "learning_rate": 4.992012779552716e-05, + "loss": 0.5755, + "mean_token_accuracy": 0.8175960659980774, + "step": 625 + }, + { + "epoch": 0.30230326295585414, + "grad_norm": 1.0037748634842334, + "learning_rate": 4.996445076430857e-05, + "loss": 0.5822, + "mean_token_accuracy": 0.815348619222641, + "step": 630 + }, + { + "epoch": 0.30470249520153553, + "grad_norm": 6.244854046134436, + "learning_rate": 4.992001421969428e-05, + "loss": 0.5861, + "mean_token_accuracy": 0.8144954383373261, + "step": 635 + }, + { + "epoch": 0.30710172744721687, + "grad_norm": 0.8969783520250975, + "learning_rate": 4.9875577675079985e-05, + "loss": 0.5822, + "mean_token_accuracy": 0.8159963071346283, + "step": 640 + }, + { + "epoch": 0.30950095969289826, + "grad_norm": 0.7734920429461225, + "learning_rate": 4.9831141130465697e-05, + "loss": 0.5744, + "mean_token_accuracy": 0.8177722036838532, + "step": 645 + }, + { + "epoch": 0.31190019193857965, + "grad_norm": 0.7142126577980644, + "learning_rate": 4.978670458585141e-05, + "loss": 0.569, + "mean_token_accuracy": 0.8191527545452117, + "step": 650 + }, + { + "epoch": 0.31429942418426104, + "grad_norm": 0.7382008749131507, + "learning_rate": 4.9742268041237114e-05, + "loss": 0.5682, + "mean_token_accuracy": 0.8193980932235718, + "step": 655 + }, + { + "epoch": 0.31669865642994244, + "grad_norm": 0.7169900376811519, + "learning_rate": 4.9697831496622825e-05, + "loss": 0.5543, + "mean_token_accuracy": 0.8232217609882355, + "step": 660 + }, + { + "epoch": 0.3190978886756238, + "grad_norm": 0.694035172942871, + "learning_rate": 4.965339495200854e-05, + "loss": 0.5706, + "mean_token_accuracy": 0.8184644281864166, + "step": 665 + }, + { + "epoch": 0.32149712092130517, + "grad_norm": 0.6463126718358011, + "learning_rate": 4.960895840739425e-05, + "loss": 0.5737, + "mean_token_accuracy": 0.8177339375019074, + "step": 670 + }, + { + "epoch": 0.32389635316698656, + "grad_norm": 0.829236370161891, + "learning_rate": 4.956452186277995e-05, + "loss": 0.5516, + "mean_token_accuracy": 0.823012375831604, + "step": 675 + }, + { + "epoch": 0.32629558541266795, + "grad_norm": 0.7806704010520713, + "learning_rate": 4.952008531816566e-05, + "loss": 0.5807, + "mean_token_accuracy": 0.8154712855815888, + "step": 680 + }, + { + "epoch": 0.32869481765834935, + "grad_norm": 0.6873888024143939, + "learning_rate": 4.947564877355137e-05, + "loss": 0.5643, + "mean_token_accuracy": 0.8201543867588044, + "step": 685 + }, + { + "epoch": 0.3310940499040307, + "grad_norm": 1.2696445158394363, + "learning_rate": 4.9431212228937076e-05, + "loss": 0.5527, + "mean_token_accuracy": 0.8234491348266602, + "step": 690 + }, + { + "epoch": 0.3334932821497121, + "grad_norm": 0.8417323314066051, + "learning_rate": 4.938677568432279e-05, + "loss": 0.5681, + "mean_token_accuracy": 0.81938636302948, + "step": 695 + }, + { + "epoch": 0.33589251439539347, + "grad_norm": 1.03191767907223, + "learning_rate": 4.93423391397085e-05, + "loss": 0.5659, + "mean_token_accuracy": 0.8202950179576873, + "step": 700 + }, + { + "epoch": 0.33829174664107486, + "grad_norm": 0.838214828399018, + "learning_rate": 4.929790259509421e-05, + "loss": 0.5822, + "mean_token_accuracy": 0.8157517671585083, + "step": 705 + }, + { + "epoch": 0.34069097888675626, + "grad_norm": 0.9470259432449729, + "learning_rate": 4.925346605047992e-05, + "loss": 0.5572, + "mean_token_accuracy": 0.8220482587814331, + "step": 710 + }, + { + "epoch": 0.3430902111324376, + "grad_norm": 0.7616814758368874, + "learning_rate": 4.920902950586563e-05, + "loss": 0.5518, + "mean_token_accuracy": 0.8239187180995942, + "step": 715 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 0.6928452392521073, + "learning_rate": 4.916459296125134e-05, + "loss": 0.5623, + "mean_token_accuracy": 0.8204801857471467, + "step": 720 + }, + { + "epoch": 0.3478886756238004, + "grad_norm": 0.5932890667190485, + "learning_rate": 4.912015641663704e-05, + "loss": 0.5549, + "mean_token_accuracy": 0.8232076942920685, + "step": 725 + }, + { + "epoch": 0.3502879078694818, + "grad_norm": 1.4632469671190833, + "learning_rate": 4.907571987202275e-05, + "loss": 0.563, + "mean_token_accuracy": 0.8213146209716797, + "step": 730 + }, + { + "epoch": 0.35268714011516317, + "grad_norm": 0.6762729658771497, + "learning_rate": 4.903128332740846e-05, + "loss": 0.555, + "mean_token_accuracy": 0.822946411371231, + "step": 735 + }, + { + "epoch": 0.3550863723608445, + "grad_norm": 0.6390018861279814, + "learning_rate": 4.8986846782794174e-05, + "loss": 0.5636, + "mean_token_accuracy": 0.8206958174705505, + "step": 740 + }, + { + "epoch": 0.3574856046065259, + "grad_norm": 0.6032245697331051, + "learning_rate": 4.894241023817988e-05, + "loss": 0.555, + "mean_token_accuracy": 0.8216525256633759, + "step": 745 + }, + { + "epoch": 0.3598848368522073, + "grad_norm": 0.6617068176509739, + "learning_rate": 4.889797369356559e-05, + "loss": 0.5628, + "mean_token_accuracy": 0.8194816887378693, + "step": 750 + }, + { + "epoch": 0.3622840690978887, + "grad_norm": 0.5676852742206375, + "learning_rate": 4.88535371489513e-05, + "loss": 0.5605, + "mean_token_accuracy": 0.8203934669494629, + "step": 755 + }, + { + "epoch": 0.3646833013435701, + "grad_norm": 0.6372895506366388, + "learning_rate": 4.880910060433701e-05, + "loss": 0.5567, + "mean_token_accuracy": 0.8218388557434082, + "step": 760 + }, + { + "epoch": 0.3670825335892514, + "grad_norm": 0.6855330039826973, + "learning_rate": 4.876466405972272e-05, + "loss": 0.5739, + "mean_token_accuracy": 0.8184073507785797, + "step": 765 + }, + { + "epoch": 0.3694817658349328, + "grad_norm": 0.6063969401795531, + "learning_rate": 4.872022751510843e-05, + "loss": 0.5608, + "mean_token_accuracy": 0.8208466172218323, + "step": 770 + }, + { + "epoch": 0.3718809980806142, + "grad_norm": 0.6318300543152913, + "learning_rate": 4.867579097049414e-05, + "loss": 0.5792, + "mean_token_accuracy": 0.8148290395736695, + "step": 775 + }, + { + "epoch": 0.3742802303262956, + "grad_norm": 0.6240652015135113, + "learning_rate": 4.863135442587984e-05, + "loss": 0.5573, + "mean_token_accuracy": 0.8221084117889405, + "step": 780 + }, + { + "epoch": 0.376679462571977, + "grad_norm": 0.6072419176079188, + "learning_rate": 4.8586917881265554e-05, + "loss": 0.5602, + "mean_token_accuracy": 0.820676177740097, + "step": 785 + }, + { + "epoch": 0.3790786948176583, + "grad_norm": 0.6916151373942763, + "learning_rate": 4.8542481336651266e-05, + "loss": 0.5618, + "mean_token_accuracy": 0.8195629417896271, + "step": 790 + }, + { + "epoch": 0.3814779270633397, + "grad_norm": 0.7558352618674623, + "learning_rate": 4.849804479203697e-05, + "loss": 0.5708, + "mean_token_accuracy": 0.8190373599529266, + "step": 795 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 0.8372896189519592, + "learning_rate": 4.845360824742268e-05, + "loss": 0.564, + "mean_token_accuracy": 0.8195176303386689, + "step": 800 + }, + { + "epoch": 0.3862763915547025, + "grad_norm": 0.6238576830422211, + "learning_rate": 4.8409171702808395e-05, + "loss": 0.5574, + "mean_token_accuracy": 0.8217509090900421, + "step": 805 + }, + { + "epoch": 0.3886756238003839, + "grad_norm": 0.6698872279230852, + "learning_rate": 4.83647351581941e-05, + "loss": 0.5836, + "mean_token_accuracy": 0.8135609984397888, + "step": 810 + }, + { + "epoch": 0.39107485604606523, + "grad_norm": 0.6386814692049486, + "learning_rate": 4.832029861357981e-05, + "loss": 0.5611, + "mean_token_accuracy": 0.8212653994560242, + "step": 815 + }, + { + "epoch": 0.3934740882917466, + "grad_norm": 0.75029553320145, + "learning_rate": 4.827586206896552e-05, + "loss": 0.5454, + "mean_token_accuracy": 0.8249085962772369, + "step": 820 + }, + { + "epoch": 0.395873320537428, + "grad_norm": 0.6082151973850165, + "learning_rate": 4.823142552435123e-05, + "loss": 0.5487, + "mean_token_accuracy": 0.8236030340194702, + "step": 825 + }, + { + "epoch": 0.3982725527831094, + "grad_norm": 0.5915943123457897, + "learning_rate": 4.8186988979736933e-05, + "loss": 0.552, + "mean_token_accuracy": 0.8226123511791229, + "step": 830 + }, + { + "epoch": 0.4006717850287908, + "grad_norm": 0.6071974690638059, + "learning_rate": 4.8142552435122645e-05, + "loss": 0.5601, + "mean_token_accuracy": 0.8202442348003387, + "step": 835 + }, + { + "epoch": 0.40307101727447214, + "grad_norm": 0.6125811367551213, + "learning_rate": 4.809811589050836e-05, + "loss": 0.5532, + "mean_token_accuracy": 0.8221201241016388, + "step": 840 + }, + { + "epoch": 0.40547024952015354, + "grad_norm": 0.6682594766455511, + "learning_rate": 4.805367934589406e-05, + "loss": 0.5399, + "mean_token_accuracy": 0.8263704001903533, + "step": 845 + }, + { + "epoch": 0.40786948176583493, + "grad_norm": 0.735519043559457, + "learning_rate": 4.8009242801279774e-05, + "loss": 0.548, + "mean_token_accuracy": 0.8243632435798645, + "step": 850 + }, + { + "epoch": 0.4102687140115163, + "grad_norm": 0.5423593788691745, + "learning_rate": 4.7964806256665486e-05, + "loss": 0.5551, + "mean_token_accuracy": 0.8216099441051483, + "step": 855 + }, + { + "epoch": 0.4126679462571977, + "grad_norm": 0.569128145619009, + "learning_rate": 4.792036971205119e-05, + "loss": 0.5374, + "mean_token_accuracy": 0.8269579291343689, + "step": 860 + }, + { + "epoch": 0.41506717850287905, + "grad_norm": 0.5621821735679401, + "learning_rate": 4.78759331674369e-05, + "loss": 0.5266, + "mean_token_accuracy": 0.8298987448215485, + "step": 865 + }, + { + "epoch": 0.41746641074856045, + "grad_norm": 0.5731699475882329, + "learning_rate": 4.7831496622822615e-05, + "loss": 0.5407, + "mean_token_accuracy": 0.8258492708206177, + "step": 870 + }, + { + "epoch": 0.41986564299424184, + "grad_norm": 0.5662106727038226, + "learning_rate": 4.7787060078208327e-05, + "loss": 0.5382, + "mean_token_accuracy": 0.8260211527347565, + "step": 875 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 0.5906976504172934, + "learning_rate": 4.7742623533594025e-05, + "loss": 0.5585, + "mean_token_accuracy": 0.8203856468200683, + "step": 880 + }, + { + "epoch": 0.4246641074856046, + "grad_norm": 0.5719381852725446, + "learning_rate": 4.769818698897974e-05, + "loss": 0.5298, + "mean_token_accuracy": 0.8297291934490204, + "step": 885 + }, + { + "epoch": 0.42706333973128596, + "grad_norm": 0.6104627055672889, + "learning_rate": 4.765375044436545e-05, + "loss": 0.5574, + "mean_token_accuracy": 0.8215497851371765, + "step": 890 + }, + { + "epoch": 0.42946257197696736, + "grad_norm": 0.5106967506472315, + "learning_rate": 4.7609313899751154e-05, + "loss": 0.5505, + "mean_token_accuracy": 0.8234421014785767, + "step": 895 + }, + { + "epoch": 0.43186180422264875, + "grad_norm": 0.5486180289895729, + "learning_rate": 4.7564877355136865e-05, + "loss": 0.5279, + "mean_token_accuracy": 0.8291941523551941, + "step": 900 + }, + { + "epoch": 0.43426103646833014, + "grad_norm": 0.8110944973636702, + "learning_rate": 4.752044081052258e-05, + "loss": 0.5438, + "mean_token_accuracy": 0.824740606546402, + "step": 905 + }, + { + "epoch": 0.43666026871401153, + "grad_norm": 0.5083722030175633, + "learning_rate": 4.747600426590829e-05, + "loss": 0.5371, + "mean_token_accuracy": 0.8267485558986664, + "step": 910 + }, + { + "epoch": 0.43905950095969287, + "grad_norm": 0.5450657681596905, + "learning_rate": 4.7431567721293994e-05, + "loss": 0.5331, + "mean_token_accuracy": 0.8280423700809478, + "step": 915 + }, + { + "epoch": 0.44145873320537427, + "grad_norm": 0.5587272673659367, + "learning_rate": 4.7387131176679706e-05, + "loss": 0.533, + "mean_token_accuracy": 0.8275462627410889, + "step": 920 + }, + { + "epoch": 0.44385796545105566, + "grad_norm": 0.597994173280171, + "learning_rate": 4.734269463206542e-05, + "loss": 0.5221, + "mean_token_accuracy": 0.8312480449676514, + "step": 925 + }, + { + "epoch": 0.44625719769673705, + "grad_norm": 0.5699731767132227, + "learning_rate": 4.7298258087451116e-05, + "loss": 0.5417, + "mean_token_accuracy": 0.8253937721252441, + "step": 930 + }, + { + "epoch": 0.44865642994241844, + "grad_norm": 0.530459601961475, + "learning_rate": 4.725382154283683e-05, + "loss": 0.5411, + "mean_token_accuracy": 0.8255383253097535, + "step": 935 + }, + { + "epoch": 0.4510556621880998, + "grad_norm": 0.5786015168216376, + "learning_rate": 4.720938499822254e-05, + "loss": 0.5367, + "mean_token_accuracy": 0.8274415552616119, + "step": 940 + }, + { + "epoch": 0.4534548944337812, + "grad_norm": 0.5352156988298586, + "learning_rate": 4.716494845360825e-05, + "loss": 0.5486, + "mean_token_accuracy": 0.8225701510906219, + "step": 945 + }, + { + "epoch": 0.45585412667946257, + "grad_norm": 0.5537066343867931, + "learning_rate": 4.712051190899396e-05, + "loss": 0.5332, + "mean_token_accuracy": 0.8273627698421478, + "step": 950 + }, + { + "epoch": 0.45825335892514396, + "grad_norm": 0.5496688141801068, + "learning_rate": 4.707607536437967e-05, + "loss": 0.5505, + "mean_token_accuracy": 0.8225115597248077, + "step": 955 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 0.5291584157449768, + "learning_rate": 4.703163881976538e-05, + "loss": 0.5284, + "mean_token_accuracy": 0.829184639453888, + "step": 960 + }, + { + "epoch": 0.4630518234165067, + "grad_norm": 0.6097079936773455, + "learning_rate": 4.6987202275151086e-05, + "loss": 0.5293, + "mean_token_accuracy": 0.8293408989906311, + "step": 965 + }, + { + "epoch": 0.4654510556621881, + "grad_norm": 0.6198873182908643, + "learning_rate": 4.69427657305368e-05, + "loss": 0.523, + "mean_token_accuracy": 0.8304136216640472, + "step": 970 + }, + { + "epoch": 0.4678502879078695, + "grad_norm": 0.5891656862471465, + "learning_rate": 4.689832918592251e-05, + "loss": 0.549, + "mean_token_accuracy": 0.8228592276573181, + "step": 975 + }, + { + "epoch": 0.47024952015355087, + "grad_norm": 0.6107839108650862, + "learning_rate": 4.6853892641308214e-05, + "loss": 0.5365, + "mean_token_accuracy": 0.8271978914737701, + "step": 980 + }, + { + "epoch": 0.47264875239923226, + "grad_norm": 0.571716205851, + "learning_rate": 4.680945609669392e-05, + "loss": 0.5394, + "mean_token_accuracy": 0.8253172099590301, + "step": 985 + }, + { + "epoch": 0.4750479846449136, + "grad_norm": 0.5542308753871588, + "learning_rate": 4.676501955207963e-05, + "loss": 0.5207, + "mean_token_accuracy": 0.8310206890106201, + "step": 990 + }, + { + "epoch": 0.477447216890595, + "grad_norm": 0.5483955302881889, + "learning_rate": 4.672058300746534e-05, + "loss": 0.5275, + "mean_token_accuracy": 0.8294080853462219, + "step": 995 + }, + { + "epoch": 0.4798464491362764, + "grad_norm": 0.580528097435676, + "learning_rate": 4.667614646285105e-05, + "loss": 0.5326, + "mean_token_accuracy": 0.8276298522949219, + "step": 1000 + }, + { + "epoch": 0.4822456813819578, + "grad_norm": 0.5350717699048666, + "learning_rate": 4.663170991823676e-05, + "loss": 0.5315, + "mean_token_accuracy": 0.8278722941875458, + "step": 1005 + }, + { + "epoch": 0.4846449136276392, + "grad_norm": 0.49218288607628896, + "learning_rate": 4.658727337362247e-05, + "loss": 0.5328, + "mean_token_accuracy": 0.8278626799583435, + "step": 1010 + }, + { + "epoch": 0.4870441458733205, + "grad_norm": 0.5134720273160304, + "learning_rate": 4.654283682900818e-05, + "loss": 0.524, + "mean_token_accuracy": 0.830355030298233, + "step": 1015 + }, + { + "epoch": 0.4894433781190019, + "grad_norm": 0.5251369142411848, + "learning_rate": 4.649840028439389e-05, + "loss": 0.5397, + "mean_token_accuracy": 0.8257737815380096, + "step": 1020 + }, + { + "epoch": 0.4918426103646833, + "grad_norm": 0.5157247220630689, + "learning_rate": 4.64539637397796e-05, + "loss": 0.5237, + "mean_token_accuracy": 0.8299675941467285, + "step": 1025 + }, + { + "epoch": 0.4942418426103647, + "grad_norm": 0.4859001572426086, + "learning_rate": 4.6409527195165306e-05, + "loss": 0.5277, + "mean_token_accuracy": 0.8294627845287323, + "step": 1030 + }, + { + "epoch": 0.4966410748560461, + "grad_norm": 0.561175024557525, + "learning_rate": 4.636509065055101e-05, + "loss": 0.5455, + "mean_token_accuracy": 0.8240741550922394, + "step": 1035 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 0.5338562625947265, + "learning_rate": 4.632065410593672e-05, + "loss": 0.5374, + "mean_token_accuracy": 0.8271634161472321, + "step": 1040 + }, + { + "epoch": 0.5014395393474088, + "grad_norm": 0.515601873079243, + "learning_rate": 4.6276217561322435e-05, + "loss": 0.5345, + "mean_token_accuracy": 0.8272102892398834, + "step": 1045 + }, + { + "epoch": 0.5038387715930902, + "grad_norm": 0.6177024352288669, + "learning_rate": 4.623178101670814e-05, + "loss": 0.5333, + "mean_token_accuracy": 0.8278423607349396, + "step": 1050 + }, + { + "epoch": 0.5062380038387716, + "grad_norm": 0.5512310415321986, + "learning_rate": 4.618734447209385e-05, + "loss": 0.5242, + "mean_token_accuracy": 0.8300135910511017, + "step": 1055 + }, + { + "epoch": 0.508637236084453, + "grad_norm": 0.5663859189128105, + "learning_rate": 4.6142907927479563e-05, + "loss": 0.5154, + "mean_token_accuracy": 0.8328012764453888, + "step": 1060 + }, + { + "epoch": 0.5110364683301344, + "grad_norm": 0.5773949952752142, + "learning_rate": 4.609847138286527e-05, + "loss": 0.5349, + "mean_token_accuracy": 0.8270993530750275, + "step": 1065 + }, + { + "epoch": 0.5134357005758158, + "grad_norm": 0.6280922601470035, + "learning_rate": 4.605403483825098e-05, + "loss": 0.5348, + "mean_token_accuracy": 0.8269883990287781, + "step": 1070 + }, + { + "epoch": 0.5158349328214972, + "grad_norm": 0.5990994330045243, + "learning_rate": 4.600959829363669e-05, + "loss": 0.5293, + "mean_token_accuracy": 0.8284970939159393, + "step": 1075 + }, + { + "epoch": 0.5182341650671785, + "grad_norm": 0.5656451607094958, + "learning_rate": 4.5965161749022404e-05, + "loss": 0.5277, + "mean_token_accuracy": 0.8288775861263276, + "step": 1080 + }, + { + "epoch": 0.5206333973128598, + "grad_norm": 0.5397614018675521, + "learning_rate": 4.59207252044081e-05, + "loss": 0.5449, + "mean_token_accuracy": 0.8244773209095001, + "step": 1085 + }, + { + "epoch": 0.5230326295585412, + "grad_norm": 0.5669284914862057, + "learning_rate": 4.5876288659793814e-05, + "loss": 0.5148, + "mean_token_accuracy": 0.8320637166500091, + "step": 1090 + }, + { + "epoch": 0.5254318618042226, + "grad_norm": 0.5378169685314852, + "learning_rate": 4.5831852115179526e-05, + "loss": 0.5312, + "mean_token_accuracy": 0.8280978500843048, + "step": 1095 + }, + { + "epoch": 0.527831094049904, + "grad_norm": 0.4785055105443453, + "learning_rate": 4.578741557056523e-05, + "loss": 0.511, + "mean_token_accuracy": 0.8339739978313446, + "step": 1100 + }, + { + "epoch": 0.5302303262955854, + "grad_norm": 0.5033536918525533, + "learning_rate": 4.574297902595094e-05, + "loss": 0.5343, + "mean_token_accuracy": 0.8264274299144745, + "step": 1105 + }, + { + "epoch": 0.5326295585412668, + "grad_norm": 0.5617636032573375, + "learning_rate": 4.5698542481336655e-05, + "loss": 0.5251, + "mean_token_accuracy": 0.8295752882957459, + "step": 1110 + }, + { + "epoch": 0.5350287907869482, + "grad_norm": 0.47649118768675885, + "learning_rate": 4.565410593672237e-05, + "loss": 0.5224, + "mean_token_accuracy": 0.8308883607387543, + "step": 1115 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 0.5332406727760631, + "learning_rate": 4.560966939210807e-05, + "loss": 0.5321, + "mean_token_accuracy": 0.8277345418930053, + "step": 1120 + }, + { + "epoch": 0.539827255278311, + "grad_norm": 0.4782827735405752, + "learning_rate": 4.5565232847493784e-05, + "loss": 0.5268, + "mean_token_accuracy": 0.8292799592018127, + "step": 1125 + }, + { + "epoch": 0.5422264875239923, + "grad_norm": 0.5375370012352939, + "learning_rate": 4.5520796302879495e-05, + "loss": 0.5422, + "mean_token_accuracy": 0.8255633115768433, + "step": 1130 + }, + { + "epoch": 0.5446257197696737, + "grad_norm": 0.693752706192889, + "learning_rate": 4.5476359758265194e-05, + "loss": 0.5298, + "mean_token_accuracy": 0.8276696979999543, + "step": 1135 + }, + { + "epoch": 0.5470249520153551, + "grad_norm": 0.6665627567388126, + "learning_rate": 4.5431923213650906e-05, + "loss": 0.5266, + "mean_token_accuracy": 0.8296854555606842, + "step": 1140 + }, + { + "epoch": 0.5494241842610365, + "grad_norm": 0.5755677817276045, + "learning_rate": 4.538748666903662e-05, + "loss": 0.5178, + "mean_token_accuracy": 0.831487900018692, + "step": 1145 + }, + { + "epoch": 0.5518234165067178, + "grad_norm": 0.5462744381863546, + "learning_rate": 4.534305012442233e-05, + "loss": 0.5204, + "mean_token_accuracy": 0.8311738312244416, + "step": 1150 + }, + { + "epoch": 0.5542226487523992, + "grad_norm": 0.6500856819910776, + "learning_rate": 4.5298613579808034e-05, + "loss": 0.5245, + "mean_token_accuracy": 0.8298864960670471, + "step": 1155 + }, + { + "epoch": 0.5566218809980806, + "grad_norm": 0.6007744834984243, + "learning_rate": 4.5254177035193746e-05, + "loss": 0.5308, + "mean_token_accuracy": 0.8276986002922058, + "step": 1160 + }, + { + "epoch": 0.559021113243762, + "grad_norm": 0.4925324081774109, + "learning_rate": 4.520974049057946e-05, + "loss": 0.5206, + "mean_token_accuracy": 0.8303386211395264, + "step": 1165 + }, + { + "epoch": 0.5614203454894434, + "grad_norm": 0.5198373434963387, + "learning_rate": 4.516530394596516e-05, + "loss": 0.5214, + "mean_token_accuracy": 0.8302409529685975, + "step": 1170 + }, + { + "epoch": 0.5638195777351248, + "grad_norm": 0.5220759377556011, + "learning_rate": 4.5120867401350875e-05, + "loss": 0.5184, + "mean_token_accuracy": 0.8315722942352295, + "step": 1175 + }, + { + "epoch": 0.5662188099808061, + "grad_norm": 0.5114196157128489, + "learning_rate": 4.507643085673659e-05, + "loss": 0.5341, + "mean_token_accuracy": 0.8262094497680664, + "step": 1180 + }, + { + "epoch": 0.5686180422264875, + "grad_norm": 0.5068021513263434, + "learning_rate": 4.503199431212229e-05, + "loss": 0.5187, + "mean_token_accuracy": 0.8312457025051116, + "step": 1185 + }, + { + "epoch": 0.5710172744721689, + "grad_norm": 0.5613227478451318, + "learning_rate": 4.4987557767508e-05, + "loss": 0.5057, + "mean_token_accuracy": 0.8353295505046845, + "step": 1190 + }, + { + "epoch": 0.5734165067178503, + "grad_norm": 0.8284201861643404, + "learning_rate": 4.494312122289371e-05, + "loss": 0.5133, + "mean_token_accuracy": 0.8334067642688752, + "step": 1195 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 0.9210564168780322, + "learning_rate": 4.489868467827942e-05, + "loss": 0.5195, + "mean_token_accuracy": 0.8312300026416779, + "step": 1200 + }, + { + "epoch": 0.5782149712092131, + "grad_norm": 0.5997534401025089, + "learning_rate": 4.4854248133665126e-05, + "loss": 0.532, + "mean_token_accuracy": 0.8277329862117767, + "step": 1205 + }, + { + "epoch": 0.5806142034548945, + "grad_norm": 0.509102977352861, + "learning_rate": 4.480981158905084e-05, + "loss": 0.5229, + "mean_token_accuracy": 0.8301851987838745, + "step": 1210 + }, + { + "epoch": 0.5830134357005758, + "grad_norm": 0.5232298511871786, + "learning_rate": 4.476537504443655e-05, + "loss": 0.5115, + "mean_token_accuracy": 0.8335091173648834, + "step": 1215 + }, + { + "epoch": 0.5854126679462572, + "grad_norm": 1.4520334905055199, + "learning_rate": 4.4720938499822255e-05, + "loss": 0.5266, + "mean_token_accuracy": 0.8287900865077973, + "step": 1220 + }, + { + "epoch": 0.5878119001919386, + "grad_norm": 0.6774712243421297, + "learning_rate": 4.4676501955207966e-05, + "loss": 0.5423, + "mean_token_accuracy": 0.8255094051361084, + "step": 1225 + }, + { + "epoch": 0.5902111324376199, + "grad_norm": 1.5754303835971644, + "learning_rate": 4.463206541059368e-05, + "loss": 0.5188, + "mean_token_accuracy": 0.8306097269058228, + "step": 1230 + }, + { + "epoch": 0.5926103646833013, + "grad_norm": 0.7313851331688538, + "learning_rate": 4.458762886597938e-05, + "loss": 0.5361, + "mean_token_accuracy": 0.8274235844612121, + "step": 1235 + }, + { + "epoch": 0.5950095969289827, + "grad_norm": 0.7811366827057676, + "learning_rate": 4.454319232136509e-05, + "loss": 0.5376, + "mean_token_accuracy": 0.8265086829662323, + "step": 1240 + }, + { + "epoch": 0.5974088291746641, + "grad_norm": 0.5852967202784879, + "learning_rate": 4.44987557767508e-05, + "loss": 0.5222, + "mean_token_accuracy": 0.829976099729538, + "step": 1245 + }, + { + "epoch": 0.5998080614203455, + "grad_norm": 0.6500504682698087, + "learning_rate": 4.445431923213651e-05, + "loss": 0.524, + "mean_token_accuracy": 0.8303073704242706, + "step": 1250 + }, + { + "epoch": 0.6022072936660269, + "grad_norm": 0.5183436136927764, + "learning_rate": 4.440988268752222e-05, + "loss": 0.5219, + "mean_token_accuracy": 0.8303581476211548, + "step": 1255 + }, + { + "epoch": 0.6046065259117083, + "grad_norm": 1.1492046855836786, + "learning_rate": 4.436544614290793e-05, + "loss": 0.5332, + "mean_token_accuracy": 0.8281478404998779, + "step": 1260 + }, + { + "epoch": 0.6070057581573897, + "grad_norm": 0.752249891324009, + "learning_rate": 4.432100959829364e-05, + "loss": 0.512, + "mean_token_accuracy": 0.8334548771381378, + "step": 1265 + }, + { + "epoch": 0.6094049904030711, + "grad_norm": 0.5544631019636599, + "learning_rate": 4.4276573053679346e-05, + "loss": 0.5163, + "mean_token_accuracy": 0.8321223258972168, + "step": 1270 + }, + { + "epoch": 0.6118042226487524, + "grad_norm": 0.5153914235770953, + "learning_rate": 4.423213650906506e-05, + "loss": 0.5296, + "mean_token_accuracy": 0.8289190053939819, + "step": 1275 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 0.47633132901358466, + "learning_rate": 4.418769996445077e-05, + "loss": 0.5277, + "mean_token_accuracy": 0.8282978534698486, + "step": 1280 + }, + { + "epoch": 0.6166026871401151, + "grad_norm": 0.4932196916279109, + "learning_rate": 4.4143263419836475e-05, + "loss": 0.5213, + "mean_token_accuracy": 0.8302229821681977, + "step": 1285 + }, + { + "epoch": 0.6190019193857965, + "grad_norm": 0.5011121697532454, + "learning_rate": 4.409882687522218e-05, + "loss": 0.5225, + "mean_token_accuracy": 0.8293979346752167, + "step": 1290 + }, + { + "epoch": 0.6214011516314779, + "grad_norm": 0.4856904264094217, + "learning_rate": 4.405439033060789e-05, + "loss": 0.5219, + "mean_token_accuracy": 0.829535436630249, + "step": 1295 + }, + { + "epoch": 0.6238003838771593, + "grad_norm": 0.4820149931102137, + "learning_rate": 4.4009953785993604e-05, + "loss": 0.5138, + "mean_token_accuracy": 0.8327163219451904, + "step": 1300 + }, + { + "epoch": 0.6261996161228407, + "grad_norm": 0.4624881398808915, + "learning_rate": 4.396551724137931e-05, + "loss": 0.5157, + "mean_token_accuracy": 0.8317691743373871, + "step": 1305 + }, + { + "epoch": 0.6285988483685221, + "grad_norm": 0.5095347564346104, + "learning_rate": 4.392108069676502e-05, + "loss": 0.5158, + "mean_token_accuracy": 0.8313847720623017, + "step": 1310 + }, + { + "epoch": 0.6309980806142035, + "grad_norm": 0.44685463760378097, + "learning_rate": 4.387664415215073e-05, + "loss": 0.5273, + "mean_token_accuracy": 0.8286812126636505, + "step": 1315 + }, + { + "epoch": 0.6333973128598849, + "grad_norm": 0.4641733447876531, + "learning_rate": 4.383220760753644e-05, + "loss": 0.5146, + "mean_token_accuracy": 0.8313398063182831, + "step": 1320 + }, + { + "epoch": 0.6357965451055663, + "grad_norm": 0.4766102041605914, + "learning_rate": 4.378777106292215e-05, + "loss": 0.5109, + "mean_token_accuracy": 0.8354115903377533, + "step": 1325 + }, + { + "epoch": 0.6381957773512476, + "grad_norm": 0.546100262545878, + "learning_rate": 4.374333451830786e-05, + "loss": 0.5353, + "mean_token_accuracy": 0.8268633961677552, + "step": 1330 + }, + { + "epoch": 0.6405950095969289, + "grad_norm": 0.5356075705213353, + "learning_rate": 4.369889797369357e-05, + "loss": 0.5139, + "mean_token_accuracy": 0.8328723609447479, + "step": 1335 + }, + { + "epoch": 0.6429942418426103, + "grad_norm": 0.5350381311815007, + "learning_rate": 4.365446142907927e-05, + "loss": 0.5182, + "mean_token_accuracy": 0.8312582015991211, + "step": 1340 + }, + { + "epoch": 0.6453934740882917, + "grad_norm": 0.5202240202485383, + "learning_rate": 4.361002488446498e-05, + "loss": 0.5056, + "mean_token_accuracy": 0.8346873342990875, + "step": 1345 + }, + { + "epoch": 0.6477927063339731, + "grad_norm": 0.5784323915265774, + "learning_rate": 4.3565588339850695e-05, + "loss": 0.529, + "mean_token_accuracy": 0.8280220568180084, + "step": 1350 + }, + { + "epoch": 0.6501919385796545, + "grad_norm": 0.4835689781428135, + "learning_rate": 4.352115179523641e-05, + "loss": 0.5231, + "mean_token_accuracy": 0.8293167948722839, + "step": 1355 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 0.5757605413585991, + "learning_rate": 4.347671525062211e-05, + "loss": 0.5334, + "mean_token_accuracy": 0.8265172779560089, + "step": 1360 + }, + { + "epoch": 0.6549904030710173, + "grad_norm": 0.507486093941236, + "learning_rate": 4.3432278706007824e-05, + "loss": 0.5206, + "mean_token_accuracy": 0.8299190640449524, + "step": 1365 + }, + { + "epoch": 0.6573896353166987, + "grad_norm": 0.5249598517466133, + "learning_rate": 4.3387842161393536e-05, + "loss": 0.5195, + "mean_token_accuracy": 0.830981433391571, + "step": 1370 + }, + { + "epoch": 0.6597888675623801, + "grad_norm": 0.48631807389153725, + "learning_rate": 4.334340561677924e-05, + "loss": 0.5228, + "mean_token_accuracy": 0.829991239309311, + "step": 1375 + }, + { + "epoch": 0.6621880998080614, + "grad_norm": 0.486725909348472, + "learning_rate": 4.329896907216495e-05, + "loss": 0.5112, + "mean_token_accuracy": 0.8330169022083282, + "step": 1380 + }, + { + "epoch": 0.6645873320537428, + "grad_norm": 0.47067469083752866, + "learning_rate": 4.3254532527550664e-05, + "loss": 0.5139, + "mean_token_accuracy": 0.8317618012428284, + "step": 1385 + }, + { + "epoch": 0.6669865642994242, + "grad_norm": 0.4707320443459022, + "learning_rate": 4.321009598293637e-05, + "loss": 0.5033, + "mean_token_accuracy": 0.8353248655796051, + "step": 1390 + }, + { + "epoch": 0.6693857965451055, + "grad_norm": 0.5177653250480144, + "learning_rate": 4.3165659438322074e-05, + "loss": 0.5094, + "mean_token_accuracy": 0.8332755088806152, + "step": 1395 + }, + { + "epoch": 0.6717850287907869, + "grad_norm": 0.5351120723588554, + "learning_rate": 4.3121222893707786e-05, + "loss": 0.5226, + "mean_token_accuracy": 0.8294448137283326, + "step": 1400 + }, + { + "epoch": 0.6741842610364683, + "grad_norm": 0.4765578169939615, + "learning_rate": 4.30767863490935e-05, + "loss": 0.5044, + "mean_token_accuracy": 0.83488889336586, + "step": 1405 + }, + { + "epoch": 0.6765834932821497, + "grad_norm": 0.5188267403257218, + "learning_rate": 4.30323498044792e-05, + "loss": 0.5025, + "mean_token_accuracy": 0.8355053424835205, + "step": 1410 + }, + { + "epoch": 0.6789827255278311, + "grad_norm": 0.5033840380744201, + "learning_rate": 4.2987913259864915e-05, + "loss": 0.5019, + "mean_token_accuracy": 0.8350803136825562, + "step": 1415 + }, + { + "epoch": 0.6813819577735125, + "grad_norm": 0.5037590855559116, + "learning_rate": 4.294347671525063e-05, + "loss": 0.4861, + "mean_token_accuracy": 0.8396735787391663, + "step": 1420 + }, + { + "epoch": 0.6837811900191939, + "grad_norm": 0.4530621438574399, + "learning_rate": 4.289904017063633e-05, + "loss": 0.5233, + "mean_token_accuracy": 0.8285799026489258, + "step": 1425 + }, + { + "epoch": 0.6861804222648752, + "grad_norm": 0.4312748853048293, + "learning_rate": 4.2854603626022044e-05, + "loss": 0.5029, + "mean_token_accuracy": 0.8345647633075715, + "step": 1430 + }, + { + "epoch": 0.6885796545105566, + "grad_norm": 0.4273932940130073, + "learning_rate": 4.2810167081407756e-05, + "loss": 0.4906, + "mean_token_accuracy": 0.8378296196460724, + "step": 1435 + }, + { + "epoch": 0.690978886756238, + "grad_norm": 0.44519718560433147, + "learning_rate": 4.276573053679346e-05, + "loss": 0.5146, + "mean_token_accuracy": 0.8315480768680572, + "step": 1440 + }, + { + "epoch": 0.6933781190019194, + "grad_norm": 2.050838607632764, + "learning_rate": 4.2721293992179166e-05, + "loss": 0.4954, + "mean_token_accuracy": 0.8377414226531983, + "step": 1445 + }, + { + "epoch": 0.6957773512476008, + "grad_norm": 0.5012731719663341, + "learning_rate": 4.267685744756488e-05, + "loss": 0.4955, + "mean_token_accuracy": 0.8370757579803467, + "step": 1450 + }, + { + "epoch": 0.6981765834932822, + "grad_norm": 0.46356735369736507, + "learning_rate": 4.263242090295059e-05, + "loss": 0.4939, + "mean_token_accuracy": 0.8377406358718872, + "step": 1455 + }, + { + "epoch": 0.7005758157389635, + "grad_norm": 0.4325835028161503, + "learning_rate": 4.2587984358336295e-05, + "loss": 0.5148, + "mean_token_accuracy": 0.8308480203151702, + "step": 1460 + }, + { + "epoch": 0.7029750479846449, + "grad_norm": 0.4595501874101042, + "learning_rate": 4.2543547813722007e-05, + "loss": 0.5, + "mean_token_accuracy": 0.835820984840393, + "step": 1465 + }, + { + "epoch": 0.7053742802303263, + "grad_norm": 0.4822053236469392, + "learning_rate": 4.249911126910772e-05, + "loss": 0.503, + "mean_token_accuracy": 0.83474200963974, + "step": 1470 + }, + { + "epoch": 0.7077735124760077, + "grad_norm": 0.47568643360076, + "learning_rate": 4.2454674724493423e-05, + "loss": 0.5097, + "mean_token_accuracy": 0.8334442794322967, + "step": 1475 + }, + { + "epoch": 0.710172744721689, + "grad_norm": 0.45809054187912474, + "learning_rate": 4.2410238179879135e-05, + "loss": 0.4993, + "mean_token_accuracy": 0.835784274339676, + "step": 1480 + }, + { + "epoch": 0.7125719769673704, + "grad_norm": 0.5960561618915279, + "learning_rate": 4.236580163526485e-05, + "loss": 0.5135, + "mean_token_accuracy": 0.8313300907611847, + "step": 1485 + }, + { + "epoch": 0.7149712092130518, + "grad_norm": 0.47511929182323426, + "learning_rate": 4.232136509065055e-05, + "loss": 0.5057, + "mean_token_accuracy": 0.8338849306106567, + "step": 1490 + }, + { + "epoch": 0.7173704414587332, + "grad_norm": 0.4763020223745529, + "learning_rate": 4.227692854603626e-05, + "loss": 0.4954, + "mean_token_accuracy": 0.837703138589859, + "step": 1495 + }, + { + "epoch": 0.7197696737044146, + "grad_norm": 0.4929704143972219, + "learning_rate": 4.223249200142197e-05, + "loss": 0.5148, + "mean_token_accuracy": 0.8312839865684509, + "step": 1500 + }, + { + "epoch": 0.722168905950096, + "grad_norm": 0.4777453194471662, + "learning_rate": 4.218805545680768e-05, + "loss": 0.5004, + "mean_token_accuracy": 0.8349537491798401, + "step": 1505 + }, + { + "epoch": 0.7245681381957774, + "grad_norm": 0.4990271169279016, + "learning_rate": 4.2143618912193386e-05, + "loss": 0.4995, + "mean_token_accuracy": 0.835792076587677, + "step": 1510 + }, + { + "epoch": 0.7269673704414588, + "grad_norm": 0.45068189297866923, + "learning_rate": 4.20991823675791e-05, + "loss": 0.4993, + "mean_token_accuracy": 0.8355522215366363, + "step": 1515 + }, + { + "epoch": 0.7293666026871402, + "grad_norm": 0.5488621493176185, + "learning_rate": 4.205474582296481e-05, + "loss": 0.5034, + "mean_token_accuracy": 0.8344599723815918, + "step": 1520 + }, + { + "epoch": 0.7317658349328215, + "grad_norm": 0.49110092403567085, + "learning_rate": 4.2010309278350515e-05, + "loss": 0.5049, + "mean_token_accuracy": 0.8338232159614563, + "step": 1525 + }, + { + "epoch": 0.7341650671785028, + "grad_norm": 0.44819619071534816, + "learning_rate": 4.196587273373623e-05, + "loss": 0.4989, + "mean_token_accuracy": 0.8356217563152313, + "step": 1530 + }, + { + "epoch": 0.7365642994241842, + "grad_norm": 0.4978704155039036, + "learning_rate": 4.192143618912194e-05, + "loss": 0.5041, + "mean_token_accuracy": 0.8346357643604279, + "step": 1535 + }, + { + "epoch": 0.7389635316698656, + "grad_norm": 0.4955027083886941, + "learning_rate": 4.187699964450765e-05, + "loss": 0.5015, + "mean_token_accuracy": 0.8347662448883056, + "step": 1540 + }, + { + "epoch": 0.741362763915547, + "grad_norm": 0.9777169453073922, + "learning_rate": 4.183256309989335e-05, + "loss": 0.5107, + "mean_token_accuracy": 0.832754397392273, + "step": 1545 + }, + { + "epoch": 0.7437619961612284, + "grad_norm": 0.5239229227325649, + "learning_rate": 4.178812655527906e-05, + "loss": 0.5038, + "mean_token_accuracy": 0.8346226036548614, + "step": 1550 + }, + { + "epoch": 0.7461612284069098, + "grad_norm": 0.49596215598318977, + "learning_rate": 4.174369001066477e-05, + "loss": 0.5026, + "mean_token_accuracy": 0.8348248362541199, + "step": 1555 + }, + { + "epoch": 0.7485604606525912, + "grad_norm": 0.48117516776556213, + "learning_rate": 4.169925346605048e-05, + "loss": 0.4875, + "mean_token_accuracy": 0.8402361094951629, + "step": 1560 + }, + { + "epoch": 0.7509596928982726, + "grad_norm": 0.4438969952921841, + "learning_rate": 4.165481692143619e-05, + "loss": 0.4977, + "mean_token_accuracy": 0.8357623934745788, + "step": 1565 + }, + { + "epoch": 0.753358925143954, + "grad_norm": 0.521505618398822, + "learning_rate": 4.16103803768219e-05, + "loss": 0.4887, + "mean_token_accuracy": 0.8391876101493836, + "step": 1570 + }, + { + "epoch": 0.7557581573896354, + "grad_norm": 0.4693636851519488, + "learning_rate": 4.156594383220761e-05, + "loss": 0.5063, + "mean_token_accuracy": 0.8331591010093689, + "step": 1575 + }, + { + "epoch": 0.7581573896353166, + "grad_norm": 0.496197058535834, + "learning_rate": 4.152150728759332e-05, + "loss": 0.5126, + "mean_token_accuracy": 0.8317332327365875, + "step": 1580 + }, + { + "epoch": 0.760556621880998, + "grad_norm": 0.6505659912029434, + "learning_rate": 4.147707074297903e-05, + "loss": 0.4984, + "mean_token_accuracy": 0.8353803277015686, + "step": 1585 + }, + { + "epoch": 0.7629558541266794, + "grad_norm": 0.47473838918744676, + "learning_rate": 4.143263419836474e-05, + "loss": 0.5106, + "mean_token_accuracy": 0.8329426884651184, + "step": 1590 + }, + { + "epoch": 0.7653550863723608, + "grad_norm": 0.5803134205385464, + "learning_rate": 4.138819765375045e-05, + "loss": 0.5169, + "mean_token_accuracy": 0.8308128654956818, + "step": 1595 + }, + { + "epoch": 0.7677543186180422, + "grad_norm": 0.5614686683573403, + "learning_rate": 4.134376110913615e-05, + "loss": 0.5109, + "mean_token_accuracy": 0.83311847448349, + "step": 1600 + }, + { + "epoch": 0.7701535508637236, + "grad_norm": 0.7065207576015031, + "learning_rate": 4.1299324564521864e-05, + "loss": 0.4999, + "mean_token_accuracy": 0.8362725853919983, + "step": 1605 + }, + { + "epoch": 0.772552783109405, + "grad_norm": 0.560451963494759, + "learning_rate": 4.1254888019907576e-05, + "loss": 0.4998, + "mean_token_accuracy": 0.8343302607536316, + "step": 1610 + }, + { + "epoch": 0.7749520153550864, + "grad_norm": 0.5244654481078436, + "learning_rate": 4.121045147529328e-05, + "loss": 0.5132, + "mean_token_accuracy": 0.8325715720653534, + "step": 1615 + }, + { + "epoch": 0.7773512476007678, + "grad_norm": 0.46348761758989515, + "learning_rate": 4.116601493067899e-05, + "loss": 0.5092, + "mean_token_accuracy": 0.8330723762512207, + "step": 1620 + }, + { + "epoch": 0.7797504798464492, + "grad_norm": 0.4481507057971629, + "learning_rate": 4.1121578386064704e-05, + "loss": 0.5002, + "mean_token_accuracy": 0.8361100614070892, + "step": 1625 + }, + { + "epoch": 0.7821497120921305, + "grad_norm": 0.581248195098889, + "learning_rate": 4.107714184145041e-05, + "loss": 0.4929, + "mean_token_accuracy": 0.8380164384841919, + "step": 1630 + }, + { + "epoch": 0.7845489443378119, + "grad_norm": 0.520687770629966, + "learning_rate": 4.103270529683612e-05, + "loss": 0.5056, + "mean_token_accuracy": 0.8334766566753388, + "step": 1635 + }, + { + "epoch": 0.7869481765834933, + "grad_norm": 0.49427636553199883, + "learning_rate": 4.098826875222183e-05, + "loss": 0.4806, + "mean_token_accuracy": 0.8409277200698853, + "step": 1640 + }, + { + "epoch": 0.7893474088291746, + "grad_norm": 0.4746622492122426, + "learning_rate": 4.094383220760754e-05, + "loss": 0.4978, + "mean_token_accuracy": 0.836303836107254, + "step": 1645 + }, + { + "epoch": 0.791746641074856, + "grad_norm": 0.45998552131958137, + "learning_rate": 4.089939566299324e-05, + "loss": 0.5055, + "mean_token_accuracy": 0.8346006035804748, + "step": 1650 + }, + { + "epoch": 0.7941458733205374, + "grad_norm": 0.7306011013931711, + "learning_rate": 4.0854959118378955e-05, + "loss": 0.4914, + "mean_token_accuracy": 0.8384609997272492, + "step": 1655 + }, + { + "epoch": 0.7965451055662188, + "grad_norm": 0.48801343160132227, + "learning_rate": 4.081052257376467e-05, + "loss": 0.5002, + "mean_token_accuracy": 0.8357686400413513, + "step": 1660 + }, + { + "epoch": 0.7989443378119002, + "grad_norm": 0.46142435250279384, + "learning_rate": 4.076608602915037e-05, + "loss": 0.5068, + "mean_token_accuracy": 0.8335560023784637, + "step": 1665 + }, + { + "epoch": 0.8013435700575816, + "grad_norm": 0.4953618366906397, + "learning_rate": 4.0721649484536084e-05, + "loss": 0.4981, + "mean_token_accuracy": 0.8363311767578125, + "step": 1670 + }, + { + "epoch": 0.803742802303263, + "grad_norm": 0.4470979902731658, + "learning_rate": 4.0677212939921796e-05, + "loss": 0.5018, + "mean_token_accuracy": 0.8343466818332672, + "step": 1675 + }, + { + "epoch": 0.8061420345489443, + "grad_norm": 0.5257607137210768, + "learning_rate": 4.06327763953075e-05, + "loss": 0.5272, + "mean_token_accuracy": 0.8283361494541168, + "step": 1680 + }, + { + "epoch": 0.8085412667946257, + "grad_norm": 0.5449197937622, + "learning_rate": 4.058833985069321e-05, + "loss": 0.5027, + "mean_token_accuracy": 0.8345248162746429, + "step": 1685 + }, + { + "epoch": 0.8109404990403071, + "grad_norm": 0.5272292325409751, + "learning_rate": 4.0543903306078925e-05, + "loss": 0.5003, + "mean_token_accuracy": 0.8353365778923034, + "step": 1690 + }, + { + "epoch": 0.8133397312859885, + "grad_norm": 0.49483975937460833, + "learning_rate": 4.049946676146463e-05, + "loss": 0.48, + "mean_token_accuracy": 0.841797149181366, + "step": 1695 + }, + { + "epoch": 0.8157389635316699, + "grad_norm": 0.455696435370605, + "learning_rate": 4.0455030216850335e-05, + "loss": 0.5031, + "mean_token_accuracy": 0.8344521641731262, + "step": 1700 + }, + { + "epoch": 0.8181381957773513, + "grad_norm": 0.809778240421179, + "learning_rate": 4.041059367223605e-05, + "loss": 0.4793, + "mean_token_accuracy": 0.8415903806686401, + "step": 1705 + }, + { + "epoch": 0.8205374280230326, + "grad_norm": 0.4581620367543629, + "learning_rate": 4.036615712762176e-05, + "loss": 0.5031, + "mean_token_accuracy": 0.8346677839756012, + "step": 1710 + }, + { + "epoch": 0.822936660268714, + "grad_norm": 0.4793760548775191, + "learning_rate": 4.0321720583007464e-05, + "loss": 0.5082, + "mean_token_accuracy": 0.8328176736831665, + "step": 1715 + }, + { + "epoch": 0.8253358925143954, + "grad_norm": 0.7488626062500724, + "learning_rate": 4.0277284038393175e-05, + "loss": 0.4992, + "mean_token_accuracy": 0.8357600927352905, + "step": 1720 + }, + { + "epoch": 0.8277351247600768, + "grad_norm": 0.4417556948390778, + "learning_rate": 4.023284749377889e-05, + "loss": 0.4785, + "mean_token_accuracy": 0.8417211413383484, + "step": 1725 + }, + { + "epoch": 0.8301343570057581, + "grad_norm": 0.4778636985339465, + "learning_rate": 4.018841094916459e-05, + "loss": 0.4986, + "mean_token_accuracy": 0.8356509983539582, + "step": 1730 + }, + { + "epoch": 0.8325335892514395, + "grad_norm": 0.41886367691680565, + "learning_rate": 4.0143974404550304e-05, + "loss": 0.4959, + "mean_token_accuracy": 0.8363343060016633, + "step": 1735 + }, + { + "epoch": 0.8349328214971209, + "grad_norm": 0.46794847168479464, + "learning_rate": 4.0099537859936016e-05, + "loss": 0.4882, + "mean_token_accuracy": 0.8390391588211059, + "step": 1740 + }, + { + "epoch": 0.8373320537428023, + "grad_norm": 0.45094719812416384, + "learning_rate": 4.005510131532173e-05, + "loss": 0.5056, + "mean_token_accuracy": 0.8326801776885986, + "step": 1745 + }, + { + "epoch": 0.8397312859884837, + "grad_norm": 0.4410114321911972, + "learning_rate": 4.001066477070743e-05, + "loss": 0.4915, + "mean_token_accuracy": 0.8374046802520752, + "step": 1750 + }, + { + "epoch": 0.8421305182341651, + "grad_norm": 0.5013043600759126, + "learning_rate": 3.996622822609314e-05, + "loss": 0.4987, + "mean_token_accuracy": 0.8353951811790467, + "step": 1755 + }, + { + "epoch": 0.8445297504798465, + "grad_norm": 0.43861309242180074, + "learning_rate": 3.992179168147885e-05, + "loss": 0.4899, + "mean_token_accuracy": 0.8380984723567962, + "step": 1760 + }, + { + "epoch": 0.8469289827255279, + "grad_norm": 0.41336946937588115, + "learning_rate": 3.9877355136864555e-05, + "loss": 0.4924, + "mean_token_accuracy": 0.8372499346733093, + "step": 1765 + }, + { + "epoch": 0.8493282149712092, + "grad_norm": 0.4545097883190453, + "learning_rate": 3.983291859225027e-05, + "loss": 0.5168, + "mean_token_accuracy": 0.8302761137485504, + "step": 1770 + }, + { + "epoch": 0.8517274472168906, + "grad_norm": 0.474966739957993, + "learning_rate": 3.978848204763598e-05, + "loss": 0.494, + "mean_token_accuracy": 0.8385133445262909, + "step": 1775 + }, + { + "epoch": 0.8541266794625719, + "grad_norm": 0.5865771257587197, + "learning_rate": 3.974404550302169e-05, + "loss": 0.5173, + "mean_token_accuracy": 0.8318293452262878, + "step": 1780 + }, + { + "epoch": 0.8565259117082533, + "grad_norm": 0.5147748010011185, + "learning_rate": 3.9699608958407396e-05, + "loss": 0.4901, + "mean_token_accuracy": 0.8380867540836334, + "step": 1785 + }, + { + "epoch": 0.8589251439539347, + "grad_norm": 0.6332747120400611, + "learning_rate": 3.965517241379311e-05, + "loss": 0.4877, + "mean_token_accuracy": 0.8399276256561279, + "step": 1790 + }, + { + "epoch": 0.8613243761996161, + "grad_norm": 0.47831786443131424, + "learning_rate": 3.961073586917882e-05, + "loss": 0.5068, + "mean_token_accuracy": 0.8339995861053466, + "step": 1795 + }, + { + "epoch": 0.8637236084452975, + "grad_norm": 0.4832626111095525, + "learning_rate": 3.9566299324564524e-05, + "loss": 0.4936, + "mean_token_accuracy": 0.8372804462909699, + "step": 1800 + }, + { + "epoch": 0.8661228406909789, + "grad_norm": 0.5261836705483144, + "learning_rate": 3.952186277995023e-05, + "loss": 0.4769, + "mean_token_accuracy": 0.8424659311771393, + "step": 1805 + }, + { + "epoch": 0.8685220729366603, + "grad_norm": 0.451251646000052, + "learning_rate": 3.947742623533594e-05, + "loss": 0.4892, + "mean_token_accuracy": 0.8388195514678956, + "step": 1810 + }, + { + "epoch": 0.8709213051823417, + "grad_norm": 0.48207059123147333, + "learning_rate": 3.943298969072165e-05, + "loss": 0.4983, + "mean_token_accuracy": 0.8356702029705048, + "step": 1815 + }, + { + "epoch": 0.8733205374280231, + "grad_norm": 0.47858523546672777, + "learning_rate": 3.938855314610736e-05, + "loss": 0.4831, + "mean_token_accuracy": 0.8398571729660034, + "step": 1820 + }, + { + "epoch": 0.8757197696737045, + "grad_norm": 0.4855780035679636, + "learning_rate": 3.934411660149307e-05, + "loss": 0.5054, + "mean_token_accuracy": 0.8337648630142211, + "step": 1825 + }, + { + "epoch": 0.8781190019193857, + "grad_norm": 0.4670757101333078, + "learning_rate": 3.929968005687878e-05, + "loss": 0.4824, + "mean_token_accuracy": 0.8406095683574677, + "step": 1830 + }, + { + "epoch": 0.8805182341650671, + "grad_norm": 0.4549267712242883, + "learning_rate": 3.925524351226449e-05, + "loss": 0.4921, + "mean_token_accuracy": 0.8366944909095764, + "step": 1835 + }, + { + "epoch": 0.8829174664107485, + "grad_norm": 0.45048453890973705, + "learning_rate": 3.92108069676502e-05, + "loss": 0.5092, + "mean_token_accuracy": 0.8323357820510864, + "step": 1840 + }, + { + "epoch": 0.8853166986564299, + "grad_norm": 0.4362188334466366, + "learning_rate": 3.916637042303591e-05, + "loss": 0.4818, + "mean_token_accuracy": 0.8405416011810303, + "step": 1845 + }, + { + "epoch": 0.8877159309021113, + "grad_norm": 0.4609804738877885, + "learning_rate": 3.9121933878421616e-05, + "loss": 0.4833, + "mean_token_accuracy": 0.8405681133270264, + "step": 1850 + }, + { + "epoch": 0.8901151631477927, + "grad_norm": 0.45727787424378313, + "learning_rate": 3.907749733380732e-05, + "loss": 0.4806, + "mean_token_accuracy": 0.8407939493656158, + "step": 1855 + }, + { + "epoch": 0.8925143953934741, + "grad_norm": 0.4517352190937965, + "learning_rate": 3.903306078919303e-05, + "loss": 0.4856, + "mean_token_accuracy": 0.8397642016410828, + "step": 1860 + }, + { + "epoch": 0.8949136276391555, + "grad_norm": 0.5410580134801548, + "learning_rate": 3.8988624244578745e-05, + "loss": 0.5012, + "mean_token_accuracy": 0.835526442527771, + "step": 1865 + }, + { + "epoch": 0.8973128598848369, + "grad_norm": 0.4255275614609763, + "learning_rate": 3.894418769996445e-05, + "loss": 0.4882, + "mean_token_accuracy": 0.8384945869445801, + "step": 1870 + }, + { + "epoch": 0.8997120921305183, + "grad_norm": 0.5016674928378082, + "learning_rate": 3.889975115535016e-05, + "loss": 0.4864, + "mean_token_accuracy": 0.8388539910316467, + "step": 1875 + }, + { + "epoch": 0.9021113243761996, + "grad_norm": 0.47185204205793313, + "learning_rate": 3.885531461073587e-05, + "loss": 0.5008, + "mean_token_accuracy": 0.834422481060028, + "step": 1880 + }, + { + "epoch": 0.904510556621881, + "grad_norm": 0.44149478527936975, + "learning_rate": 3.881087806612158e-05, + "loss": 0.4747, + "mean_token_accuracy": 0.8427581369876862, + "step": 1885 + }, + { + "epoch": 0.9069097888675623, + "grad_norm": 0.42788888304582523, + "learning_rate": 3.876644152150729e-05, + "loss": 0.5003, + "mean_token_accuracy": 0.835177195072174, + "step": 1890 + }, + { + "epoch": 0.9093090211132437, + "grad_norm": 0.44941832370477924, + "learning_rate": 3.8722004976893e-05, + "loss": 0.4783, + "mean_token_accuracy": 0.841586971282959, + "step": 1895 + }, + { + "epoch": 0.9117082533589251, + "grad_norm": 0.4427516271825846, + "learning_rate": 3.867756843227871e-05, + "loss": 0.491, + "mean_token_accuracy": 0.8373789012432098, + "step": 1900 + }, + { + "epoch": 0.9141074856046065, + "grad_norm": 0.4476423549355035, + "learning_rate": 3.863313188766441e-05, + "loss": 0.4995, + "mean_token_accuracy": 0.8350639104843139, + "step": 1905 + }, + { + "epoch": 0.9165067178502879, + "grad_norm": 0.4492630820771489, + "learning_rate": 3.8588695343050124e-05, + "loss": 0.4879, + "mean_token_accuracy": 0.8386876225471497, + "step": 1910 + }, + { + "epoch": 0.9189059500959693, + "grad_norm": 0.47882494003670634, + "learning_rate": 3.8544258798435836e-05, + "loss": 0.4833, + "mean_token_accuracy": 0.8402657985687256, + "step": 1915 + }, + { + "epoch": 0.9213051823416507, + "grad_norm": 0.4419811380727961, + "learning_rate": 3.849982225382154e-05, + "loss": 0.4936, + "mean_token_accuracy": 0.8363905549049377, + "step": 1920 + }, + { + "epoch": 0.9237044145873321, + "grad_norm": 0.39621957717208156, + "learning_rate": 3.845538570920725e-05, + "loss": 0.484, + "mean_token_accuracy": 0.8395649671554566, + "step": 1925 + }, + { + "epoch": 0.9261036468330134, + "grad_norm": 0.4529698155759262, + "learning_rate": 3.8410949164592965e-05, + "loss": 0.4686, + "mean_token_accuracy": 0.84321910738945, + "step": 1930 + }, + { + "epoch": 0.9285028790786948, + "grad_norm": 0.40480258139982583, + "learning_rate": 3.836651261997867e-05, + "loss": 0.4935, + "mean_token_accuracy": 0.8369616866111755, + "step": 1935 + }, + { + "epoch": 0.9309021113243762, + "grad_norm": 0.49493368299837737, + "learning_rate": 3.832207607536438e-05, + "loss": 0.4729, + "mean_token_accuracy": 0.8431230068206788, + "step": 1940 + }, + { + "epoch": 0.9333013435700576, + "grad_norm": 0.5013934672328659, + "learning_rate": 3.8277639530750094e-05, + "loss": 0.4843, + "mean_token_accuracy": 0.8394111692905426, + "step": 1945 + }, + { + "epoch": 0.935700575815739, + "grad_norm": 0.4330692332817214, + "learning_rate": 3.8233202986135805e-05, + "loss": 0.4876, + "mean_token_accuracy": 0.838521808385849, + "step": 1950 + }, + { + "epoch": 0.9380998080614203, + "grad_norm": 0.4518953171866384, + "learning_rate": 3.818876644152151e-05, + "loss": 0.4993, + "mean_token_accuracy": 0.8351330041885376, + "step": 1955 + }, + { + "epoch": 0.9404990403071017, + "grad_norm": 0.46867857792205087, + "learning_rate": 3.8144329896907216e-05, + "loss": 0.4836, + "mean_token_accuracy": 0.8396610736846923, + "step": 1960 + }, + { + "epoch": 0.9428982725527831, + "grad_norm": 0.4628778139252986, + "learning_rate": 3.809989335229293e-05, + "loss": 0.5, + "mean_token_accuracy": 0.834876400232315, + "step": 1965 + }, + { + "epoch": 0.9452975047984645, + "grad_norm": 0.4600077570542966, + "learning_rate": 3.805545680767863e-05, + "loss": 0.4935, + "mean_token_accuracy": 0.8370054304599762, + "step": 1970 + }, + { + "epoch": 0.9476967370441459, + "grad_norm": 0.4993993819155987, + "learning_rate": 3.8011020263064344e-05, + "loss": 0.4935, + "mean_token_accuracy": 0.8372502028942108, + "step": 1975 + }, + { + "epoch": 0.9500959692898272, + "grad_norm": 0.4439645062611687, + "learning_rate": 3.7966583718450056e-05, + "loss": 0.4891, + "mean_token_accuracy": 0.8377054810523987, + "step": 1980 + }, + { + "epoch": 0.9524952015355086, + "grad_norm": 0.5121282144414162, + "learning_rate": 3.792214717383577e-05, + "loss": 0.4832, + "mean_token_accuracy": 0.8399454653263092, + "step": 1985 + }, + { + "epoch": 0.95489443378119, + "grad_norm": 0.49717141863444153, + "learning_rate": 3.787771062922147e-05, + "loss": 0.4896, + "mean_token_accuracy": 0.837871116399765, + "step": 1990 + }, + { + "epoch": 0.9572936660268714, + "grad_norm": 0.4544349403730862, + "learning_rate": 3.7833274084607185e-05, + "loss": 0.5072, + "mean_token_accuracy": 0.8325957894325257, + "step": 1995 + }, + { + "epoch": 0.9596928982725528, + "grad_norm": 0.42858727516703904, + "learning_rate": 3.77888375399929e-05, + "loss": 0.4828, + "mean_token_accuracy": 0.8397642016410828, + "step": 2000 + }, + { + "epoch": 0.9620921305182342, + "grad_norm": 0.40754396210939614, + "learning_rate": 3.77444009953786e-05, + "loss": 0.4765, + "mean_token_accuracy": 0.8419104278087616, + "step": 2005 + }, + { + "epoch": 0.9644913627639156, + "grad_norm": 0.44726864462424454, + "learning_rate": 3.769996445076431e-05, + "loss": 0.4854, + "mean_token_accuracy": 0.8394436955451965, + "step": 2010 + }, + { + "epoch": 0.966890595009597, + "grad_norm": 0.47404065872867157, + "learning_rate": 3.765552790615002e-05, + "loss": 0.5063, + "mean_token_accuracy": 0.8324910879135132, + "step": 2015 + }, + { + "epoch": 0.9692898272552783, + "grad_norm": 0.410702396325773, + "learning_rate": 3.761109136153573e-05, + "loss": 0.477, + "mean_token_accuracy": 0.8416393160820007, + "step": 2020 + }, + { + "epoch": 0.9716890595009597, + "grad_norm": 0.4476680516713628, + "learning_rate": 3.7566654816921436e-05, + "loss": 0.4881, + "mean_token_accuracy": 0.8375460922718048, + "step": 2025 + }, + { + "epoch": 0.974088291746641, + "grad_norm": 0.44590317336698554, + "learning_rate": 3.752221827230715e-05, + "loss": 0.4827, + "mean_token_accuracy": 0.8395681023597718, + "step": 2030 + }, + { + "epoch": 0.9764875239923224, + "grad_norm": 0.5040536414096327, + "learning_rate": 3.747778172769286e-05, + "loss": 0.4922, + "mean_token_accuracy": 0.8366780757904053, + "step": 2035 + }, + { + "epoch": 0.9788867562380038, + "grad_norm": 0.40814566317367457, + "learning_rate": 3.7433345183078564e-05, + "loss": 0.4864, + "mean_token_accuracy": 0.8382266163825989, + "step": 2040 + }, + { + "epoch": 0.9812859884836852, + "grad_norm": 0.4511394234284313, + "learning_rate": 3.7388908638464276e-05, + "loss": 0.4893, + "mean_token_accuracy": 0.8377293705940246, + "step": 2045 + }, + { + "epoch": 0.9836852207293666, + "grad_norm": 0.436955498827051, + "learning_rate": 3.734447209384999e-05, + "loss": 0.4918, + "mean_token_accuracy": 0.8379468977451324, + "step": 2050 + }, + { + "epoch": 0.986084452975048, + "grad_norm": 0.5357720057793889, + "learning_rate": 3.730003554923569e-05, + "loss": 0.491, + "mean_token_accuracy": 0.8379468977451324, + "step": 2055 + }, + { + "epoch": 0.9884836852207294, + "grad_norm": 0.4217359911795722, + "learning_rate": 3.72555990046214e-05, + "loss": 0.4761, + "mean_token_accuracy": 0.8418299496173859, + "step": 2060 + }, + { + "epoch": 0.9908829174664108, + "grad_norm": 0.43096352852270436, + "learning_rate": 3.721116246000711e-05, + "loss": 0.4885, + "mean_token_accuracy": 0.8379851818084717, + "step": 2065 + }, + { + "epoch": 0.9932821497120922, + "grad_norm": 0.4297567148467833, + "learning_rate": 3.716672591539282e-05, + "loss": 0.4914, + "mean_token_accuracy": 0.8365148782730103, + "step": 2070 + }, + { + "epoch": 0.9956813819577736, + "grad_norm": 0.4804447999108489, + "learning_rate": 3.712228937077853e-05, + "loss": 0.4936, + "mean_token_accuracy": 0.836709177494049, + "step": 2075 + }, + { + "epoch": 0.9980806142034548, + "grad_norm": 0.4691591099165681, + "learning_rate": 3.707785282616424e-05, + "loss": 0.4919, + "mean_token_accuracy": 0.8369327783584595, + "step": 2080 + }, + { + "epoch": 1.0004798464491362, + "grad_norm": 0.5385056566589963, + "learning_rate": 3.703341628154995e-05, + "loss": 0.4725, + "mean_token_accuracy": 0.8429514586925506, + "step": 2085 + }, + { + "epoch": 1.0028790786948176, + "grad_norm": 0.5247407907209998, + "learning_rate": 3.6988979736935656e-05, + "loss": 0.3989, + "mean_token_accuracy": 0.8624344766139984, + "step": 2090 + }, + { + "epoch": 1.005278310940499, + "grad_norm": 0.48811929992401026, + "learning_rate": 3.694454319232137e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.8603689730167389, + "step": 2095 + }, + { + "epoch": 1.0076775431861804, + "grad_norm": 0.4398358878276954, + "learning_rate": 3.690010664770708e-05, + "loss": 0.4098, + "mean_token_accuracy": 0.8592318236827851, + "step": 2100 + }, + { + "epoch": 1.0100767754318618, + "grad_norm": 0.5344316993297887, + "learning_rate": 3.6855670103092785e-05, + "loss": 0.4156, + "mean_token_accuracy": 0.8575473427772522, + "step": 2105 + }, + { + "epoch": 1.0124760076775432, + "grad_norm": 0.43873766298460454, + "learning_rate": 3.681123355847849e-05, + "loss": 0.4143, + "mean_token_accuracy": 0.8583036422729492, + "step": 2110 + }, + { + "epoch": 1.0148752399232246, + "grad_norm": 0.4431906254548146, + "learning_rate": 3.67667970138642e-05, + "loss": 0.4007, + "mean_token_accuracy": 0.8621528685092926, + "step": 2115 + }, + { + "epoch": 1.017274472168906, + "grad_norm": 0.46385051341702943, + "learning_rate": 3.6722360469249913e-05, + "loss": 0.4133, + "mean_token_accuracy": 0.8582450449466705, + "step": 2120 + }, + { + "epoch": 1.0196737044145874, + "grad_norm": 0.47509224818785306, + "learning_rate": 3.667792392463562e-05, + "loss": 0.4083, + "mean_token_accuracy": 0.8595705032348633, + "step": 2125 + }, + { + "epoch": 1.0220729366602688, + "grad_norm": 0.4552889927508356, + "learning_rate": 3.663348738002133e-05, + "loss": 0.4096, + "mean_token_accuracy": 0.8591662049293518, + "step": 2130 + }, + { + "epoch": 1.0244721689059502, + "grad_norm": 0.5509774885331505, + "learning_rate": 3.658905083540704e-05, + "loss": 0.421, + "mean_token_accuracy": 0.8578622102737427, + "step": 2135 + }, + { + "epoch": 1.0268714011516316, + "grad_norm": 0.4556583809000766, + "learning_rate": 3.654461429079275e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.8639024317264556, + "step": 2140 + }, + { + "epoch": 1.029270633397313, + "grad_norm": 0.5340933536827104, + "learning_rate": 3.650017774617846e-05, + "loss": 0.4122, + "mean_token_accuracy": 0.8583052039146424, + "step": 2145 + }, + { + "epoch": 1.0316698656429943, + "grad_norm": 0.42404563232605813, + "learning_rate": 3.645574120156417e-05, + "loss": 0.4011, + "mean_token_accuracy": 0.8615194737911225, + "step": 2150 + }, + { + "epoch": 1.0340690978886755, + "grad_norm": 0.5072208926542439, + "learning_rate": 3.6411304656949876e-05, + "loss": 0.4036, + "mean_token_accuracy": 0.8606564939022064, + "step": 2155 + }, + { + "epoch": 1.036468330134357, + "grad_norm": 0.4182179906134632, + "learning_rate": 3.636686811233559e-05, + "loss": 0.4248, + "mean_token_accuracy": 0.8546604514122009, + "step": 2160 + }, + { + "epoch": 1.0388675623800383, + "grad_norm": 0.44924145858745945, + "learning_rate": 3.632243156772129e-05, + "loss": 0.4063, + "mean_token_accuracy": 0.8609499096870422, + "step": 2165 + }, + { + "epoch": 1.0412667946257197, + "grad_norm": 0.48104691609354716, + "learning_rate": 3.6277995023107005e-05, + "loss": 0.434, + "mean_token_accuracy": 0.852498596906662, + "step": 2170 + }, + { + "epoch": 1.043666026871401, + "grad_norm": 0.48132504476906385, + "learning_rate": 3.623355847849271e-05, + "loss": 0.4011, + "mean_token_accuracy": 0.8613822042942048, + "step": 2175 + }, + { + "epoch": 1.0460652591170825, + "grad_norm": 0.46151017509763026, + "learning_rate": 3.618912193387842e-05, + "loss": 0.4006, + "mean_token_accuracy": 0.8626507818698883, + "step": 2180 + }, + { + "epoch": 1.0484644913627639, + "grad_norm": 0.4829126029300642, + "learning_rate": 3.6144685389264134e-05, + "loss": 0.4085, + "mean_token_accuracy": 0.8593409061431885, + "step": 2185 + }, + { + "epoch": 1.0508637236084453, + "grad_norm": 0.4719323814072435, + "learning_rate": 3.6100248844649845e-05, + "loss": 0.4071, + "mean_token_accuracy": 0.860792088508606, + "step": 2190 + }, + { + "epoch": 1.0532629558541267, + "grad_norm": 0.44618383248608906, + "learning_rate": 3.605581230003555e-05, + "loss": 0.4071, + "mean_token_accuracy": 0.8597249269485474, + "step": 2195 + }, + { + "epoch": 1.055662188099808, + "grad_norm": 0.484371830161018, + "learning_rate": 3.601137575542126e-05, + "loss": 0.4001, + "mean_token_accuracy": 0.8620155990123749, + "step": 2200 + }, + { + "epoch": 1.0580614203454894, + "grad_norm": 0.4726148631163947, + "learning_rate": 3.5966939210806974e-05, + "loss": 0.4083, + "mean_token_accuracy": 0.8604319036006928, + "step": 2205 + }, + { + "epoch": 1.0604606525911708, + "grad_norm": 0.5195452365929821, + "learning_rate": 3.592250266619268e-05, + "loss": 0.4177, + "mean_token_accuracy": 0.8567543268203736, + "step": 2210 + }, + { + "epoch": 1.0628598848368522, + "grad_norm": 0.5067336953723536, + "learning_rate": 3.5878066121578384e-05, + "loss": 0.4055, + "mean_token_accuracy": 0.860358464717865, + "step": 2215 + }, + { + "epoch": 1.0652591170825336, + "grad_norm": 0.44003137165691186, + "learning_rate": 3.5833629576964096e-05, + "loss": 0.4041, + "mean_token_accuracy": 0.8620176315307617, + "step": 2220 + }, + { + "epoch": 1.067658349328215, + "grad_norm": 0.47306018294891, + "learning_rate": 3.578919303234981e-05, + "loss": 0.4027, + "mean_token_accuracy": 0.8611225724220276, + "step": 2225 + }, + { + "epoch": 1.0700575815738964, + "grad_norm": 0.43149528893426176, + "learning_rate": 3.574475648773551e-05, + "loss": 0.3874, + "mean_token_accuracy": 0.8662525892257691, + "step": 2230 + }, + { + "epoch": 1.0724568138195778, + "grad_norm": 0.4558625625028902, + "learning_rate": 3.5700319943121225e-05, + "loss": 0.4056, + "mean_token_accuracy": 0.8602272033691406, + "step": 2235 + }, + { + "epoch": 1.0748560460652592, + "grad_norm": 0.45346032009736464, + "learning_rate": 3.565588339850694e-05, + "loss": 0.411, + "mean_token_accuracy": 0.8586599171161652, + "step": 2240 + }, + { + "epoch": 1.0772552783109406, + "grad_norm": 0.46919295313231174, + "learning_rate": 3.561144685389264e-05, + "loss": 0.3986, + "mean_token_accuracy": 0.8619990050792694, + "step": 2245 + }, + { + "epoch": 1.079654510556622, + "grad_norm": 0.48726010152230115, + "learning_rate": 3.5567010309278354e-05, + "loss": 0.4025, + "mean_token_accuracy": 0.8616835474967957, + "step": 2250 + }, + { + "epoch": 1.0820537428023034, + "grad_norm": 0.48673873249135896, + "learning_rate": 3.5522573764664066e-05, + "loss": 0.4256, + "mean_token_accuracy": 0.8545612275600434, + "step": 2255 + }, + { + "epoch": 1.0844529750479845, + "grad_norm": 0.4771285752641356, + "learning_rate": 3.547813722004977e-05, + "loss": 0.4115, + "mean_token_accuracy": 0.858768516778946, + "step": 2260 + }, + { + "epoch": 1.086852207293666, + "grad_norm": 0.47918813523195114, + "learning_rate": 3.5433700675435476e-05, + "loss": 0.3993, + "mean_token_accuracy": 0.8627258002758026, + "step": 2265 + }, + { + "epoch": 1.0892514395393473, + "grad_norm": 0.44180464504771916, + "learning_rate": 3.538926413082119e-05, + "loss": 0.4098, + "mean_token_accuracy": 0.8589477658271789, + "step": 2270 + }, + { + "epoch": 1.0916506717850287, + "grad_norm": 0.48558126860005485, + "learning_rate": 3.53448275862069e-05, + "loss": 0.4018, + "mean_token_accuracy": 0.8613408207893372, + "step": 2275 + }, + { + "epoch": 1.0940499040307101, + "grad_norm": 0.4830201657987925, + "learning_rate": 3.5300391041592605e-05, + "loss": 0.4012, + "mean_token_accuracy": 0.8613366425037384, + "step": 2280 + }, + { + "epoch": 1.0964491362763915, + "grad_norm": 0.45267780099531413, + "learning_rate": 3.5255954496978316e-05, + "loss": 0.3976, + "mean_token_accuracy": 0.8629164338111878, + "step": 2285 + }, + { + "epoch": 1.098848368522073, + "grad_norm": 0.4502807128522101, + "learning_rate": 3.521151795236403e-05, + "loss": 0.411, + "mean_token_accuracy": 0.8588341414928437, + "step": 2290 + }, + { + "epoch": 1.1012476007677543, + "grad_norm": 0.44440205971107866, + "learning_rate": 3.516708140774973e-05, + "loss": 0.4135, + "mean_token_accuracy": 0.8572691977024078, + "step": 2295 + }, + { + "epoch": 1.1036468330134357, + "grad_norm": 0.4979388904136079, + "learning_rate": 3.5122644863135445e-05, + "loss": 0.4094, + "mean_token_accuracy": 0.8590193212032318, + "step": 2300 + }, + { + "epoch": 1.106046065259117, + "grad_norm": 0.46332524132082875, + "learning_rate": 3.507820831852116e-05, + "loss": 0.3928, + "mean_token_accuracy": 0.864500904083252, + "step": 2305 + }, + { + "epoch": 1.1084452975047985, + "grad_norm": 0.43360291665533524, + "learning_rate": 3.503377177390686e-05, + "loss": 0.4196, + "mean_token_accuracy": 0.8552972078323364, + "step": 2310 + }, + { + "epoch": 1.1108445297504799, + "grad_norm": 0.4429935353816478, + "learning_rate": 3.498933522929257e-05, + "loss": 0.3985, + "mean_token_accuracy": 0.8629589080810547, + "step": 2315 + }, + { + "epoch": 1.1132437619961613, + "grad_norm": 0.46705306376162703, + "learning_rate": 3.494489868467828e-05, + "loss": 0.3979, + "mean_token_accuracy": 0.8631719291210175, + "step": 2320 + }, + { + "epoch": 1.1156429942418427, + "grad_norm": 0.4418967124373468, + "learning_rate": 3.490046214006399e-05, + "loss": 0.4008, + "mean_token_accuracy": 0.8622421741485595, + "step": 2325 + }, + { + "epoch": 1.118042226487524, + "grad_norm": 0.44881463295860696, + "learning_rate": 3.4856025595449696e-05, + "loss": 0.4055, + "mean_token_accuracy": 0.8604006469249725, + "step": 2330 + }, + { + "epoch": 1.1204414587332054, + "grad_norm": 0.4919264589965018, + "learning_rate": 3.481158905083541e-05, + "loss": 0.4082, + "mean_token_accuracy": 0.8602576732635498, + "step": 2335 + }, + { + "epoch": 1.1228406909788868, + "grad_norm": 0.4704160431070676, + "learning_rate": 3.476715250622112e-05, + "loss": 0.4207, + "mean_token_accuracy": 0.8553534567356109, + "step": 2340 + }, + { + "epoch": 1.1252399232245682, + "grad_norm": 0.444446496623664, + "learning_rate": 3.4722715961606825e-05, + "loss": 0.4048, + "mean_token_accuracy": 0.8606355607509613, + "step": 2345 + }, + { + "epoch": 1.1276391554702494, + "grad_norm": 0.4846769028223817, + "learning_rate": 3.467827941699254e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.8652548551559448, + "step": 2350 + }, + { + "epoch": 1.1300383877159308, + "grad_norm": 0.43614245378491234, + "learning_rate": 3.463384287237825e-05, + "loss": 0.4088, + "mean_token_accuracy": 0.8596396625041962, + "step": 2355 + }, + { + "epoch": 1.1324376199616122, + "grad_norm": 0.4657677207483913, + "learning_rate": 3.4589406327763954e-05, + "loss": 0.4124, + "mean_token_accuracy": 0.85767782330513, + "step": 2360 + }, + { + "epoch": 1.1348368522072936, + "grad_norm": 0.4426745604094487, + "learning_rate": 3.4544969783149665e-05, + "loss": 0.3971, + "mean_token_accuracy": 0.8634883403778076, + "step": 2365 + }, + { + "epoch": 1.137236084452975, + "grad_norm": 0.46815018460988816, + "learning_rate": 3.450053323853537e-05, + "loss": 0.4039, + "mean_token_accuracy": 0.8610803782939911, + "step": 2370 + }, + { + "epoch": 1.1396353166986564, + "grad_norm": 0.438316624790037, + "learning_rate": 3.445609669392108e-05, + "loss": 0.4001, + "mean_token_accuracy": 0.8615439713001252, + "step": 2375 + }, + { + "epoch": 1.1420345489443378, + "grad_norm": 0.4418058595426201, + "learning_rate": 3.441166014930679e-05, + "loss": 0.4065, + "mean_token_accuracy": 0.8600826561450958, + "step": 2380 + }, + { + "epoch": 1.1444337811900192, + "grad_norm": 0.4847595999781224, + "learning_rate": 3.43672236046925e-05, + "loss": 0.4239, + "mean_token_accuracy": 0.855004221200943, + "step": 2385 + }, + { + "epoch": 1.1468330134357005, + "grad_norm": 1.5446867021881845, + "learning_rate": 3.432278706007821e-05, + "loss": 0.4038, + "mean_token_accuracy": 0.8607834935188293, + "step": 2390 + }, + { + "epoch": 1.149232245681382, + "grad_norm": 0.48658583367657177, + "learning_rate": 3.4278350515463916e-05, + "loss": 0.4075, + "mean_token_accuracy": 0.8598154127597809, + "step": 2395 + }, + { + "epoch": 1.1516314779270633, + "grad_norm": 0.5326892549587043, + "learning_rate": 3.423391397084963e-05, + "loss": 0.4114, + "mean_token_accuracy": 0.8590575993061066, + "step": 2400 + }, + { + "epoch": 1.1540307101727447, + "grad_norm": 0.4771899586929266, + "learning_rate": 3.418947742623534e-05, + "loss": 0.4136, + "mean_token_accuracy": 0.8581348896026612, + "step": 2405 + }, + { + "epoch": 1.1564299424184261, + "grad_norm": 0.455821737127857, + "learning_rate": 3.414504088162105e-05, + "loss": 0.4134, + "mean_token_accuracy": 0.8582122266292572, + "step": 2410 + }, + { + "epoch": 1.1588291746641075, + "grad_norm": 0.5094254888878418, + "learning_rate": 3.410060433700676e-05, + "loss": 0.4169, + "mean_token_accuracy": 0.8570551395416259, + "step": 2415 + }, + { + "epoch": 1.161228406909789, + "grad_norm": 0.4593576345470221, + "learning_rate": 3.405616779239246e-05, + "loss": 0.4108, + "mean_token_accuracy": 0.8585395991802216, + "step": 2420 + }, + { + "epoch": 1.1636276391554703, + "grad_norm": 0.45335113787411774, + "learning_rate": 3.4011731247778174e-05, + "loss": 0.4141, + "mean_token_accuracy": 0.858020031452179, + "step": 2425 + }, + { + "epoch": 1.1660268714011517, + "grad_norm": 0.43176314428761675, + "learning_rate": 3.396729470316388e-05, + "loss": 0.4048, + "mean_token_accuracy": 0.8606631636619568, + "step": 2430 + }, + { + "epoch": 1.168426103646833, + "grad_norm": 0.5078546819049952, + "learning_rate": 3.392285815854959e-05, + "loss": 0.4293, + "mean_token_accuracy": 0.8537072598934173, + "step": 2435 + }, + { + "epoch": 1.1708253358925145, + "grad_norm": 0.4502128911684478, + "learning_rate": 3.38784216139353e-05, + "loss": 0.4055, + "mean_token_accuracy": 0.860330331325531, + "step": 2440 + }, + { + "epoch": 1.1732245681381959, + "grad_norm": 0.45356485938149477, + "learning_rate": 3.3833985069321014e-05, + "loss": 0.4068, + "mean_token_accuracy": 0.8610100626945496, + "step": 2445 + }, + { + "epoch": 1.1756238003838773, + "grad_norm": 0.44546255582121363, + "learning_rate": 3.378954852470672e-05, + "loss": 0.4059, + "mean_token_accuracy": 0.8603936195373535, + "step": 2450 + }, + { + "epoch": 1.1780230326295587, + "grad_norm": 0.519034529576858, + "learning_rate": 3.374511198009243e-05, + "loss": 0.3996, + "mean_token_accuracy": 0.8620694935321808, + "step": 2455 + }, + { + "epoch": 1.18042226487524, + "grad_norm": 0.4306474312910174, + "learning_rate": 3.370067543547814e-05, + "loss": 0.4175, + "mean_token_accuracy": 0.8564855635166169, + "step": 2460 + }, + { + "epoch": 1.1828214971209212, + "grad_norm": 0.44331321051212536, + "learning_rate": 3.365623889086385e-05, + "loss": 0.4221, + "mean_token_accuracy": 0.8551550149917603, + "step": 2465 + }, + { + "epoch": 1.1852207293666026, + "grad_norm": 0.46799379104396804, + "learning_rate": 3.361180234624955e-05, + "loss": 0.4057, + "mean_token_accuracy": 0.8598842024803162, + "step": 2470 + }, + { + "epoch": 1.187619961612284, + "grad_norm": 0.4456420970776554, + "learning_rate": 3.3567365801635265e-05, + "loss": 0.4123, + "mean_token_accuracy": 0.8581973850727082, + "step": 2475 + }, + { + "epoch": 1.1900191938579654, + "grad_norm": 0.45848417063764557, + "learning_rate": 3.352292925702098e-05, + "loss": 0.4107, + "mean_token_accuracy": 0.8585059881210327, + "step": 2480 + }, + { + "epoch": 1.1924184261036468, + "grad_norm": 0.4316965925133896, + "learning_rate": 3.347849271240668e-05, + "loss": 0.4138, + "mean_token_accuracy": 0.8580934762954712, + "step": 2485 + }, + { + "epoch": 1.1948176583493282, + "grad_norm": 0.43525772194220896, + "learning_rate": 3.3434056167792394e-05, + "loss": 0.4149, + "mean_token_accuracy": 0.8575979828834533, + "step": 2490 + }, + { + "epoch": 1.1972168905950096, + "grad_norm": 0.46870966305994644, + "learning_rate": 3.3389619623178106e-05, + "loss": 0.4175, + "mean_token_accuracy": 0.856852000951767, + "step": 2495 + }, + { + "epoch": 1.199616122840691, + "grad_norm": 0.40911552344798013, + "learning_rate": 3.334518307856381e-05, + "loss": 0.4009, + "mean_token_accuracy": 0.8614593029022217, + "step": 2500 + }, + { + "epoch": 1.2020153550863724, + "grad_norm": 0.4340404405974866, + "learning_rate": 3.330074653394952e-05, + "loss": 0.3927, + "mean_token_accuracy": 0.8641118109226227, + "step": 2505 + }, + { + "epoch": 1.2044145873320538, + "grad_norm": 0.43787534396656413, + "learning_rate": 3.3256309989335235e-05, + "loss": 0.4085, + "mean_token_accuracy": 0.8591709673404694, + "step": 2510 + }, + { + "epoch": 1.2068138195777351, + "grad_norm": 0.44863502667369215, + "learning_rate": 3.321187344472094e-05, + "loss": 0.4144, + "mean_token_accuracy": 0.8567050993442535, + "step": 2515 + }, + { + "epoch": 1.2092130518234165, + "grad_norm": 0.4213872953371581, + "learning_rate": 3.316743690010665e-05, + "loss": 0.3892, + "mean_token_accuracy": 0.8648431181907654, + "step": 2520 + }, + { + "epoch": 1.211612284069098, + "grad_norm": 0.3923155586128721, + "learning_rate": 3.3123000355492357e-05, + "loss": 0.4038, + "mean_token_accuracy": 0.8607444286346435, + "step": 2525 + }, + { + "epoch": 1.2140115163147793, + "grad_norm": 0.433505967574003, + "learning_rate": 3.307856381087807e-05, + "loss": 0.4046, + "mean_token_accuracy": 0.8599982798099518, + "step": 2530 + }, + { + "epoch": 1.2164107485604607, + "grad_norm": 0.40196481275606427, + "learning_rate": 3.3034127266263773e-05, + "loss": 0.3952, + "mean_token_accuracy": 0.8635883510112763, + "step": 2535 + }, + { + "epoch": 1.2188099808061421, + "grad_norm": 0.4360372715597363, + "learning_rate": 3.2989690721649485e-05, + "loss": 0.4021, + "mean_token_accuracy": 0.8613421142101287, + "step": 2540 + }, + { + "epoch": 1.2212092130518235, + "grad_norm": 0.4316984682373569, + "learning_rate": 3.29452541770352e-05, + "loss": 0.4086, + "mean_token_accuracy": 0.8592318296432495, + "step": 2545 + }, + { + "epoch": 1.2236084452975047, + "grad_norm": 0.456502910992107, + "learning_rate": 3.29008176324209e-05, + "loss": 0.4066, + "mean_token_accuracy": 0.8597779631614685, + "step": 2550 + }, + { + "epoch": 1.226007677543186, + "grad_norm": 0.4444102807735176, + "learning_rate": 3.2856381087806614e-05, + "loss": 0.4074, + "mean_token_accuracy": 0.8595255970954895, + "step": 2555 + }, + { + "epoch": 1.2284069097888675, + "grad_norm": 0.4231888031917078, + "learning_rate": 3.2811944543192326e-05, + "loss": 0.4072, + "mean_token_accuracy": 0.8598849952220917, + "step": 2560 + }, + { + "epoch": 1.2308061420345489, + "grad_norm": 0.4570773325504898, + "learning_rate": 3.276750799857803e-05, + "loss": 0.4204, + "mean_token_accuracy": 0.85575150847435, + "step": 2565 + }, + { + "epoch": 1.2332053742802302, + "grad_norm": 0.411842115095972, + "learning_rate": 3.272307145396374e-05, + "loss": 0.4013, + "mean_token_accuracy": 0.8611139714717865, + "step": 2570 + }, + { + "epoch": 1.2356046065259116, + "grad_norm": 0.45644905287947346, + "learning_rate": 3.267863490934945e-05, + "loss": 0.4153, + "mean_token_accuracy": 0.8568707346916199, + "step": 2575 + }, + { + "epoch": 1.238003838771593, + "grad_norm": 0.4818186654409561, + "learning_rate": 3.263419836473516e-05, + "loss": 0.4013, + "mean_token_accuracy": 0.8617413580417633, + "step": 2580 + }, + { + "epoch": 1.2404030710172744, + "grad_norm": 0.438770764421927, + "learning_rate": 3.2589761820120865e-05, + "loss": 0.4122, + "mean_token_accuracy": 0.8579687118530274, + "step": 2585 + }, + { + "epoch": 1.2428023032629558, + "grad_norm": 0.4653378155990347, + "learning_rate": 3.254532527550658e-05, + "loss": 0.4183, + "mean_token_accuracy": 0.8564316511154175, + "step": 2590 + }, + { + "epoch": 1.2452015355086372, + "grad_norm": 0.4813156809808044, + "learning_rate": 3.250088873089229e-05, + "loss": 0.4283, + "mean_token_accuracy": 0.853442233800888, + "step": 2595 + }, + { + "epoch": 1.2476007677543186, + "grad_norm": 0.41203719170614067, + "learning_rate": 3.2456452186277994e-05, + "loss": 0.3965, + "mean_token_accuracy": 0.862660163640976, + "step": 2600 + }, + { + "epoch": 1.25, + "grad_norm": 0.47609148798409084, + "learning_rate": 3.2412015641663706e-05, + "loss": 0.4175, + "mean_token_accuracy": 0.8569301187992096, + "step": 2605 + }, + { + "epoch": 1.2523992322456814, + "grad_norm": 0.42629307255923604, + "learning_rate": 3.236757909704942e-05, + "loss": 0.4138, + "mean_token_accuracy": 0.8578989326953887, + "step": 2610 + }, + { + "epoch": 1.2547984644913628, + "grad_norm": 0.4283572421423295, + "learning_rate": 3.232314255243513e-05, + "loss": 0.4105, + "mean_token_accuracy": 0.8592274427413941, + "step": 2615 + }, + { + "epoch": 1.2571976967370442, + "grad_norm": 0.43715619923246357, + "learning_rate": 3.2278706007820834e-05, + "loss": 0.4007, + "mean_token_accuracy": 0.8620129883289337, + "step": 2620 + }, + { + "epoch": 1.2595969289827256, + "grad_norm": 0.42096711462262465, + "learning_rate": 3.223426946320654e-05, + "loss": 0.3966, + "mean_token_accuracy": 0.8628343999385834, + "step": 2625 + }, + { + "epoch": 1.261996161228407, + "grad_norm": 0.41932841031562834, + "learning_rate": 3.218983291859225e-05, + "loss": 0.4051, + "mean_token_accuracy": 0.8604014277458191, + "step": 2630 + }, + { + "epoch": 1.2643953934740884, + "grad_norm": 0.5315771823488319, + "learning_rate": 3.2145396373977956e-05, + "loss": 0.4322, + "mean_token_accuracy": 0.8546823263168335, + "step": 2635 + }, + { + "epoch": 1.2667946257197698, + "grad_norm": 0.41897523637834844, + "learning_rate": 3.210095982936367e-05, + "loss": 0.3968, + "mean_token_accuracy": 0.86270392537117, + "step": 2640 + }, + { + "epoch": 1.2691938579654511, + "grad_norm": 0.40715830938846465, + "learning_rate": 3.205652328474938e-05, + "loss": 0.4068, + "mean_token_accuracy": 0.8598295211791992, + "step": 2645 + }, + { + "epoch": 1.2715930902111325, + "grad_norm": 0.8769975146104805, + "learning_rate": 3.201208674013509e-05, + "loss": 0.425, + "mean_token_accuracy": 0.854360431432724, + "step": 2650 + }, + { + "epoch": 1.273992322456814, + "grad_norm": 3.3831831887481756, + "learning_rate": 3.19676501955208e-05, + "loss": 0.4314, + "mean_token_accuracy": 0.852653294801712, + "step": 2655 + }, + { + "epoch": 1.2763915547024953, + "grad_norm": 0.45730751260407754, + "learning_rate": 3.192321365090651e-05, + "loss": 0.4195, + "mean_token_accuracy": 0.8559574007987976, + "step": 2660 + }, + { + "epoch": 1.2787907869481767, + "grad_norm": 0.4092466623798688, + "learning_rate": 3.187877710629222e-05, + "loss": 0.3926, + "mean_token_accuracy": 0.8644907474517822, + "step": 2665 + }, + { + "epoch": 1.2811900191938579, + "grad_norm": 0.4309016236914282, + "learning_rate": 3.1834340561677926e-05, + "loss": 0.4066, + "mean_token_accuracy": 0.8598381280899048, + "step": 2670 + }, + { + "epoch": 1.2835892514395393, + "grad_norm": 0.5020853714609004, + "learning_rate": 3.178990401706363e-05, + "loss": 0.4025, + "mean_token_accuracy": 0.8616749465465545, + "step": 2675 + }, + { + "epoch": 1.2859884836852207, + "grad_norm": 0.4564585894852153, + "learning_rate": 3.174546747244934e-05, + "loss": 0.4065, + "mean_token_accuracy": 0.8599982798099518, + "step": 2680 + }, + { + "epoch": 1.288387715930902, + "grad_norm": 0.4551031892701387, + "learning_rate": 3.1701030927835054e-05, + "loss": 0.4055, + "mean_token_accuracy": 0.8602951765060425, + "step": 2685 + }, + { + "epoch": 1.2907869481765835, + "grad_norm": 0.5107255154240907, + "learning_rate": 3.165659438322076e-05, + "loss": 0.423, + "mean_token_accuracy": 0.8549253046512604, + "step": 2690 + }, + { + "epoch": 1.2931861804222649, + "grad_norm": 0.500179788278748, + "learning_rate": 3.161215783860647e-05, + "loss": 0.4212, + "mean_token_accuracy": 0.8551692306995392, + "step": 2695 + }, + { + "epoch": 1.2955854126679462, + "grad_norm": 0.4230688657327201, + "learning_rate": 3.156772129399218e-05, + "loss": 0.4213, + "mean_token_accuracy": 0.8560464382171631, + "step": 2700 + }, + { + "epoch": 1.2979846449136276, + "grad_norm": 0.43479209014268144, + "learning_rate": 3.152328474937789e-05, + "loss": 0.3941, + "mean_token_accuracy": 0.8634872555732727, + "step": 2705 + }, + { + "epoch": 1.300383877159309, + "grad_norm": 0.43650872497538457, + "learning_rate": 3.14788482047636e-05, + "loss": 0.408, + "mean_token_accuracy": 0.8591680586338043, + "step": 2710 + }, + { + "epoch": 1.3027831094049904, + "grad_norm": 0.4493400523195851, + "learning_rate": 3.143441166014931e-05, + "loss": 0.4175, + "mean_token_accuracy": 0.8564364910125732, + "step": 2715 + }, + { + "epoch": 1.3051823416506718, + "grad_norm": 0.459115570189103, + "learning_rate": 3.138997511553502e-05, + "loss": 0.403, + "mean_token_accuracy": 0.860854583978653, + "step": 2720 + }, + { + "epoch": 1.3075815738963532, + "grad_norm": 0.45103286240520074, + "learning_rate": 3.134553857092073e-05, + "loss": 0.4134, + "mean_token_accuracy": 0.8586698293685913, + "step": 2725 + }, + { + "epoch": 1.3099808061420346, + "grad_norm": 0.4272327842623446, + "learning_rate": 3.1301102026306434e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.8617429137229919, + "step": 2730 + }, + { + "epoch": 1.312380038387716, + "grad_norm": 0.45529413891054127, + "learning_rate": 3.1256665481692146e-05, + "loss": 0.4129, + "mean_token_accuracy": 0.8579215168952942, + "step": 2735 + }, + { + "epoch": 1.3147792706333974, + "grad_norm": 0.4086111236520043, + "learning_rate": 3.121222893707785e-05, + "loss": 0.4068, + "mean_token_accuracy": 0.8598678052425385, + "step": 2740 + }, + { + "epoch": 1.3171785028790786, + "grad_norm": 0.44772635264843236, + "learning_rate": 3.116779239246356e-05, + "loss": 0.4197, + "mean_token_accuracy": 0.8561370968818665, + "step": 2745 + }, + { + "epoch": 1.31957773512476, + "grad_norm": 0.4521226002995627, + "learning_rate": 3.1123355847849275e-05, + "loss": 0.4242, + "mean_token_accuracy": 0.8544846594333648, + "step": 2750 + }, + { + "epoch": 1.3219769673704413, + "grad_norm": 0.435967523632314, + "learning_rate": 3.107891930323498e-05, + "loss": 0.4005, + "mean_token_accuracy": 0.8615476310253143, + "step": 2755 + }, + { + "epoch": 1.3243761996161227, + "grad_norm": 0.45930032570272467, + "learning_rate": 3.103448275862069e-05, + "loss": 0.4041, + "mean_token_accuracy": 0.8609639704227448, + "step": 2760 + }, + { + "epoch": 1.3267754318618041, + "grad_norm": 0.44549351512795554, + "learning_rate": 3.0990046214006403e-05, + "loss": 0.3965, + "mean_token_accuracy": 0.8633203625679016, + "step": 2765 + }, + { + "epoch": 1.3291746641074855, + "grad_norm": 0.3934861435770103, + "learning_rate": 3.094560966939211e-05, + "loss": 0.4045, + "mean_token_accuracy": 0.859321677684784, + "step": 2770 + }, + { + "epoch": 1.331573896353167, + "grad_norm": 0.41342925620089666, + "learning_rate": 3.090117312477782e-05, + "loss": 0.4033, + "mean_token_accuracy": 0.8610045909881592, + "step": 2775 + }, + { + "epoch": 1.3339731285988483, + "grad_norm": 0.403635823706059, + "learning_rate": 3.0856736580163525e-05, + "loss": 0.3841, + "mean_token_accuracy": 0.8668143391609192, + "step": 2780 + }, + { + "epoch": 1.3363723608445297, + "grad_norm": 0.39217459524479503, + "learning_rate": 3.081230003554924e-05, + "loss": 0.4054, + "mean_token_accuracy": 0.8603837728500366, + "step": 2785 + }, + { + "epoch": 1.338771593090211, + "grad_norm": 0.4134206958266228, + "learning_rate": 3.076786349093494e-05, + "loss": 0.3956, + "mean_token_accuracy": 0.8637172639369964, + "step": 2790 + }, + { + "epoch": 1.3411708253358925, + "grad_norm": 0.41573581786552966, + "learning_rate": 3.0723426946320654e-05, + "loss": 0.4058, + "mean_token_accuracy": 0.8605648100376129, + "step": 2795 + }, + { + "epoch": 1.3435700575815739, + "grad_norm": 0.443513172094798, + "learning_rate": 3.0678990401706366e-05, + "loss": 0.4093, + "mean_token_accuracy": 0.858421903848648, + "step": 2800 + }, + { + "epoch": 1.3459692898272553, + "grad_norm": 0.43938493124740363, + "learning_rate": 3.063455385709207e-05, + "loss": 0.4065, + "mean_token_accuracy": 0.8605037808418274, + "step": 2805 + }, + { + "epoch": 1.3483685220729367, + "grad_norm": 0.4165424593181452, + "learning_rate": 3.059011731247778e-05, + "loss": 0.41, + "mean_token_accuracy": 0.8584935009479523, + "step": 2810 + }, + { + "epoch": 1.350767754318618, + "grad_norm": 0.43948093820333756, + "learning_rate": 3.0545680767863495e-05, + "loss": 0.4058, + "mean_token_accuracy": 0.8599857807159423, + "step": 2815 + }, + { + "epoch": 1.3531669865642995, + "grad_norm": 0.4340762126855867, + "learning_rate": 3.0501244223249203e-05, + "loss": 0.4151, + "mean_token_accuracy": 0.8570738732814789, + "step": 2820 + }, + { + "epoch": 1.3555662188099808, + "grad_norm": 0.4279976668729953, + "learning_rate": 3.0456807678634912e-05, + "loss": 0.4113, + "mean_token_accuracy": 0.8584268510341644, + "step": 2825 + }, + { + "epoch": 1.3579654510556622, + "grad_norm": 0.4643169392851994, + "learning_rate": 3.0412371134020617e-05, + "loss": 0.4082, + "mean_token_accuracy": 0.8594587981700897, + "step": 2830 + }, + { + "epoch": 1.3603646833013436, + "grad_norm": 0.4520395742541979, + "learning_rate": 3.036793458940633e-05, + "loss": 0.4152, + "mean_token_accuracy": 0.8566230773925781, + "step": 2835 + }, + { + "epoch": 1.362763915547025, + "grad_norm": 0.43104350994262125, + "learning_rate": 3.0323498044792037e-05, + "loss": 0.405, + "mean_token_accuracy": 0.8607537925243378, + "step": 2840 + }, + { + "epoch": 1.3651631477927064, + "grad_norm": 0.4946581783931828, + "learning_rate": 3.0279061500177746e-05, + "loss": 0.4108, + "mean_token_accuracy": 0.8585208475589752, + "step": 2845 + }, + { + "epoch": 1.3675623800383878, + "grad_norm": 0.43118015846032265, + "learning_rate": 3.0234624955563457e-05, + "loss": 0.4167, + "mean_token_accuracy": 0.8567094743251801, + "step": 2850 + }, + { + "epoch": 1.3699616122840692, + "grad_norm": 0.4205461305435935, + "learning_rate": 3.0190188410949166e-05, + "loss": 0.4069, + "mean_token_accuracy": 0.8604311168193817, + "step": 2855 + }, + { + "epoch": 1.3723608445297506, + "grad_norm": 0.45673558688969584, + "learning_rate": 3.0145751866334874e-05, + "loss": 0.401, + "mean_token_accuracy": 0.862135136127472, + "step": 2860 + }, + { + "epoch": 1.374760076775432, + "grad_norm": 0.49315897603113723, + "learning_rate": 3.0101315321720586e-05, + "loss": 0.41, + "mean_token_accuracy": 0.8588114857673645, + "step": 2865 + }, + { + "epoch": 1.3771593090211132, + "grad_norm": 0.4411223310245759, + "learning_rate": 3.0056878777106295e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8631333410739899, + "step": 2870 + }, + { + "epoch": 1.3795585412667946, + "grad_norm": 0.4129525665270095, + "learning_rate": 3.0012442232492007e-05, + "loss": 0.3946, + "mean_token_accuracy": 0.8636532902717591, + "step": 2875 + }, + { + "epoch": 1.381957773512476, + "grad_norm": 0.4103741557418468, + "learning_rate": 2.9968005687877708e-05, + "loss": 0.3983, + "mean_token_accuracy": 0.8622260808944702, + "step": 2880 + }, + { + "epoch": 1.3843570057581573, + "grad_norm": 0.40528936732169213, + "learning_rate": 2.992356914326342e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.8614343047142029, + "step": 2885 + }, + { + "epoch": 1.3867562380038387, + "grad_norm": 0.4800728703117366, + "learning_rate": 2.987913259864913e-05, + "loss": 0.4201, + "mean_token_accuracy": 0.8567433834075928, + "step": 2890 + }, + { + "epoch": 1.3891554702495201, + "grad_norm": 0.4384817482064134, + "learning_rate": 2.9834696054034837e-05, + "loss": 0.4136, + "mean_token_accuracy": 0.8570279777050018, + "step": 2895 + }, + { + "epoch": 1.3915547024952015, + "grad_norm": 0.40788290325888715, + "learning_rate": 2.979025950942055e-05, + "loss": 0.4047, + "mean_token_accuracy": 0.8605920732021332, + "step": 2900 + }, + { + "epoch": 1.393953934740883, + "grad_norm": 0.4371937523140351, + "learning_rate": 2.9745822964806257e-05, + "loss": 0.4055, + "mean_token_accuracy": 0.8605655014514924, + "step": 2905 + }, + { + "epoch": 1.3963531669865643, + "grad_norm": 0.42445573104601925, + "learning_rate": 2.970138642019197e-05, + "loss": 0.4054, + "mean_token_accuracy": 0.8602490782737732, + "step": 2910 + }, + { + "epoch": 1.3987523992322457, + "grad_norm": 0.40880871706092814, + "learning_rate": 2.9656949875577678e-05, + "loss": 0.4028, + "mean_token_accuracy": 0.8603983104228974, + "step": 2915 + }, + { + "epoch": 1.401151631477927, + "grad_norm": 0.42266914941617284, + "learning_rate": 2.9612513330963386e-05, + "loss": 0.397, + "mean_token_accuracy": 0.8627976059913636, + "step": 2920 + }, + { + "epoch": 1.4035508637236085, + "grad_norm": 0.43438131430207655, + "learning_rate": 2.9568076786349098e-05, + "loss": 0.4179, + "mean_token_accuracy": 0.8561128735542297, + "step": 2925 + }, + { + "epoch": 1.4059500959692899, + "grad_norm": 0.41884217203981505, + "learning_rate": 2.9523640241734806e-05, + "loss": 0.4081, + "mean_token_accuracy": 0.8592755794525146, + "step": 2930 + }, + { + "epoch": 1.4083493282149713, + "grad_norm": 0.42071658115511323, + "learning_rate": 2.947920369712051e-05, + "loss": 0.4073, + "mean_token_accuracy": 0.858444768190384, + "step": 2935 + }, + { + "epoch": 1.4107485604606527, + "grad_norm": 0.4398724235442647, + "learning_rate": 2.943476715250622e-05, + "loss": 0.4189, + "mean_token_accuracy": 0.856487900018692, + "step": 2940 + }, + { + "epoch": 1.4131477927063338, + "grad_norm": 0.4153146639627856, + "learning_rate": 2.9390330607891932e-05, + "loss": 0.4054, + "mean_token_accuracy": 0.8598765075206757, + "step": 2945 + }, + { + "epoch": 1.4155470249520152, + "grad_norm": 0.4246136142452087, + "learning_rate": 2.934589406327764e-05, + "loss": 0.4058, + "mean_token_accuracy": 0.859784209728241, + "step": 2950 + }, + { + "epoch": 1.4179462571976966, + "grad_norm": 0.4368135342362413, + "learning_rate": 2.930145751866335e-05, + "loss": 0.4108, + "mean_token_accuracy": 0.8586536586284638, + "step": 2955 + }, + { + "epoch": 1.420345489443378, + "grad_norm": 0.42306966276186897, + "learning_rate": 2.925702097404906e-05, + "loss": 0.4027, + "mean_token_accuracy": 0.8606998920440674, + "step": 2960 + }, + { + "epoch": 1.4227447216890594, + "grad_norm": 0.40958147969244185, + "learning_rate": 2.921258442943477e-05, + "loss": 0.4083, + "mean_token_accuracy": 0.8588349342346191, + "step": 2965 + }, + { + "epoch": 1.4251439539347408, + "grad_norm": 0.4186234420817474, + "learning_rate": 2.9168147884820478e-05, + "loss": 0.4103, + "mean_token_accuracy": 0.8586184978485107, + "step": 2970 + }, + { + "epoch": 1.4275431861804222, + "grad_norm": 0.43568043114669147, + "learning_rate": 2.912371134020619e-05, + "loss": 0.42, + "mean_token_accuracy": 0.8555362820625305, + "step": 2975 + }, + { + "epoch": 1.4299424184261036, + "grad_norm": 0.37993098256654056, + "learning_rate": 2.9079274795591898e-05, + "loss": 0.3977, + "mean_token_accuracy": 0.8621952950954437, + "step": 2980 + }, + { + "epoch": 1.432341650671785, + "grad_norm": 0.40862306924190445, + "learning_rate": 2.9034838250977603e-05, + "loss": 0.4099, + "mean_token_accuracy": 0.8586802303791046, + "step": 2985 + }, + { + "epoch": 1.4347408829174664, + "grad_norm": 0.4256385901866345, + "learning_rate": 2.899040170636331e-05, + "loss": 0.3993, + "mean_token_accuracy": 0.8616679191589356, + "step": 2990 + }, + { + "epoch": 1.4371401151631478, + "grad_norm": 0.4444750312443054, + "learning_rate": 2.8945965161749023e-05, + "loss": 0.4138, + "mean_token_accuracy": 0.8581645727157593, + "step": 2995 + }, + { + "epoch": 1.4395393474088292, + "grad_norm": 0.42054008525573455, + "learning_rate": 2.8901528617134732e-05, + "loss": 0.3985, + "mean_token_accuracy": 0.8622124850749969, + "step": 3000 + }, + { + "epoch": 1.4419385796545106, + "grad_norm": 0.39199991797047684, + "learning_rate": 2.8857092072520444e-05, + "loss": 0.3978, + "mean_token_accuracy": 0.8625296950340271, + "step": 3005 + }, + { + "epoch": 1.444337811900192, + "grad_norm": 0.4252466446636856, + "learning_rate": 2.8812655527906152e-05, + "loss": 0.4053, + "mean_token_accuracy": 0.8601209402084351, + "step": 3010 + }, + { + "epoch": 1.4467370441458733, + "grad_norm": 0.4250623615104366, + "learning_rate": 2.876821898329186e-05, + "loss": 0.4018, + "mean_token_accuracy": 0.8616835534572601, + "step": 3015 + }, + { + "epoch": 1.4491362763915547, + "grad_norm": 0.4072236614562204, + "learning_rate": 2.8723782438677572e-05, + "loss": 0.4009, + "mean_token_accuracy": 0.8617772936820984, + "step": 3020 + }, + { + "epoch": 1.4515355086372361, + "grad_norm": 0.39611034940959056, + "learning_rate": 2.867934589406328e-05, + "loss": 0.3968, + "mean_token_accuracy": 0.8629406571388245, + "step": 3025 + }, + { + "epoch": 1.4539347408829175, + "grad_norm": 0.4027069569602149, + "learning_rate": 2.863490934944899e-05, + "loss": 0.4053, + "mean_token_accuracy": 0.8606762647628784, + "step": 3030 + }, + { + "epoch": 1.456333973128599, + "grad_norm": 0.43051154118237706, + "learning_rate": 2.8590472804834694e-05, + "loss": 0.4094, + "mean_token_accuracy": 0.859013843536377, + "step": 3035 + }, + { + "epoch": 1.4587332053742803, + "grad_norm": 0.42541461100511707, + "learning_rate": 2.8546036260220406e-05, + "loss": 0.4072, + "mean_token_accuracy": 0.860070937871933, + "step": 3040 + }, + { + "epoch": 1.4611324376199617, + "grad_norm": 0.41627586707069786, + "learning_rate": 2.8501599715606115e-05, + "loss": 0.4167, + "mean_token_accuracy": 0.8566191613674163, + "step": 3045 + }, + { + "epoch": 1.463531669865643, + "grad_norm": 0.41940607277568154, + "learning_rate": 2.8457163170991823e-05, + "loss": 0.401, + "mean_token_accuracy": 0.8612983524799347, + "step": 3050 + }, + { + "epoch": 1.4659309021113245, + "grad_norm": 0.3978647928798745, + "learning_rate": 2.8412726626377535e-05, + "loss": 0.4126, + "mean_token_accuracy": 0.8580301880836487, + "step": 3055 + }, + { + "epoch": 1.4683301343570059, + "grad_norm": 0.4688543778463637, + "learning_rate": 2.8368290081763243e-05, + "loss": 0.4012, + "mean_token_accuracy": 0.8619030773639679, + "step": 3060 + }, + { + "epoch": 1.4707293666026873, + "grad_norm": 0.4247011368186658, + "learning_rate": 2.8323853537148952e-05, + "loss": 0.4119, + "mean_token_accuracy": 0.8577637672424316, + "step": 3065 + }, + { + "epoch": 1.4731285988483684, + "grad_norm": 0.3864061246345447, + "learning_rate": 2.8279416992534664e-05, + "loss": 0.4062, + "mean_token_accuracy": 0.8601014137268066, + "step": 3070 + }, + { + "epoch": 1.4755278310940498, + "grad_norm": 0.4127338674644227, + "learning_rate": 2.8234980447920372e-05, + "loss": 0.4116, + "mean_token_accuracy": 0.8582200467586517, + "step": 3075 + }, + { + "epoch": 1.4779270633397312, + "grad_norm": 0.4242265641239629, + "learning_rate": 2.8190543903306084e-05, + "loss": 0.4235, + "mean_token_accuracy": 0.85434011220932, + "step": 3080 + }, + { + "epoch": 1.4803262955854126, + "grad_norm": 0.461521417802285, + "learning_rate": 2.8146107358691786e-05, + "loss": 0.4047, + "mean_token_accuracy": 0.860513162612915, + "step": 3085 + }, + { + "epoch": 1.482725527831094, + "grad_norm": 0.3917183172391874, + "learning_rate": 2.8101670814077498e-05, + "loss": 0.4089, + "mean_token_accuracy": 0.8591904163360595, + "step": 3090 + }, + { + "epoch": 1.4851247600767754, + "grad_norm": 0.43071306913770446, + "learning_rate": 2.8057234269463206e-05, + "loss": 0.4206, + "mean_token_accuracy": 0.8550276577472686, + "step": 3095 + }, + { + "epoch": 1.4875239923224568, + "grad_norm": 0.40999435056243977, + "learning_rate": 2.8012797724848915e-05, + "loss": 0.3958, + "mean_token_accuracy": 0.8640438497066498, + "step": 3100 + }, + { + "epoch": 1.4899232245681382, + "grad_norm": 0.4383534888886836, + "learning_rate": 2.7968361180234626e-05, + "loss": 0.4004, + "mean_token_accuracy": 0.8615210294723511, + "step": 3105 + }, + { + "epoch": 1.4923224568138196, + "grad_norm": 0.4018639584899305, + "learning_rate": 2.7923924635620335e-05, + "loss": 0.4, + "mean_token_accuracy": 0.8615460336208344, + "step": 3110 + }, + { + "epoch": 1.494721689059501, + "grad_norm": 0.4176219228220342, + "learning_rate": 2.7879488091006047e-05, + "loss": 0.4013, + "mean_token_accuracy": 0.8609389662742615, + "step": 3115 + }, + { + "epoch": 1.4971209213051824, + "grad_norm": 0.44302343248595116, + "learning_rate": 2.7835051546391755e-05, + "loss": 0.4106, + "mean_token_accuracy": 0.8591771364212036, + "step": 3120 + }, + { + "epoch": 1.4995201535508638, + "grad_norm": 0.4178864693802271, + "learning_rate": 2.7790615001777464e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.861190801858902, + "step": 3125 + }, + { + "epoch": 1.5019193857965452, + "grad_norm": 0.44169993683798264, + "learning_rate": 2.7746178457163175e-05, + "loss": 0.411, + "mean_token_accuracy": 0.8583481729030609, + "step": 3130 + }, + { + "epoch": 1.5043186180422263, + "grad_norm": 0.4080953695790485, + "learning_rate": 2.7701741912548884e-05, + "loss": 0.3981, + "mean_token_accuracy": 0.8621671617031097, + "step": 3135 + }, + { + "epoch": 1.5067178502879077, + "grad_norm": 0.3993318929444135, + "learning_rate": 2.765730536793459e-05, + "loss": 0.3962, + "mean_token_accuracy": 0.8624468684196472, + "step": 3140 + }, + { + "epoch": 1.5091170825335891, + "grad_norm": 0.42653052791592105, + "learning_rate": 2.7612868823320297e-05, + "loss": 0.4071, + "mean_token_accuracy": 0.858930242061615, + "step": 3145 + }, + { + "epoch": 1.5115163147792705, + "grad_norm": 0.42799410610582433, + "learning_rate": 2.756843227870601e-05, + "loss": 0.3989, + "mean_token_accuracy": 0.8621241927146912, + "step": 3150 + }, + { + "epoch": 1.513915547024952, + "grad_norm": 0.4269578808768684, + "learning_rate": 2.7523995734091718e-05, + "loss": 0.3916, + "mean_token_accuracy": 0.8643751204013824, + "step": 3155 + }, + { + "epoch": 1.5163147792706333, + "grad_norm": 0.407062947790096, + "learning_rate": 2.7479559189477426e-05, + "loss": 0.4188, + "mean_token_accuracy": 0.8555765926837922, + "step": 3160 + }, + { + "epoch": 1.5187140115163147, + "grad_norm": 0.3994207604991086, + "learning_rate": 2.7435122644863138e-05, + "loss": 0.4142, + "mean_token_accuracy": 0.8575942277908325, + "step": 3165 + }, + { + "epoch": 1.521113243761996, + "grad_norm": 0.43620726489357947, + "learning_rate": 2.7390686100248847e-05, + "loss": 0.3989, + "mean_token_accuracy": 0.8623096764087677, + "step": 3170 + }, + { + "epoch": 1.5235124760076775, + "grad_norm": 0.4056020697430398, + "learning_rate": 2.7346249555634555e-05, + "loss": 0.4246, + "mean_token_accuracy": 0.8546995222568512, + "step": 3175 + }, + { + "epoch": 1.5259117082533589, + "grad_norm": 0.4289924980887499, + "learning_rate": 2.7301813011020267e-05, + "loss": 0.4002, + "mean_token_accuracy": 0.8619452834129333, + "step": 3180 + }, + { + "epoch": 1.5283109404990403, + "grad_norm": 0.4000591175003932, + "learning_rate": 2.7257376466405975e-05, + "loss": 0.4014, + "mean_token_accuracy": 0.8615616619586944, + "step": 3185 + }, + { + "epoch": 1.5307101727447217, + "grad_norm": 0.4298865662213465, + "learning_rate": 2.721293992179168e-05, + "loss": 0.4036, + "mean_token_accuracy": 0.8603943943977356, + "step": 3190 + }, + { + "epoch": 1.533109404990403, + "grad_norm": 0.4148912869232642, + "learning_rate": 2.716850337717739e-05, + "loss": 0.3952, + "mean_token_accuracy": 0.863661789894104, + "step": 3195 + }, + { + "epoch": 1.5355086372360844, + "grad_norm": 0.4005028765836154, + "learning_rate": 2.71240668325631e-05, + "loss": 0.4132, + "mean_token_accuracy": 0.8576488554477691, + "step": 3200 + }, + { + "epoch": 1.5379078694817658, + "grad_norm": 0.5863150189773588, + "learning_rate": 2.707963028794881e-05, + "loss": 0.4254, + "mean_token_accuracy": 0.855337244272232, + "step": 3205 + }, + { + "epoch": 1.5403071017274472, + "grad_norm": 0.4479332320371879, + "learning_rate": 2.7035193743334518e-05, + "loss": 0.4111, + "mean_token_accuracy": 0.8578731298446656, + "step": 3210 + }, + { + "epoch": 1.5427063339731286, + "grad_norm": 0.3975764092017198, + "learning_rate": 2.699075719872023e-05, + "loss": 0.4002, + "mean_token_accuracy": 0.8622437357902527, + "step": 3215 + }, + { + "epoch": 1.54510556621881, + "grad_norm": 0.40011161992288835, + "learning_rate": 2.6946320654105938e-05, + "loss": 0.4059, + "mean_token_accuracy": 0.8596208989620209, + "step": 3220 + }, + { + "epoch": 1.5475047984644914, + "grad_norm": 0.38016688564090934, + "learning_rate": 2.690188410949165e-05, + "loss": 0.3985, + "mean_token_accuracy": 0.8621473670005798, + "step": 3225 + }, + { + "epoch": 1.5499040307101728, + "grad_norm": 0.4138659075634796, + "learning_rate": 2.6857447564877358e-05, + "loss": 0.4058, + "mean_token_accuracy": 0.8596255898475647, + "step": 3230 + }, + { + "epoch": 1.5523032629558542, + "grad_norm": 0.422626771580196, + "learning_rate": 2.6813011020263067e-05, + "loss": 0.406, + "mean_token_accuracy": 0.8597724795341491, + "step": 3235 + }, + { + "epoch": 1.5547024952015356, + "grad_norm": 0.4170391560591557, + "learning_rate": 2.6768574475648772e-05, + "loss": 0.4004, + "mean_token_accuracy": 0.8617077708244324, + "step": 3240 + }, + { + "epoch": 1.557101727447217, + "grad_norm": 0.4275526743052804, + "learning_rate": 2.672413793103448e-05, + "loss": 0.3952, + "mean_token_accuracy": 0.8631601929664612, + "step": 3245 + }, + { + "epoch": 1.5595009596928984, + "grad_norm": 0.46752720154183036, + "learning_rate": 2.6679701386420192e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.8601115763187408, + "step": 3250 + }, + { + "epoch": 1.5619001919385798, + "grad_norm": 0.48835763222260903, + "learning_rate": 2.66352648418059e-05, + "loss": 0.3944, + "mean_token_accuracy": 0.8636266410350799, + "step": 3255 + }, + { + "epoch": 1.5642994241842612, + "grad_norm": 0.4312067375707565, + "learning_rate": 2.6590828297191612e-05, + "loss": 0.4027, + "mean_token_accuracy": 0.8612389922142029, + "step": 3260 + }, + { + "epoch": 1.5666986564299425, + "grad_norm": 0.42201364947376946, + "learning_rate": 2.654639175257732e-05, + "loss": 0.4171, + "mean_token_accuracy": 0.8569965362548828, + "step": 3265 + }, + { + "epoch": 1.569097888675624, + "grad_norm": 0.41974298276995264, + "learning_rate": 2.650195520796303e-05, + "loss": 0.4067, + "mean_token_accuracy": 0.8597201347351074, + "step": 3270 + }, + { + "epoch": 1.5714971209213053, + "grad_norm": 0.39120429378823546, + "learning_rate": 2.645751866334874e-05, + "loss": 0.4077, + "mean_token_accuracy": 0.8595295011997223, + "step": 3275 + }, + { + "epoch": 1.5738963531669867, + "grad_norm": 0.3694200404366206, + "learning_rate": 2.641308211873445e-05, + "loss": 0.4008, + "mean_token_accuracy": 0.8617702662944794, + "step": 3280 + }, + { + "epoch": 1.576295585412668, + "grad_norm": 0.41337781650891703, + "learning_rate": 2.636864557412016e-05, + "loss": 0.4017, + "mean_token_accuracy": 0.8608327209949493, + "step": 3285 + }, + { + "epoch": 1.5786948176583493, + "grad_norm": 0.4072382286123216, + "learning_rate": 2.632420902950587e-05, + "loss": 0.3983, + "mean_token_accuracy": 0.862227326631546, + "step": 3290 + }, + { + "epoch": 1.5810940499040307, + "grad_norm": 0.4103556260194619, + "learning_rate": 2.6279772484891575e-05, + "loss": 0.3938, + "mean_token_accuracy": 0.8634500622749328, + "step": 3295 + }, + { + "epoch": 1.583493282149712, + "grad_norm": 0.38369554256372573, + "learning_rate": 2.6235335940277284e-05, + "loss": 0.3826, + "mean_token_accuracy": 0.8668561160564423, + "step": 3300 + }, + { + "epoch": 1.5858925143953935, + "grad_norm": 0.4153210125652736, + "learning_rate": 2.6190899395662992e-05, + "loss": 0.4074, + "mean_token_accuracy": 0.8587499022483825, + "step": 3305 + }, + { + "epoch": 1.5882917466410749, + "grad_norm": 0.3882161148344929, + "learning_rate": 2.6146462851048704e-05, + "loss": 0.405, + "mean_token_accuracy": 0.8604959726333619, + "step": 3310 + }, + { + "epoch": 1.5906909788867563, + "grad_norm": 0.41476299336957645, + "learning_rate": 2.6102026306434412e-05, + "loss": 0.4005, + "mean_token_accuracy": 0.8615757286548614, + "step": 3315 + }, + { + "epoch": 1.5930902111324377, + "grad_norm": 0.39628831496502565, + "learning_rate": 2.6057589761820124e-05, + "loss": 0.3976, + "mean_token_accuracy": 0.8621320128440857, + "step": 3320 + }, + { + "epoch": 1.595489443378119, + "grad_norm": 0.4286234719013056, + "learning_rate": 2.6013153217205833e-05, + "loss": 0.395, + "mean_token_accuracy": 0.8630506694316864, + "step": 3325 + }, + { + "epoch": 1.5978886756238004, + "grad_norm": 0.4238437642153525, + "learning_rate": 2.596871667259154e-05, + "loss": 0.4024, + "mean_token_accuracy": 0.8605756640434266, + "step": 3330 + }, + { + "epoch": 1.6002879078694816, + "grad_norm": 0.40038266554374713, + "learning_rate": 2.5924280127977253e-05, + "loss": 0.3903, + "mean_token_accuracy": 0.8641821384429932, + "step": 3335 + }, + { + "epoch": 1.602687140115163, + "grad_norm": 0.41971373829719233, + "learning_rate": 2.587984358336296e-05, + "loss": 0.4123, + "mean_token_accuracy": 0.8578661143779754, + "step": 3340 + }, + { + "epoch": 1.6050863723608444, + "grad_norm": 0.44399643165166497, + "learning_rate": 2.5835407038748667e-05, + "loss": 0.4126, + "mean_token_accuracy": 0.8577236354351043, + "step": 3345 + }, + { + "epoch": 1.6074856046065258, + "grad_norm": 0.39536435977240997, + "learning_rate": 2.5790970494134375e-05, + "loss": 0.4012, + "mean_token_accuracy": 0.8612061738967896, + "step": 3350 + }, + { + "epoch": 1.6098848368522072, + "grad_norm": 0.38863987975951847, + "learning_rate": 2.5746533949520087e-05, + "loss": 0.3984, + "mean_token_accuracy": 0.8619537353515625, + "step": 3355 + }, + { + "epoch": 1.6122840690978886, + "grad_norm": 0.39991351372949546, + "learning_rate": 2.5702097404905795e-05, + "loss": 0.3899, + "mean_token_accuracy": 0.8649181246757507, + "step": 3360 + }, + { + "epoch": 1.61468330134357, + "grad_norm": 0.398298780822706, + "learning_rate": 2.5657660860291504e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.8612264811992645, + "step": 3365 + }, + { + "epoch": 1.6170825335892514, + "grad_norm": 0.4074770492939784, + "learning_rate": 2.5613224315677216e-05, + "loss": 0.4003, + "mean_token_accuracy": 0.8616210460662842, + "step": 3370 + }, + { + "epoch": 1.6194817658349328, + "grad_norm": 0.45214174453225614, + "learning_rate": 2.5568787771062924e-05, + "loss": 0.4167, + "mean_token_accuracy": 0.8581067621707916, + "step": 3375 + }, + { + "epoch": 1.6218809980806141, + "grad_norm": 0.4256452714311465, + "learning_rate": 2.5524351226448633e-05, + "loss": 0.4161, + "mean_token_accuracy": 0.8563941478729248, + "step": 3380 + }, + { + "epoch": 1.6242802303262955, + "grad_norm": 0.38160619565953624, + "learning_rate": 2.5479914681834344e-05, + "loss": 0.3894, + "mean_token_accuracy": 0.865361112356186, + "step": 3385 + }, + { + "epoch": 1.626679462571977, + "grad_norm": 0.3829043295701325, + "learning_rate": 2.5435478137220053e-05, + "loss": 0.4023, + "mean_token_accuracy": 0.8603506445884704, + "step": 3390 + }, + { + "epoch": 1.6290786948176583, + "grad_norm": 0.4223589108616364, + "learning_rate": 2.5391041592605758e-05, + "loss": 0.4002, + "mean_token_accuracy": 0.8612348198890686, + "step": 3395 + }, + { + "epoch": 1.6314779270633397, + "grad_norm": 0.4200284982519176, + "learning_rate": 2.5346605047991466e-05, + "loss": 0.3995, + "mean_token_accuracy": 0.8623109340667725, + "step": 3400 + }, + { + "epoch": 1.633877159309021, + "grad_norm": 0.4111996448465633, + "learning_rate": 2.5302168503377178e-05, + "loss": 0.4056, + "mean_token_accuracy": 0.8594146549701691, + "step": 3405 + }, + { + "epoch": 1.6362763915547025, + "grad_norm": 0.42908333059801745, + "learning_rate": 2.5257731958762887e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.860127204656601, + "step": 3410 + }, + { + "epoch": 1.638675623800384, + "grad_norm": 0.42152377015778175, + "learning_rate": 2.5213295414148595e-05, + "loss": 0.3973, + "mean_token_accuracy": 0.8624828159809113, + "step": 3415 + }, + { + "epoch": 1.6410748560460653, + "grad_norm": 0.36981715731062414, + "learning_rate": 2.5168858869534307e-05, + "loss": 0.4041, + "mean_token_accuracy": 0.8604373574256897, + "step": 3420 + }, + { + "epoch": 1.6434740882917467, + "grad_norm": 0.394194627184814, + "learning_rate": 2.5124422324920015e-05, + "loss": 0.3878, + "mean_token_accuracy": 0.865257203578949, + "step": 3425 + }, + { + "epoch": 1.645873320537428, + "grad_norm": 0.39941467192632707, + "learning_rate": 2.5079985780305727e-05, + "loss": 0.3909, + "mean_token_accuracy": 0.8645266890525818, + "step": 3430 + }, + { + "epoch": 1.6482725527831095, + "grad_norm": 0.39808401212537314, + "learning_rate": 2.5035549235691436e-05, + "loss": 0.3944, + "mean_token_accuracy": 0.8638250768184662, + "step": 3435 + }, + { + "epoch": 1.6506717850287909, + "grad_norm": 0.4260497481390886, + "learning_rate": 2.4991112691077144e-05, + "loss": 0.4059, + "mean_token_accuracy": 0.8600857853889465, + "step": 3440 + }, + { + "epoch": 1.6530710172744723, + "grad_norm": 0.39232268275793675, + "learning_rate": 2.4946676146462853e-05, + "loss": 0.4011, + "mean_token_accuracy": 0.8610795974731446, + "step": 3445 + }, + { + "epoch": 1.6554702495201536, + "grad_norm": 0.3703252160647266, + "learning_rate": 2.490223960184856e-05, + "loss": 0.401, + "mean_token_accuracy": 0.8614936888217926, + "step": 3450 + }, + { + "epoch": 1.657869481765835, + "grad_norm": 0.38835790843763873, + "learning_rate": 2.4857803057234273e-05, + "loss": 0.3963, + "mean_token_accuracy": 0.8633289515972138, + "step": 3455 + }, + { + "epoch": 1.6602687140115164, + "grad_norm": 0.3792197463821151, + "learning_rate": 2.4813366512619978e-05, + "loss": 0.4005, + "mean_token_accuracy": 0.8607326924800873, + "step": 3460 + }, + { + "epoch": 1.6626679462571978, + "grad_norm": 0.4162862689767936, + "learning_rate": 2.476892996800569e-05, + "loss": 0.4097, + "mean_token_accuracy": 0.8588239908218384, + "step": 3465 + }, + { + "epoch": 1.6650671785028792, + "grad_norm": 0.4013668992861733, + "learning_rate": 2.47244934233914e-05, + "loss": 0.399, + "mean_token_accuracy": 0.8627789199352265, + "step": 3470 + }, + { + "epoch": 1.6674664107485606, + "grad_norm": 0.400314301487421, + "learning_rate": 2.4680056878777107e-05, + "loss": 0.4114, + "mean_token_accuracy": 0.8580051839351654, + "step": 3475 + }, + { + "epoch": 1.669865642994242, + "grad_norm": 0.3846055122002692, + "learning_rate": 2.463562033416282e-05, + "loss": 0.4023, + "mean_token_accuracy": 0.8605178356170654, + "step": 3480 + }, + { + "epoch": 1.6722648752399232, + "grad_norm": 0.4139839951325302, + "learning_rate": 2.4591183789548524e-05, + "loss": 0.4028, + "mean_token_accuracy": 0.8606350421905518, + "step": 3485 + }, + { + "epoch": 1.6746641074856046, + "grad_norm": 0.4053280087192033, + "learning_rate": 2.4546747244934236e-05, + "loss": 0.4203, + "mean_token_accuracy": 0.8548753023147583, + "step": 3490 + }, + { + "epoch": 1.677063339731286, + "grad_norm": 0.40101094344450555, + "learning_rate": 2.4502310700319944e-05, + "loss": 0.4026, + "mean_token_accuracy": 0.8610198259353637, + "step": 3495 + }, + { + "epoch": 1.6794625719769674, + "grad_norm": 0.42870363162924036, + "learning_rate": 2.4457874155705653e-05, + "loss": 0.4022, + "mean_token_accuracy": 0.8605069994926453, + "step": 3500 + }, + { + "epoch": 1.6818618042226487, + "grad_norm": 0.40157835655622814, + "learning_rate": 2.4413437611091364e-05, + "loss": 0.4045, + "mean_token_accuracy": 0.8600060880184174, + "step": 3505 + }, + { + "epoch": 1.6842610364683301, + "grad_norm": 0.49268372911579256, + "learning_rate": 2.436900106647707e-05, + "loss": 0.3982, + "mean_token_accuracy": 0.8626984477043151, + "step": 3510 + }, + { + "epoch": 1.6866602687140115, + "grad_norm": 0.4068686641753371, + "learning_rate": 2.432456452186278e-05, + "loss": 0.4064, + "mean_token_accuracy": 0.8597881138324738, + "step": 3515 + }, + { + "epoch": 1.689059500959693, + "grad_norm": 0.4204236115303968, + "learning_rate": 2.428012797724849e-05, + "loss": 0.4074, + "mean_token_accuracy": 0.8585520923137665, + "step": 3520 + }, + { + "epoch": 1.6914587332053743, + "grad_norm": 0.4231286443089349, + "learning_rate": 2.4235691432634198e-05, + "loss": 0.4112, + "mean_token_accuracy": 0.858112221956253, + "step": 3525 + }, + { + "epoch": 1.6938579654510557, + "grad_norm": 0.41874689386169434, + "learning_rate": 2.419125488801991e-05, + "loss": 0.3886, + "mean_token_accuracy": 0.8646687090396881, + "step": 3530 + }, + { + "epoch": 1.6962571976967369, + "grad_norm": 0.41289939460863173, + "learning_rate": 2.4146818343405615e-05, + "loss": 0.396, + "mean_token_accuracy": 0.8622913122177124, + "step": 3535 + }, + { + "epoch": 1.6986564299424183, + "grad_norm": 0.3700205825502841, + "learning_rate": 2.4102381798791327e-05, + "loss": 0.3928, + "mean_token_accuracy": 0.8635813176631928, + "step": 3540 + }, + { + "epoch": 1.7010556621880997, + "grad_norm": 0.40716705114833024, + "learning_rate": 2.4057945254177036e-05, + "loss": 0.3933, + "mean_token_accuracy": 0.863760232925415, + "step": 3545 + }, + { + "epoch": 1.703454894433781, + "grad_norm": 0.3969600924112917, + "learning_rate": 2.4013508709562747e-05, + "loss": 0.4017, + "mean_token_accuracy": 0.8610733449459076, + "step": 3550 + }, + { + "epoch": 1.7058541266794625, + "grad_norm": 0.40472357502298995, + "learning_rate": 2.3969072164948456e-05, + "loss": 0.3821, + "mean_token_accuracy": 0.8672096610069275, + "step": 3555 + }, + { + "epoch": 1.7082533589251438, + "grad_norm": 0.40035135249600823, + "learning_rate": 2.3924635620334164e-05, + "loss": 0.3981, + "mean_token_accuracy": 0.8622802495956421, + "step": 3560 + }, + { + "epoch": 1.7106525911708252, + "grad_norm": 0.4066364825773236, + "learning_rate": 2.3880199075719873e-05, + "loss": 0.4025, + "mean_token_accuracy": 0.8600334405899048, + "step": 3565 + }, + { + "epoch": 1.7130518234165066, + "grad_norm": 0.38018607577288216, + "learning_rate": 2.383576253110558e-05, + "loss": 0.4043, + "mean_token_accuracy": 0.8596990585327149, + "step": 3570 + }, + { + "epoch": 1.715451055662188, + "grad_norm": 0.42371739940852104, + "learning_rate": 2.3791325986491293e-05, + "loss": 0.3929, + "mean_token_accuracy": 0.8638055503368378, + "step": 3575 + }, + { + "epoch": 1.7178502879078694, + "grad_norm": 0.40407956177407894, + "learning_rate": 2.3746889441877e-05, + "loss": 0.386, + "mean_token_accuracy": 0.8652970552444458, + "step": 3580 + }, + { + "epoch": 1.7202495201535508, + "grad_norm": 0.3997203361126536, + "learning_rate": 2.370245289726271e-05, + "loss": 0.3971, + "mean_token_accuracy": 0.862407797574997, + "step": 3585 + }, + { + "epoch": 1.7226487523992322, + "grad_norm": 0.4177803154829972, + "learning_rate": 2.365801635264842e-05, + "loss": 0.3947, + "mean_token_accuracy": 0.8625789046287536, + "step": 3590 + }, + { + "epoch": 1.7250479846449136, + "grad_norm": 0.4182283380641395, + "learning_rate": 2.3613579808034127e-05, + "loss": 0.4049, + "mean_token_accuracy": 0.8597943723201752, + "step": 3595 + }, + { + "epoch": 1.727447216890595, + "grad_norm": 0.4010408090491947, + "learning_rate": 2.356914326341984e-05, + "loss": 0.386, + "mean_token_accuracy": 0.8659861564636231, + "step": 3600 + }, + { + "epoch": 1.7298464491362764, + "grad_norm": 0.3676558695389256, + "learning_rate": 2.3524706718805547e-05, + "loss": 0.3922, + "mean_token_accuracy": 0.8638743042945862, + "step": 3605 + }, + { + "epoch": 1.7322456813819578, + "grad_norm": 0.3902289844190933, + "learning_rate": 2.3480270174191256e-05, + "loss": 0.4022, + "mean_token_accuracy": 0.861260861158371, + "step": 3610 + }, + { + "epoch": 1.7346449136276392, + "grad_norm": 0.3997085585871422, + "learning_rate": 2.3435833629576964e-05, + "loss": 0.3978, + "mean_token_accuracy": 0.8619546592235565, + "step": 3615 + }, + { + "epoch": 1.7370441458733206, + "grad_norm": 0.3921537008472013, + "learning_rate": 2.3391397084962673e-05, + "loss": 0.3964, + "mean_token_accuracy": 0.8633320808410645, + "step": 3620 + }, + { + "epoch": 1.739443378119002, + "grad_norm": 0.4129934478129214, + "learning_rate": 2.3346960540348384e-05, + "loss": 0.3956, + "mean_token_accuracy": 0.8630765914916992, + "step": 3625 + }, + { + "epoch": 1.7418426103646834, + "grad_norm": 0.42221672710484526, + "learning_rate": 2.3302523995734093e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.8610452234745025, + "step": 3630 + }, + { + "epoch": 1.7442418426103647, + "grad_norm": 0.38243888997259784, + "learning_rate": 2.3258087451119805e-05, + "loss": 0.3924, + "mean_token_accuracy": 0.8632264256477356, + "step": 3635 + }, + { + "epoch": 1.7466410748560461, + "grad_norm": 0.4191338371404725, + "learning_rate": 2.321365090650551e-05, + "loss": 0.4009, + "mean_token_accuracy": 0.8605326831340789, + "step": 3640 + }, + { + "epoch": 1.7490403071017275, + "grad_norm": 0.39265771608065647, + "learning_rate": 2.316921436189122e-05, + "loss": 0.3978, + "mean_token_accuracy": 0.8619015336036682, + "step": 3645 + }, + { + "epoch": 1.751439539347409, + "grad_norm": 1.3814131145219963, + "learning_rate": 2.312477781727693e-05, + "loss": 0.396, + "mean_token_accuracy": 0.8628222942352295, + "step": 3650 + }, + { + "epoch": 1.7538387715930903, + "grad_norm": 0.41013751416722777, + "learning_rate": 2.308034127266264e-05, + "loss": 0.3953, + "mean_token_accuracy": 0.8634633362293244, + "step": 3655 + }, + { + "epoch": 1.7562380038387717, + "grad_norm": 0.41357541362248335, + "learning_rate": 2.303590472804835e-05, + "loss": 0.3924, + "mean_token_accuracy": 0.8636547565460205, + "step": 3660 + }, + { + "epoch": 1.758637236084453, + "grad_norm": 0.39842235759316297, + "learning_rate": 2.2991468183434056e-05, + "loss": 0.4014, + "mean_token_accuracy": 0.8601654767990112, + "step": 3665 + }, + { + "epoch": 1.7610364683301345, + "grad_norm": 0.37838926308872844, + "learning_rate": 2.2947031638819767e-05, + "loss": 0.3843, + "mean_token_accuracy": 0.865840059518814, + "step": 3670 + }, + { + "epoch": 1.763435700575816, + "grad_norm": 0.3917883727663996, + "learning_rate": 2.2902595094205476e-05, + "loss": 0.3779, + "mean_token_accuracy": 0.8679964363574981, + "step": 3675 + }, + { + "epoch": 1.7658349328214973, + "grad_norm": 0.3898434426651832, + "learning_rate": 2.2858158549591184e-05, + "loss": 0.3997, + "mean_token_accuracy": 0.8615639984607697, + "step": 3680 + }, + { + "epoch": 1.7682341650671785, + "grad_norm": 0.38658220390707115, + "learning_rate": 2.2813722004976896e-05, + "loss": 0.3888, + "mean_token_accuracy": 0.8650773525238037, + "step": 3685 + }, + { + "epoch": 1.7706333973128598, + "grad_norm": 0.43035561137005984, + "learning_rate": 2.27692854603626e-05, + "loss": 0.4071, + "mean_token_accuracy": 0.859384971857071, + "step": 3690 + }, + { + "epoch": 1.7730326295585412, + "grad_norm": 0.38721362538552495, + "learning_rate": 2.2724848915748313e-05, + "loss": 0.3933, + "mean_token_accuracy": 0.863738352060318, + "step": 3695 + }, + { + "epoch": 1.7754318618042226, + "grad_norm": 0.4132259137622107, + "learning_rate": 2.268041237113402e-05, + "loss": 0.4064, + "mean_token_accuracy": 0.8590145349502564, + "step": 3700 + }, + { + "epoch": 1.777831094049904, + "grad_norm": 0.4299568515616119, + "learning_rate": 2.263597582651973e-05, + "loss": 0.3928, + "mean_token_accuracy": 0.863865715265274, + "step": 3705 + }, + { + "epoch": 1.7802303262955854, + "grad_norm": 2.9581425527056684, + "learning_rate": 2.2591539281905442e-05, + "loss": 0.3953, + "mean_token_accuracy": 0.8633383333683013, + "step": 3710 + }, + { + "epoch": 1.7826295585412668, + "grad_norm": 0.422515346351166, + "learning_rate": 2.2547102737291147e-05, + "loss": 0.4013, + "mean_token_accuracy": 0.8613436758518219, + "step": 3715 + }, + { + "epoch": 1.7850287907869482, + "grad_norm": 0.3982732965857703, + "learning_rate": 2.250266619267686e-05, + "loss": 0.3903, + "mean_token_accuracy": 0.8644165217876434, + "step": 3720 + }, + { + "epoch": 1.7874280230326296, + "grad_norm": 0.40233117601112617, + "learning_rate": 2.2458229648062567e-05, + "loss": 0.3915, + "mean_token_accuracy": 0.8637703895568848, + "step": 3725 + }, + { + "epoch": 1.789827255278311, + "grad_norm": 0.4025965574518156, + "learning_rate": 2.2413793103448276e-05, + "loss": 0.3855, + "mean_token_accuracy": 0.8658525586128235, + "step": 3730 + }, + { + "epoch": 1.7922264875239922, + "grad_norm": 0.45149891362450206, + "learning_rate": 2.2369356558833988e-05, + "loss": 0.4104, + "mean_token_accuracy": 0.8584145903587341, + "step": 3735 + }, + { + "epoch": 1.7946257197696736, + "grad_norm": 0.4391967468837121, + "learning_rate": 2.2324920014219693e-05, + "loss": 0.4004, + "mean_token_accuracy": 0.8613960266113281, + "step": 3740 + }, + { + "epoch": 1.797024952015355, + "grad_norm": 0.3780249057947582, + "learning_rate": 2.2280483469605405e-05, + "loss": 0.385, + "mean_token_accuracy": 0.8659556925296783, + "step": 3745 + }, + { + "epoch": 1.7994241842610363, + "grad_norm": 0.4164461266316941, + "learning_rate": 2.2236046924991113e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.864171975851059, + "step": 3750 + }, + { + "epoch": 1.8018234165067177, + "grad_norm": 0.419648458986139, + "learning_rate": 2.2191610380376825e-05, + "loss": 0.402, + "mean_token_accuracy": 0.8604326784610749, + "step": 3755 + }, + { + "epoch": 1.8042226487523991, + "grad_norm": 0.40472603981661803, + "learning_rate": 2.2147173835762533e-05, + "loss": 0.3967, + "mean_token_accuracy": 0.8622312307357788, + "step": 3760 + }, + { + "epoch": 1.8066218809980805, + "grad_norm": 0.38537513710763316, + "learning_rate": 2.210273729114824e-05, + "loss": 0.3843, + "mean_token_accuracy": 0.8659494280815124, + "step": 3765 + }, + { + "epoch": 1.809021113243762, + "grad_norm": 0.40414961296090346, + "learning_rate": 2.205830074653395e-05, + "loss": 0.3896, + "mean_token_accuracy": 0.8649540543556213, + "step": 3770 + }, + { + "epoch": 1.8114203454894433, + "grad_norm": 0.40323221984828034, + "learning_rate": 2.201386420191966e-05, + "loss": 0.4042, + "mean_token_accuracy": 0.8601405024528503, + "step": 3775 + }, + { + "epoch": 1.8138195777351247, + "grad_norm": 0.398941675227608, + "learning_rate": 2.196942765730537e-05, + "loss": 0.4121, + "mean_token_accuracy": 0.8570371568202972, + "step": 3780 + }, + { + "epoch": 1.816218809980806, + "grad_norm": 0.37972257074705956, + "learning_rate": 2.192499111269108e-05, + "loss": 0.3927, + "mean_token_accuracy": 0.863745391368866, + "step": 3785 + }, + { + "epoch": 1.8186180422264875, + "grad_norm": 0.40411822515933343, + "learning_rate": 2.1880554568076787e-05, + "loss": 0.3853, + "mean_token_accuracy": 0.8656267642974853, + "step": 3790 + }, + { + "epoch": 1.8210172744721689, + "grad_norm": 0.3860353548447939, + "learning_rate": 2.1836118023462496e-05, + "loss": 0.4031, + "mean_token_accuracy": 0.8604975342750549, + "step": 3795 + }, + { + "epoch": 1.8234165067178503, + "grad_norm": 0.3993705247036203, + "learning_rate": 2.1791681478848204e-05, + "loss": 0.4001, + "mean_token_accuracy": 0.8609272360801696, + "step": 3800 + }, + { + "epoch": 1.8258157389635317, + "grad_norm": 0.4107616220909192, + "learning_rate": 2.1747244934233916e-05, + "loss": 0.3898, + "mean_token_accuracy": 0.8649306237697602, + "step": 3805 + }, + { + "epoch": 1.828214971209213, + "grad_norm": 0.42130544815400517, + "learning_rate": 2.1702808389619625e-05, + "loss": 0.3922, + "mean_token_accuracy": 0.8640716016292572, + "step": 3810 + }, + { + "epoch": 1.8306142034548945, + "grad_norm": 0.39983926601339675, + "learning_rate": 2.1658371845005333e-05, + "loss": 0.3876, + "mean_token_accuracy": 0.8645726799964905, + "step": 3815 + }, + { + "epoch": 1.8330134357005758, + "grad_norm": 0.43776571460399505, + "learning_rate": 2.161393530039104e-05, + "loss": 0.4095, + "mean_token_accuracy": 0.8584106743335724, + "step": 3820 + }, + { + "epoch": 1.8354126679462572, + "grad_norm": 0.398550236871868, + "learning_rate": 2.156949875577675e-05, + "loss": 0.3824, + "mean_token_accuracy": 0.8671510696411133, + "step": 3825 + }, + { + "epoch": 1.8378119001919386, + "grad_norm": 0.3929981521422064, + "learning_rate": 2.1525062211162462e-05, + "loss": 0.3909, + "mean_token_accuracy": 0.864390742778778, + "step": 3830 + }, + { + "epoch": 1.84021113243762, + "grad_norm": 0.40038946784997037, + "learning_rate": 2.148062566654817e-05, + "loss": 0.3879, + "mean_token_accuracy": 0.8649435222148896, + "step": 3835 + }, + { + "epoch": 1.8426103646833014, + "grad_norm": 0.4041167948196589, + "learning_rate": 2.1436189121933882e-05, + "loss": 0.3923, + "mean_token_accuracy": 0.863636976480484, + "step": 3840 + }, + { + "epoch": 1.8450095969289828, + "grad_norm": 0.40739126242654766, + "learning_rate": 2.1391752577319587e-05, + "loss": 0.3835, + "mean_token_accuracy": 0.8662947714328766, + "step": 3845 + }, + { + "epoch": 1.8474088291746642, + "grad_norm": 0.3818901707442913, + "learning_rate": 2.1347316032705296e-05, + "loss": 0.3974, + "mean_token_accuracy": 0.8623984336853028, + "step": 3850 + }, + { + "epoch": 1.8498080614203456, + "grad_norm": 0.4122122337260937, + "learning_rate": 2.1302879488091008e-05, + "loss": 0.3988, + "mean_token_accuracy": 0.8618648052215576, + "step": 3855 + }, + { + "epoch": 1.852207293666027, + "grad_norm": 0.3717712293387995, + "learning_rate": 2.1258442943476716e-05, + "loss": 0.3987, + "mean_token_accuracy": 0.8617163598537445, + "step": 3860 + }, + { + "epoch": 1.8546065259117084, + "grad_norm": 0.3947013818989466, + "learning_rate": 2.1214006398862428e-05, + "loss": 0.3855, + "mean_token_accuracy": 0.86544628739357, + "step": 3865 + }, + { + "epoch": 1.8570057581573898, + "grad_norm": 0.39602921518954387, + "learning_rate": 2.1169569854248133e-05, + "loss": 0.388, + "mean_token_accuracy": 0.8647743582725524, + "step": 3870 + }, + { + "epoch": 1.8594049904030712, + "grad_norm": 0.40195445490269366, + "learning_rate": 2.1125133309633845e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.8621335804462433, + "step": 3875 + }, + { + "epoch": 1.8618042226487526, + "grad_norm": 0.4440168571579167, + "learning_rate": 2.1080696765019553e-05, + "loss": 0.3855, + "mean_token_accuracy": 0.8659838140010834, + "step": 3880 + }, + { + "epoch": 1.8642034548944337, + "grad_norm": 0.3940893324231355, + "learning_rate": 2.1036260220405262e-05, + "loss": 0.3947, + "mean_token_accuracy": 0.8628605663776397, + "step": 3885 + }, + { + "epoch": 1.8666026871401151, + "grad_norm": 0.3930058952084085, + "learning_rate": 2.0991823675790974e-05, + "loss": 0.3828, + "mean_token_accuracy": 0.866622906923294, + "step": 3890 + }, + { + "epoch": 1.8690019193857965, + "grad_norm": 0.3749207822841214, + "learning_rate": 2.094738713117668e-05, + "loss": 0.3774, + "mean_token_accuracy": 0.8686128854751587, + "step": 3895 + }, + { + "epoch": 1.871401151631478, + "grad_norm": 0.3858339851657635, + "learning_rate": 2.090295058656239e-05, + "loss": 0.3888, + "mean_token_accuracy": 0.8645212113857269, + "step": 3900 + }, + { + "epoch": 1.8738003838771593, + "grad_norm": 0.3783986491624493, + "learning_rate": 2.08585140419481e-05, + "loss": 0.39, + "mean_token_accuracy": 0.8643775939941406, + "step": 3905 + }, + { + "epoch": 1.8761996161228407, + "grad_norm": 0.42063415379891794, + "learning_rate": 2.0814077497333808e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.8639869809150695, + "step": 3910 + }, + { + "epoch": 1.878598848368522, + "grad_norm": 0.40149079913253105, + "learning_rate": 2.076964095271952e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.8618444919586181, + "step": 3915 + }, + { + "epoch": 1.8809980806142035, + "grad_norm": 0.39434822315885765, + "learning_rate": 2.0725204408105224e-05, + "loss": 0.3986, + "mean_token_accuracy": 0.8615225970745086, + "step": 3920 + }, + { + "epoch": 1.8833973128598849, + "grad_norm": 0.39080023400068203, + "learning_rate": 2.0680767863490936e-05, + "loss": 0.3931, + "mean_token_accuracy": 0.8635000646114349, + "step": 3925 + }, + { + "epoch": 1.8857965451055663, + "grad_norm": 0.41835303330036994, + "learning_rate": 2.0636331318876645e-05, + "loss": 0.3902, + "mean_token_accuracy": 0.8642172873020172, + "step": 3930 + }, + { + "epoch": 1.8881957773512474, + "grad_norm": 0.40460172529599076, + "learning_rate": 2.0591894774262353e-05, + "loss": 0.3838, + "mean_token_accuracy": 0.8665140569210052, + "step": 3935 + }, + { + "epoch": 1.8905950095969288, + "grad_norm": 0.40767639010951, + "learning_rate": 2.0547458229648065e-05, + "loss": 0.3914, + "mean_token_accuracy": 0.8642727673053742, + "step": 3940 + }, + { + "epoch": 1.8929942418426102, + "grad_norm": 0.3924246672999069, + "learning_rate": 2.050302168503377e-05, + "loss": 0.3876, + "mean_token_accuracy": 0.8654048681259155, + "step": 3945 + }, + { + "epoch": 1.8953934740882916, + "grad_norm": 0.39817152067892664, + "learning_rate": 2.0458585140419482e-05, + "loss": 0.4026, + "mean_token_accuracy": 0.8603006482124329, + "step": 3950 + }, + { + "epoch": 1.897792706333973, + "grad_norm": 0.40930635461715104, + "learning_rate": 2.041414859580519e-05, + "loss": 0.3962, + "mean_token_accuracy": 0.8623538970947265, + "step": 3955 + }, + { + "epoch": 1.9001919385796544, + "grad_norm": 0.39188090083012994, + "learning_rate": 2.03697120511909e-05, + "loss": 0.3873, + "mean_token_accuracy": 0.8655392467975617, + "step": 3960 + }, + { + "epoch": 1.9025911708253358, + "grad_norm": 0.3872694109948547, + "learning_rate": 2.032527550657661e-05, + "loss": 0.3803, + "mean_token_accuracy": 0.8677206397056579, + "step": 3965 + }, + { + "epoch": 1.9049904030710172, + "grad_norm": 0.38762477385935795, + "learning_rate": 2.028083896196232e-05, + "loss": 0.392, + "mean_token_accuracy": 0.8632125437259675, + "step": 3970 + }, + { + "epoch": 1.9073896353166986, + "grad_norm": 0.38142115013351746, + "learning_rate": 2.0236402417348028e-05, + "loss": 0.3987, + "mean_token_accuracy": 0.8616180002689362, + "step": 3975 + }, + { + "epoch": 1.90978886756238, + "grad_norm": 0.37033083914918996, + "learning_rate": 2.0191965872733736e-05, + "loss": 0.3943, + "mean_token_accuracy": 0.8636281967163086, + "step": 3980 + }, + { + "epoch": 1.9121880998080614, + "grad_norm": 0.3953126160258497, + "learning_rate": 2.0147529328119448e-05, + "loss": 0.3836, + "mean_token_accuracy": 0.8667151033878326, + "step": 3985 + }, + { + "epoch": 1.9145873320537428, + "grad_norm": 0.39082658638737106, + "learning_rate": 2.0103092783505157e-05, + "loss": 0.3824, + "mean_token_accuracy": 0.8666440069675445, + "step": 3990 + }, + { + "epoch": 1.9169865642994242, + "grad_norm": 0.4236031781517909, + "learning_rate": 2.0058656238890865e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.8597177863121033, + "step": 3995 + }, + { + "epoch": 1.9193857965451055, + "grad_norm": 0.3914701005030144, + "learning_rate": 2.0014219694276573e-05, + "loss": 0.3926, + "mean_token_accuracy": 0.863865715265274, + "step": 4000 + }, + { + "epoch": 1.921785028790787, + "grad_norm": 0.4487506346824904, + "learning_rate": 1.9969783149662282e-05, + "loss": 0.391, + "mean_token_accuracy": 0.863703978061676, + "step": 4005 + }, + { + "epoch": 1.9241842610364683, + "grad_norm": 0.39004399723283967, + "learning_rate": 1.9925346605047994e-05, + "loss": 0.3873, + "mean_token_accuracy": 0.8656900286674499, + "step": 4010 + }, + { + "epoch": 1.9265834932821497, + "grad_norm": 0.41669329574673414, + "learning_rate": 1.9880910060433702e-05, + "loss": 0.4008, + "mean_token_accuracy": 0.8607615292072296, + "step": 4015 + }, + { + "epoch": 1.9289827255278311, + "grad_norm": 0.4374851172479612, + "learning_rate": 1.983647351581941e-05, + "loss": 0.3861, + "mean_token_accuracy": 0.8661767899990082, + "step": 4020 + }, + { + "epoch": 1.9313819577735125, + "grad_norm": 0.4107608827328522, + "learning_rate": 1.979203697120512e-05, + "loss": 0.3977, + "mean_token_accuracy": 0.8626461088657379, + "step": 4025 + }, + { + "epoch": 1.933781190019194, + "grad_norm": 0.3816761081097792, + "learning_rate": 1.9747600426590828e-05, + "loss": 0.3895, + "mean_token_accuracy": 0.8643610537052154, + "step": 4030 + }, + { + "epoch": 1.9361804222648753, + "grad_norm": 0.3900662876650821, + "learning_rate": 1.970316388197654e-05, + "loss": 0.3897, + "mean_token_accuracy": 0.8647509276866913, + "step": 4035 + }, + { + "epoch": 1.9385796545105567, + "grad_norm": 0.38289370485330754, + "learning_rate": 1.9658727337362248e-05, + "loss": 0.3727, + "mean_token_accuracy": 0.8701489210128784, + "step": 4040 + }, + { + "epoch": 1.940978886756238, + "grad_norm": 0.39717478282372676, + "learning_rate": 1.9614290792747956e-05, + "loss": 0.3975, + "mean_token_accuracy": 0.8616616666316986, + "step": 4045 + }, + { + "epoch": 1.9433781190019195, + "grad_norm": 0.48163705436697307, + "learning_rate": 1.9569854248133665e-05, + "loss": 0.3902, + "mean_token_accuracy": 0.8639610290527344, + "step": 4050 + }, + { + "epoch": 1.9457773512476009, + "grad_norm": 0.37838363527658575, + "learning_rate": 1.9525417703519373e-05, + "loss": 0.3884, + "mean_token_accuracy": 0.8645766973495483, + "step": 4055 + }, + { + "epoch": 1.9481765834932823, + "grad_norm": 0.38263168857995816, + "learning_rate": 1.9480981158905085e-05, + "loss": 0.3893, + "mean_token_accuracy": 0.8647235751152038, + "step": 4060 + }, + { + "epoch": 1.9505758157389637, + "grad_norm": 0.3930060747239377, + "learning_rate": 1.9436544614290794e-05, + "loss": 0.3869, + "mean_token_accuracy": 0.8648993670940399, + "step": 4065 + }, + { + "epoch": 1.952975047984645, + "grad_norm": 0.39704337841199966, + "learning_rate": 1.9392108069676505e-05, + "loss": 0.4068, + "mean_token_accuracy": 0.8590138375759124, + "step": 4070 + }, + { + "epoch": 1.9553742802303264, + "grad_norm": 0.3879254475886731, + "learning_rate": 1.934767152506221e-05, + "loss": 0.3883, + "mean_token_accuracy": 0.8654634654521942, + "step": 4075 + }, + { + "epoch": 1.9577735124760078, + "grad_norm": 0.38844784689750755, + "learning_rate": 1.930323498044792e-05, + "loss": 0.3798, + "mean_token_accuracy": 0.8672338962554932, + "step": 4080 + }, + { + "epoch": 1.960172744721689, + "grad_norm": 0.37446283650301254, + "learning_rate": 1.925879843583363e-05, + "loss": 0.3788, + "mean_token_accuracy": 0.867630785703659, + "step": 4085 + }, + { + "epoch": 1.9625719769673704, + "grad_norm": 0.4861745002358442, + "learning_rate": 1.921436189121934e-05, + "loss": 0.3884, + "mean_token_accuracy": 0.8643026351928711, + "step": 4090 + }, + { + "epoch": 1.9649712092130518, + "grad_norm": 0.46859343821993776, + "learning_rate": 1.916992534660505e-05, + "loss": 0.3938, + "mean_token_accuracy": 0.8629789352416992, + "step": 4095 + }, + { + "epoch": 1.9673704414587332, + "grad_norm": 0.3704611655318632, + "learning_rate": 1.9125488801990756e-05, + "loss": 0.3926, + "mean_token_accuracy": 0.8637086689472199, + "step": 4100 + }, + { + "epoch": 1.9697696737044146, + "grad_norm": 0.39545534380900776, + "learning_rate": 1.9081052257376468e-05, + "loss": 0.3893, + "mean_token_accuracy": 0.8637082576751709, + "step": 4105 + }, + { + "epoch": 1.972168905950096, + "grad_norm": 0.4112546795437855, + "learning_rate": 1.9036615712762177e-05, + "loss": 0.3933, + "mean_token_accuracy": 0.8629828453063965, + "step": 4110 + }, + { + "epoch": 1.9745681381957774, + "grad_norm": 0.3945209577425338, + "learning_rate": 1.8992179168147885e-05, + "loss": 0.3889, + "mean_token_accuracy": 0.8651759505271912, + "step": 4115 + }, + { + "epoch": 1.9769673704414588, + "grad_norm": 0.4004234809978467, + "learning_rate": 1.8947742623533597e-05, + "loss": 0.3943, + "mean_token_accuracy": 0.8631406724452972, + "step": 4120 + }, + { + "epoch": 1.9793666026871402, + "grad_norm": 0.4002978488761072, + "learning_rate": 1.8903306078919302e-05, + "loss": 0.4068, + "mean_token_accuracy": 0.8592036962509155, + "step": 4125 + }, + { + "epoch": 1.9817658349328215, + "grad_norm": 0.42559417968429664, + "learning_rate": 1.8858869534305014e-05, + "loss": 0.3995, + "mean_token_accuracy": 0.8614804029464722, + "step": 4130 + }, + { + "epoch": 1.9841650671785027, + "grad_norm": 0.4441111127890975, + "learning_rate": 1.8814432989690722e-05, + "loss": 0.3811, + "mean_token_accuracy": 0.8669618248939515, + "step": 4135 + }, + { + "epoch": 1.986564299424184, + "grad_norm": 0.37759355410482454, + "learning_rate": 1.876999644507643e-05, + "loss": 0.3955, + "mean_token_accuracy": 0.8630719184875488, + "step": 4140 + }, + { + "epoch": 1.9889635316698655, + "grad_norm": 0.38851753327004795, + "learning_rate": 1.8725559900462143e-05, + "loss": 0.3895, + "mean_token_accuracy": 0.8648181080818176, + "step": 4145 + }, + { + "epoch": 1.991362763915547, + "grad_norm": 0.39128512330716314, + "learning_rate": 1.8681123355847848e-05, + "loss": 0.3873, + "mean_token_accuracy": 0.8651157855987549, + "step": 4150 + }, + { + "epoch": 1.9937619961612283, + "grad_norm": 0.36186928153286846, + "learning_rate": 1.863668681123356e-05, + "loss": 0.3863, + "mean_token_accuracy": 0.8653822124004364, + "step": 4155 + }, + { + "epoch": 1.9961612284069097, + "grad_norm": 0.3955538160916505, + "learning_rate": 1.8592250266619268e-05, + "loss": 0.392, + "mean_token_accuracy": 0.8637274146080017, + "step": 4160 + }, + { + "epoch": 1.998560460652591, + "grad_norm": 0.43379145511335016, + "learning_rate": 1.8547813722004976e-05, + "loss": 0.3956, + "mean_token_accuracy": 0.8617413640022278, + "step": 4165 + }, + { + "epoch": 2.0009596928982725, + "grad_norm": 0.4990607339389488, + "learning_rate": 1.8503377177390688e-05, + "loss": 0.3482, + "mean_token_accuracy": 0.877407294511795, + "step": 4170 + }, + { + "epoch": 2.003358925143954, + "grad_norm": 0.48774642605648677, + "learning_rate": 1.8458940632776397e-05, + "loss": 0.2789, + "mean_token_accuracy": 0.8987186670303344, + "step": 4175 + }, + { + "epoch": 2.0057581573896353, + "grad_norm": 0.550368833299061, + "learning_rate": 1.8414504088162105e-05, + "loss": 0.2838, + "mean_token_accuracy": 0.8964677393436432, + "step": 4180 + }, + { + "epoch": 2.0081573896353166, + "grad_norm": 0.448658045217398, + "learning_rate": 1.8370067543547814e-05, + "loss": 0.2717, + "mean_token_accuracy": 0.9005320727825165, + "step": 4185 + }, + { + "epoch": 2.010556621880998, + "grad_norm": 0.4571573770888426, + "learning_rate": 1.8325630998933526e-05, + "loss": 0.2718, + "mean_token_accuracy": 0.9007344186306, + "step": 4190 + }, + { + "epoch": 2.0129558541266794, + "grad_norm": 0.4566936280537468, + "learning_rate": 1.8281194454319234e-05, + "loss": 0.2681, + "mean_token_accuracy": 0.9025704741477967, + "step": 4195 + }, + { + "epoch": 2.015355086372361, + "grad_norm": 0.5002619274348096, + "learning_rate": 1.8236757909704942e-05, + "loss": 0.2775, + "mean_token_accuracy": 0.8995882570743561, + "step": 4200 + }, + { + "epoch": 2.017754318618042, + "grad_norm": 0.45520437909491956, + "learning_rate": 1.819232136509065e-05, + "loss": 0.2545, + "mean_token_accuracy": 0.9068623960018158, + "step": 4205 + }, + { + "epoch": 2.0201535508637236, + "grad_norm": 0.44638235220575284, + "learning_rate": 1.814788482047636e-05, + "loss": 0.2755, + "mean_token_accuracy": 0.8995202898979187, + "step": 4210 + }, + { + "epoch": 2.022552783109405, + "grad_norm": 0.4321368591508329, + "learning_rate": 1.810344827586207e-05, + "loss": 0.2634, + "mean_token_accuracy": 0.9036517858505249, + "step": 4215 + }, + { + "epoch": 2.0249520153550864, + "grad_norm": 0.49384643713338167, + "learning_rate": 1.805901173124778e-05, + "loss": 0.283, + "mean_token_accuracy": 0.8976084411144256, + "step": 4220 + }, + { + "epoch": 2.027351247600768, + "grad_norm": 0.46123013151789993, + "learning_rate": 1.8014575186633488e-05, + "loss": 0.274, + "mean_token_accuracy": 0.9010094285011292, + "step": 4225 + }, + { + "epoch": 2.029750479846449, + "grad_norm": 0.46138647274505384, + "learning_rate": 1.7970138642019197e-05, + "loss": 0.2712, + "mean_token_accuracy": 0.9008758366107941, + "step": 4230 + }, + { + "epoch": 2.0321497120921306, + "grad_norm": 1.786349693464167, + "learning_rate": 1.7925702097404905e-05, + "loss": 0.2801, + "mean_token_accuracy": 0.9000257790088654, + "step": 4235 + }, + { + "epoch": 2.034548944337812, + "grad_norm": 0.44348259045087873, + "learning_rate": 1.7881265552790617e-05, + "loss": 0.2786, + "mean_token_accuracy": 0.8985202252864838, + "step": 4240 + }, + { + "epoch": 2.0369481765834934, + "grad_norm": 0.5000759402358198, + "learning_rate": 1.7836829008176325e-05, + "loss": 0.2858, + "mean_token_accuracy": 0.8969248056411743, + "step": 4245 + }, + { + "epoch": 2.0393474088291748, + "grad_norm": 0.448800907729344, + "learning_rate": 1.7792392463562034e-05, + "loss": 0.2738, + "mean_token_accuracy": 0.9001585960388183, + "step": 4250 + }, + { + "epoch": 2.041746641074856, + "grad_norm": 0.4571099322615939, + "learning_rate": 1.7747955918947742e-05, + "loss": 0.2701, + "mean_token_accuracy": 0.9012974858283996, + "step": 4255 + }, + { + "epoch": 2.0441458733205375, + "grad_norm": 0.4642114394657761, + "learning_rate": 1.770351937433345e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.9021126329898834, + "step": 4260 + }, + { + "epoch": 2.046545105566219, + "grad_norm": 0.49976082645529357, + "learning_rate": 1.7659082829719163e-05, + "loss": 0.2798, + "mean_token_accuracy": 0.8982514500617981, + "step": 4265 + }, + { + "epoch": 2.0489443378119003, + "grad_norm": 0.4379654371585255, + "learning_rate": 1.761464628510487e-05, + "loss": 0.2668, + "mean_token_accuracy": 0.9023063957691193, + "step": 4270 + }, + { + "epoch": 2.0513435700575817, + "grad_norm": 0.5045991197144502, + "learning_rate": 1.7570209740490583e-05, + "loss": 0.2753, + "mean_token_accuracy": 0.8996968567371368, + "step": 4275 + }, + { + "epoch": 2.053742802303263, + "grad_norm": 0.4561539463036029, + "learning_rate": 1.7525773195876288e-05, + "loss": 0.2568, + "mean_token_accuracy": 0.906041008234024, + "step": 4280 + }, + { + "epoch": 2.0561420345489445, + "grad_norm": 0.5058292980069206, + "learning_rate": 1.7481336651261996e-05, + "loss": 0.2723, + "mean_token_accuracy": 0.9003937780857086, + "step": 4285 + }, + { + "epoch": 2.058541266794626, + "grad_norm": 0.4372928580482913, + "learning_rate": 1.743690010664771e-05, + "loss": 0.2803, + "mean_token_accuracy": 0.8972904622554779, + "step": 4290 + }, + { + "epoch": 2.0609404990403073, + "grad_norm": 0.4757830344041431, + "learning_rate": 1.7392463562033417e-05, + "loss": 0.2626, + "mean_token_accuracy": 0.9038049280643463, + "step": 4295 + }, + { + "epoch": 2.0633397312859887, + "grad_norm": 0.46326509822644846, + "learning_rate": 1.734802701741913e-05, + "loss": 0.2713, + "mean_token_accuracy": 0.9007812976837158, + "step": 4300 + }, + { + "epoch": 2.06573896353167, + "grad_norm": 0.43282465527513386, + "learning_rate": 1.7303590472804834e-05, + "loss": 0.2618, + "mean_token_accuracy": 0.9037814855575561, + "step": 4305 + }, + { + "epoch": 2.068138195777351, + "grad_norm": 0.47292568561468, + "learning_rate": 1.7259153928190546e-05, + "loss": 0.2741, + "mean_token_accuracy": 0.8997789025306702, + "step": 4310 + }, + { + "epoch": 2.0705374280230324, + "grad_norm": 0.47569236672519766, + "learning_rate": 1.7214717383576254e-05, + "loss": 0.2634, + "mean_token_accuracy": 0.9033852636814117, + "step": 4315 + }, + { + "epoch": 2.072936660268714, + "grad_norm": 0.48951867821674516, + "learning_rate": 1.7170280838961963e-05, + "loss": 0.2733, + "mean_token_accuracy": 0.9004883110523224, + "step": 4320 + }, + { + "epoch": 2.075335892514395, + "grad_norm": 0.473035784375584, + "learning_rate": 1.7125844294347674e-05, + "loss": 0.2699, + "mean_token_accuracy": 0.9013882339000702, + "step": 4325 + }, + { + "epoch": 2.0777351247600766, + "grad_norm": 0.46053056456143077, + "learning_rate": 1.708140774973338e-05, + "loss": 0.2544, + "mean_token_accuracy": 0.9071746647357941, + "step": 4330 + }, + { + "epoch": 2.080134357005758, + "grad_norm": 0.42859421990106655, + "learning_rate": 1.703697120511909e-05, + "loss": 0.2674, + "mean_token_accuracy": 0.9021602928638458, + "step": 4335 + }, + { + "epoch": 2.0825335892514394, + "grad_norm": 0.49007953604178717, + "learning_rate": 1.69925346605048e-05, + "loss": 0.2707, + "mean_token_accuracy": 0.900754737854004, + "step": 4340 + }, + { + "epoch": 2.084932821497121, + "grad_norm": 4.37790219062138, + "learning_rate": 1.6948098115890508e-05, + "loss": 0.2883, + "mean_token_accuracy": 0.8987155377864837, + "step": 4345 + }, + { + "epoch": 2.087332053742802, + "grad_norm": 0.49261366427321174, + "learning_rate": 1.690366157127622e-05, + "loss": 0.2783, + "mean_token_accuracy": 0.8987280428409576, + "step": 4350 + }, + { + "epoch": 2.0897312859884836, + "grad_norm": 0.42896597289174915, + "learning_rate": 1.685922502666193e-05, + "loss": 0.2713, + "mean_token_accuracy": 0.900896155834198, + "step": 4355 + }, + { + "epoch": 2.092130518234165, + "grad_norm": 0.45450827173150554, + "learning_rate": 1.6814788482047637e-05, + "loss": 0.273, + "mean_token_accuracy": 0.9007102012634277, + "step": 4360 + }, + { + "epoch": 2.0945297504798464, + "grad_norm": 0.4522864152085452, + "learning_rate": 1.6770351937433345e-05, + "loss": 0.2648, + "mean_token_accuracy": 0.9030673801898956, + "step": 4365 + }, + { + "epoch": 2.0969289827255277, + "grad_norm": 0.4633404403314624, + "learning_rate": 1.6725915392819054e-05, + "loss": 0.2781, + "mean_token_accuracy": 0.898474907875061, + "step": 4370 + }, + { + "epoch": 2.099328214971209, + "grad_norm": 0.46662108591262225, + "learning_rate": 1.6681478848204766e-05, + "loss": 0.2832, + "mean_token_accuracy": 0.8965177476406098, + "step": 4375 + }, + { + "epoch": 2.1017274472168905, + "grad_norm": 0.44562361030276315, + "learning_rate": 1.6637042303590474e-05, + "loss": 0.2681, + "mean_token_accuracy": 0.9022032618522644, + "step": 4380 + }, + { + "epoch": 2.104126679462572, + "grad_norm": 0.44918481180735653, + "learning_rate": 1.6592605758976183e-05, + "loss": 0.2642, + "mean_token_accuracy": 0.9034236431121826, + "step": 4385 + }, + { + "epoch": 2.1065259117082533, + "grad_norm": 0.4649130093432506, + "learning_rate": 1.654816921436189e-05, + "loss": 0.2687, + "mean_token_accuracy": 0.9015680313110351, + "step": 4390 + }, + { + "epoch": 2.1089251439539347, + "grad_norm": 0.47144552458086414, + "learning_rate": 1.6503732669747603e-05, + "loss": 0.2643, + "mean_token_accuracy": 0.9031572341918945, + "step": 4395 + }, + { + "epoch": 2.111324376199616, + "grad_norm": 0.45486321477616076, + "learning_rate": 1.645929612513331e-05, + "loss": 0.2732, + "mean_token_accuracy": 0.900248795747757, + "step": 4400 + }, + { + "epoch": 2.1137236084452975, + "grad_norm": 0.4759359634653368, + "learning_rate": 1.641485958051902e-05, + "loss": 0.2682, + "mean_token_accuracy": 0.9020274698734283, + "step": 4405 + }, + { + "epoch": 2.116122840690979, + "grad_norm": 0.4720824801877516, + "learning_rate": 1.637042303590473e-05, + "loss": 0.2701, + "mean_token_accuracy": 0.9012477397918701, + "step": 4410 + }, + { + "epoch": 2.1185220729366603, + "grad_norm": 0.45178559782138805, + "learning_rate": 1.6325986491290437e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.9030517518520356, + "step": 4415 + }, + { + "epoch": 2.1209213051823417, + "grad_norm": 0.4477863492661275, + "learning_rate": 1.628154994667615e-05, + "loss": 0.2702, + "mean_token_accuracy": 0.9010055363178253, + "step": 4420 + }, + { + "epoch": 2.123320537428023, + "grad_norm": 0.4617479446804104, + "learning_rate": 1.6237113402061857e-05, + "loss": 0.2678, + "mean_token_accuracy": 0.9017586946487427, + "step": 4425 + }, + { + "epoch": 2.1257197696737045, + "grad_norm": 0.468851911945899, + "learning_rate": 1.6192676857447566e-05, + "loss": 0.2687, + "mean_token_accuracy": 0.9018571496009826, + "step": 4430 + }, + { + "epoch": 2.128119001919386, + "grad_norm": 0.4565575005949691, + "learning_rate": 1.6148240312833274e-05, + "loss": 0.2615, + "mean_token_accuracy": 0.9041065096855163, + "step": 4435 + }, + { + "epoch": 2.1305182341650672, + "grad_norm": 0.46814377259524864, + "learning_rate": 1.6103803768218983e-05, + "loss": 0.2743, + "mean_token_accuracy": 0.899460905790329, + "step": 4440 + }, + { + "epoch": 2.1329174664107486, + "grad_norm": 0.48726206079260836, + "learning_rate": 1.6059367223604694e-05, + "loss": 0.2626, + "mean_token_accuracy": 0.9039096117019654, + "step": 4445 + }, + { + "epoch": 2.13531669865643, + "grad_norm": 0.4420924963462651, + "learning_rate": 1.6014930678990403e-05, + "loss": 0.2765, + "mean_token_accuracy": 0.8989108800888062, + "step": 4450 + }, + { + "epoch": 2.1377159309021114, + "grad_norm": 0.4665040262296594, + "learning_rate": 1.597049413437611e-05, + "loss": 0.2746, + "mean_token_accuracy": 0.9000250041484833, + "step": 4455 + }, + { + "epoch": 2.140115163147793, + "grad_norm": 0.4106007778508939, + "learning_rate": 1.592605758976182e-05, + "loss": 0.2679, + "mean_token_accuracy": 0.9022345006465912, + "step": 4460 + }, + { + "epoch": 2.142514395393474, + "grad_norm": 0.4861737684759091, + "learning_rate": 1.5881621045147528e-05, + "loss": 0.2821, + "mean_token_accuracy": 0.8968492388725281, + "step": 4465 + }, + { + "epoch": 2.1449136276391556, + "grad_norm": 0.4714941781640915, + "learning_rate": 1.583718450053324e-05, + "loss": 0.2724, + "mean_token_accuracy": 0.9004875302314759, + "step": 4470 + }, + { + "epoch": 2.147312859884837, + "grad_norm": 0.47685731170785883, + "learning_rate": 1.579274795591895e-05, + "loss": 0.2658, + "mean_token_accuracy": 0.9028298735618592, + "step": 4475 + }, + { + "epoch": 2.1497120921305184, + "grad_norm": 0.48610023240176986, + "learning_rate": 1.5748311411304657e-05, + "loss": 0.272, + "mean_token_accuracy": 0.9006937980651856, + "step": 4480 + }, + { + "epoch": 2.1521113243762, + "grad_norm": 0.4710164843150768, + "learning_rate": 1.5703874866690366e-05, + "loss": 0.2683, + "mean_token_accuracy": 0.9017013192176819, + "step": 4485 + }, + { + "epoch": 2.154510556621881, + "grad_norm": 0.47511883246964265, + "learning_rate": 1.5659438322076074e-05, + "loss": 0.2725, + "mean_token_accuracy": 0.9000279366970062, + "step": 4490 + }, + { + "epoch": 2.1569097888675626, + "grad_norm": 0.4775014098408969, + "learning_rate": 1.5615001777461786e-05, + "loss": 0.2827, + "mean_token_accuracy": 0.896709942817688, + "step": 4495 + }, + { + "epoch": 2.159309021113244, + "grad_norm": 0.43665975325988027, + "learning_rate": 1.5570565232847494e-05, + "loss": 0.2755, + "mean_token_accuracy": 0.8993097960948944, + "step": 4500 + }, + { + "epoch": 2.161708253358925, + "grad_norm": 0.44223033656031446, + "learning_rate": 1.5526128688233206e-05, + "loss": 0.2651, + "mean_token_accuracy": 0.9031178951263428, + "step": 4505 + }, + { + "epoch": 2.1641074856046068, + "grad_norm": 0.4646989173708562, + "learning_rate": 1.548169214361891e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.9026728272438049, + "step": 4510 + }, + { + "epoch": 2.1665067178502877, + "grad_norm": 0.45611006296440343, + "learning_rate": 1.543725559900462e-05, + "loss": 0.2733, + "mean_token_accuracy": 0.9000039041042328, + "step": 4515 + }, + { + "epoch": 2.168905950095969, + "grad_norm": 0.4910530067649774, + "learning_rate": 1.539281905439033e-05, + "loss": 0.2728, + "mean_token_accuracy": 0.9004085659980774, + "step": 4520 + }, + { + "epoch": 2.1713051823416505, + "grad_norm": 0.4902817246313605, + "learning_rate": 1.534838250977604e-05, + "loss": 0.2701, + "mean_token_accuracy": 0.9008141100406647, + "step": 4525 + }, + { + "epoch": 2.173704414587332, + "grad_norm": 0.4519098840938565, + "learning_rate": 1.5303945965161752e-05, + "loss": 0.275, + "mean_token_accuracy": 0.899957025051117, + "step": 4530 + }, + { + "epoch": 2.1761036468330133, + "grad_norm": 0.48915278087110464, + "learning_rate": 1.5259509420547457e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.9028978407382965, + "step": 4535 + }, + { + "epoch": 2.1785028790786947, + "grad_norm": 0.45723186309703706, + "learning_rate": 1.5215072875933167e-05, + "loss": 0.2718, + "mean_token_accuracy": 0.9006772577762604, + "step": 4540 + }, + { + "epoch": 2.180902111324376, + "grad_norm": 0.4353868793708082, + "learning_rate": 1.5170636331318877e-05, + "loss": 0.2694, + "mean_token_accuracy": 0.9016266703605652, + "step": 4545 + }, + { + "epoch": 2.1833013435700575, + "grad_norm": 0.4600745274735988, + "learning_rate": 1.5126199786704587e-05, + "loss": 0.2704, + "mean_token_accuracy": 0.901211804151535, + "step": 4550 + }, + { + "epoch": 2.185700575815739, + "grad_norm": 0.46905406216354795, + "learning_rate": 1.5081763242090296e-05, + "loss": 0.2721, + "mean_token_accuracy": 0.900727391242981, + "step": 4555 + }, + { + "epoch": 2.1880998080614202, + "grad_norm": 0.46220184604712333, + "learning_rate": 1.5037326697476006e-05, + "loss": 0.2727, + "mean_token_accuracy": 0.9003375172615051, + "step": 4560 + }, + { + "epoch": 2.1904990403071016, + "grad_norm": 0.4828821698318418, + "learning_rate": 1.4992890152861713e-05, + "loss": 0.2708, + "mean_token_accuracy": 0.9010523974895477, + "step": 4565 + }, + { + "epoch": 2.192898272552783, + "grad_norm": 0.4640884012585103, + "learning_rate": 1.4948453608247423e-05, + "loss": 0.2638, + "mean_token_accuracy": 0.9035166263580322, + "step": 4570 + }, + { + "epoch": 2.1952975047984644, + "grad_norm": 0.47179862655018706, + "learning_rate": 1.4904017063633133e-05, + "loss": 0.2713, + "mean_token_accuracy": 0.9010164678096771, + "step": 4575 + }, + { + "epoch": 2.197696737044146, + "grad_norm": 0.46938698374728643, + "learning_rate": 1.4859580519018843e-05, + "loss": 0.2772, + "mean_token_accuracy": 0.8985358417034149, + "step": 4580 + }, + { + "epoch": 2.200095969289827, + "grad_norm": 0.6409860361891765, + "learning_rate": 1.4815143974404552e-05, + "loss": 0.2689, + "mean_token_accuracy": 0.9022345066070556, + "step": 4585 + }, + { + "epoch": 2.2024952015355086, + "grad_norm": 0.481090158096422, + "learning_rate": 1.4770707429790258e-05, + "loss": 0.2816, + "mean_token_accuracy": 0.8974818766117096, + "step": 4590 + }, + { + "epoch": 2.20489443378119, + "grad_norm": 0.46329506596417347, + "learning_rate": 1.4726270885175969e-05, + "loss": 0.278, + "mean_token_accuracy": 0.8984702050685882, + "step": 4595 + }, + { + "epoch": 2.2072936660268714, + "grad_norm": 0.4615303070585285, + "learning_rate": 1.4681834340561679e-05, + "loss": 0.2682, + "mean_token_accuracy": 0.9019219934940338, + "step": 4600 + }, + { + "epoch": 2.2096928982725528, + "grad_norm": 0.46163618974055615, + "learning_rate": 1.4637397795947389e-05, + "loss": 0.2649, + "mean_token_accuracy": 0.9029759705066681, + "step": 4605 + }, + { + "epoch": 2.212092130518234, + "grad_norm": 0.4882515005514511, + "learning_rate": 1.4592961251333097e-05, + "loss": 0.277, + "mean_token_accuracy": 0.8990098178386688, + "step": 4610 + }, + { + "epoch": 2.2144913627639156, + "grad_norm": 0.4646349415807869, + "learning_rate": 1.4548524706718806e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.9031439483165741, + "step": 4615 + }, + { + "epoch": 2.216890595009597, + "grad_norm": 0.44449725057095435, + "learning_rate": 1.4504088162104514e-05, + "loss": 0.2684, + "mean_token_accuracy": 0.9020173132419587, + "step": 4620 + }, + { + "epoch": 2.2192898272552783, + "grad_norm": 0.4656768176749509, + "learning_rate": 1.4459651617490225e-05, + "loss": 0.2751, + "mean_token_accuracy": 0.8993194878101349, + "step": 4625 + }, + { + "epoch": 2.2216890595009597, + "grad_norm": 0.4615214335096717, + "learning_rate": 1.4415215072875935e-05, + "loss": 0.2676, + "mean_token_accuracy": 0.9016627192497253, + "step": 4630 + }, + { + "epoch": 2.224088291746641, + "grad_norm": 0.46166194098937074, + "learning_rate": 1.4370778528261645e-05, + "loss": 0.2761, + "mean_token_accuracy": 0.8995867013931275, + "step": 4635 + }, + { + "epoch": 2.2264875239923225, + "grad_norm": 0.4416530636695825, + "learning_rate": 1.4326341983647352e-05, + "loss": 0.2786, + "mean_token_accuracy": 0.8981483101844787, + "step": 4640 + }, + { + "epoch": 2.228886756238004, + "grad_norm": 0.46476362069241783, + "learning_rate": 1.428190543903306e-05, + "loss": 0.2661, + "mean_token_accuracy": 0.9023899972438812, + "step": 4645 + }, + { + "epoch": 2.2312859884836853, + "grad_norm": 0.4642652248450991, + "learning_rate": 1.423746889441877e-05, + "loss": 0.2717, + "mean_token_accuracy": 0.9007234871387482, + "step": 4650 + }, + { + "epoch": 2.2336852207293667, + "grad_norm": 0.4565894514180209, + "learning_rate": 1.419303234980448e-05, + "loss": 0.2732, + "mean_token_accuracy": 0.9001586079597473, + "step": 4655 + }, + { + "epoch": 2.236084452975048, + "grad_norm": 0.45828103387435565, + "learning_rate": 1.414859580519019e-05, + "loss": 0.2727, + "mean_token_accuracy": 0.9002468883991241, + "step": 4660 + }, + { + "epoch": 2.2384836852207295, + "grad_norm": 0.4371651994660012, + "learning_rate": 1.4104159260575897e-05, + "loss": 0.2649, + "mean_token_accuracy": 0.9028993964195251, + "step": 4665 + }, + { + "epoch": 2.240882917466411, + "grad_norm": 0.45449934507751794, + "learning_rate": 1.4059722715961607e-05, + "loss": 0.2809, + "mean_token_accuracy": 0.8974053084850311, + "step": 4670 + }, + { + "epoch": 2.2432821497120923, + "grad_norm": 0.49571433936890613, + "learning_rate": 1.4015286171347316e-05, + "loss": 0.2851, + "mean_token_accuracy": 0.8978420555591583, + "step": 4675 + }, + { + "epoch": 2.2456813819577737, + "grad_norm": 0.4795958379989915, + "learning_rate": 1.3970849626733026e-05, + "loss": 0.2647, + "mean_token_accuracy": 0.9032507300376892, + "step": 4680 + }, + { + "epoch": 2.248080614203455, + "grad_norm": 0.45039732495232765, + "learning_rate": 1.3926413082118736e-05, + "loss": 0.2694, + "mean_token_accuracy": 0.9013946115970611, + "step": 4685 + }, + { + "epoch": 2.2504798464491365, + "grad_norm": 0.44596801571533046, + "learning_rate": 1.3881976537504443e-05, + "loss": 0.27, + "mean_token_accuracy": 0.9010569810867309, + "step": 4690 + }, + { + "epoch": 2.252879078694818, + "grad_norm": 0.4571665567266272, + "learning_rate": 1.3837539992890153e-05, + "loss": 0.2639, + "mean_token_accuracy": 0.9037930130958557, + "step": 4695 + }, + { + "epoch": 2.255278310940499, + "grad_norm": 0.4525712169800606, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.2639, + "mean_token_accuracy": 0.9033798992633819, + "step": 4700 + }, + { + "epoch": 2.2576775431861806, + "grad_norm": 0.4594097941262841, + "learning_rate": 1.3748666903661572e-05, + "loss": 0.2574, + "mean_token_accuracy": 0.9055361032485962, + "step": 4705 + }, + { + "epoch": 2.2600767754318616, + "grad_norm": 0.47646345360050185, + "learning_rate": 1.3704230359047282e-05, + "loss": 0.2716, + "mean_token_accuracy": 0.9010154664516449, + "step": 4710 + }, + { + "epoch": 2.2624760076775434, + "grad_norm": 0.49498838567379894, + "learning_rate": 1.3659793814432989e-05, + "loss": 0.2636, + "mean_token_accuracy": 0.9037136912345887, + "step": 4715 + }, + { + "epoch": 2.2648752399232244, + "grad_norm": 0.4790896027279024, + "learning_rate": 1.3615357269818699e-05, + "loss": 0.269, + "mean_token_accuracy": 0.9017230927944183, + "step": 4720 + }, + { + "epoch": 2.2672744721689058, + "grad_norm": 0.48992595431942215, + "learning_rate": 1.3570920725204409e-05, + "loss": 0.2746, + "mean_token_accuracy": 0.8995851278305054, + "step": 4725 + }, + { + "epoch": 2.269673704414587, + "grad_norm": 0.5359200002174315, + "learning_rate": 1.3526484180590117e-05, + "loss": 0.2669, + "mean_token_accuracy": 0.9026368260383606, + "step": 4730 + }, + { + "epoch": 2.2720729366602685, + "grad_norm": 0.4496883562391663, + "learning_rate": 1.3482047635975828e-05, + "loss": 0.2643, + "mean_token_accuracy": 0.903408020734787, + "step": 4735 + }, + { + "epoch": 2.27447216890595, + "grad_norm": 0.4589288131283035, + "learning_rate": 1.3437611091361538e-05, + "loss": 0.2764, + "mean_token_accuracy": 0.8986647486686706, + "step": 4740 + }, + { + "epoch": 2.2768714011516313, + "grad_norm": 0.5034973762490947, + "learning_rate": 1.3393174546747245e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.9032142698764801, + "step": 4745 + }, + { + "epoch": 2.2792706333973127, + "grad_norm": 0.47904037777632885, + "learning_rate": 1.3348738002132955e-05, + "loss": 0.2573, + "mean_token_accuracy": 0.9054487824440003, + "step": 4750 + }, + { + "epoch": 2.281669865642994, + "grad_norm": 0.4503339806395172, + "learning_rate": 1.3304301457518665e-05, + "loss": 0.2764, + "mean_token_accuracy": 0.8993960499763489, + "step": 4755 + }, + { + "epoch": 2.2840690978886755, + "grad_norm": 0.4492074544524633, + "learning_rate": 1.3259864912904373e-05, + "loss": 0.2613, + "mean_token_accuracy": 0.9042619824409485, + "step": 4760 + }, + { + "epoch": 2.286468330134357, + "grad_norm": 0.46624116200742, + "learning_rate": 1.3215428368290083e-05, + "loss": 0.2743, + "mean_token_accuracy": 0.8995259821414947, + "step": 4765 + }, + { + "epoch": 2.2888675623800383, + "grad_norm": 0.48969471633508643, + "learning_rate": 1.317099182367579e-05, + "loss": 0.2773, + "mean_token_accuracy": 0.8987483620643616, + "step": 4770 + }, + { + "epoch": 2.2912667946257197, + "grad_norm": 0.461439163954989, + "learning_rate": 1.31265552790615e-05, + "loss": 0.2643, + "mean_token_accuracy": 0.9034291326999664, + "step": 4775 + }, + { + "epoch": 2.293666026871401, + "grad_norm": 0.4658475465623233, + "learning_rate": 1.308211873444721e-05, + "loss": 0.2774, + "mean_token_accuracy": 0.8987905502319335, + "step": 4780 + }, + { + "epoch": 2.2960652591170825, + "grad_norm": 0.48408792260508365, + "learning_rate": 1.3037682189832919e-05, + "loss": 0.2753, + "mean_token_accuracy": 0.8993827760219574, + "step": 4785 + }, + { + "epoch": 2.298464491362764, + "grad_norm": 0.4537574035724088, + "learning_rate": 1.299324564521863e-05, + "loss": 0.2699, + "mean_token_accuracy": 0.9014258801937103, + "step": 4790 + }, + { + "epoch": 2.3008637236084453, + "grad_norm": 0.4510379588751342, + "learning_rate": 1.2948809100604336e-05, + "loss": 0.2572, + "mean_token_accuracy": 0.9058745920658111, + "step": 4795 + }, + { + "epoch": 2.3032629558541267, + "grad_norm": 0.49423695496099035, + "learning_rate": 1.2904372555990046e-05, + "loss": 0.2674, + "mean_token_accuracy": 0.9029056489467621, + "step": 4800 + }, + { + "epoch": 2.305662188099808, + "grad_norm": 0.45087124194764544, + "learning_rate": 1.2859936011375756e-05, + "loss": 0.2705, + "mean_token_accuracy": 0.9013524353504181, + "step": 4805 + }, + { + "epoch": 2.3080614203454894, + "grad_norm": 0.45105794906636565, + "learning_rate": 1.2815499466761466e-05, + "loss": 0.258, + "mean_token_accuracy": 0.9049526512622833, + "step": 4810 + }, + { + "epoch": 2.310460652591171, + "grad_norm": 0.4573799419919393, + "learning_rate": 1.2771062922147175e-05, + "loss": 0.2609, + "mean_token_accuracy": 0.9045884072780609, + "step": 4815 + }, + { + "epoch": 2.3128598848368522, + "grad_norm": 0.45047866282195803, + "learning_rate": 1.2726626377532883e-05, + "loss": 0.2661, + "mean_token_accuracy": 0.9023790538311005, + "step": 4820 + }, + { + "epoch": 2.3152591170825336, + "grad_norm": 0.44484306126020534, + "learning_rate": 1.2682189832918592e-05, + "loss": 0.2697, + "mean_token_accuracy": 0.9012295663356781, + "step": 4825 + }, + { + "epoch": 2.317658349328215, + "grad_norm": 0.4570040655210912, + "learning_rate": 1.2637753288304302e-05, + "loss": 0.2716, + "mean_token_accuracy": 0.9005773723125458, + "step": 4830 + }, + { + "epoch": 2.3200575815738964, + "grad_norm": 0.44512252026288296, + "learning_rate": 1.2593316743690012e-05, + "loss": 0.2708, + "mean_token_accuracy": 0.9009211480617523, + "step": 4835 + }, + { + "epoch": 2.322456813819578, + "grad_norm": 0.4647677877596983, + "learning_rate": 1.2548880199075722e-05, + "loss": 0.2654, + "mean_token_accuracy": 0.9026337563991547, + "step": 4840 + }, + { + "epoch": 2.324856046065259, + "grad_norm": 0.4451962255029952, + "learning_rate": 1.2504443654461429e-05, + "loss": 0.2722, + "mean_token_accuracy": 0.901082444190979, + "step": 4845 + }, + { + "epoch": 2.3272552783109406, + "grad_norm": 0.464769462545177, + "learning_rate": 1.2460007109847138e-05, + "loss": 0.2673, + "mean_token_accuracy": 0.9020293533802033, + "step": 4850 + }, + { + "epoch": 2.329654510556622, + "grad_norm": 0.5852191961334264, + "learning_rate": 1.2415570565232848e-05, + "loss": 0.2708, + "mean_token_accuracy": 0.9010414719581604, + "step": 4855 + }, + { + "epoch": 2.3320537428023034, + "grad_norm": 0.4452112718520486, + "learning_rate": 1.2371134020618558e-05, + "loss": 0.267, + "mean_token_accuracy": 0.902037626504898, + "step": 4860 + }, + { + "epoch": 2.3344529750479848, + "grad_norm": 0.4660962453351574, + "learning_rate": 1.2326697476004266e-05, + "loss": 0.2628, + "mean_token_accuracy": 0.903652572631836, + "step": 4865 + }, + { + "epoch": 2.336852207293666, + "grad_norm": 0.4510731188676562, + "learning_rate": 1.2282260931389976e-05, + "loss": 0.2614, + "mean_token_accuracy": 0.9041078269481659, + "step": 4870 + }, + { + "epoch": 2.3392514395393476, + "grad_norm": 0.45955652339680203, + "learning_rate": 1.2237824386775685e-05, + "loss": 0.2632, + "mean_token_accuracy": 0.9035033404827117, + "step": 4875 + }, + { + "epoch": 2.341650671785029, + "grad_norm": 0.45395656931314454, + "learning_rate": 1.2193387842161393e-05, + "loss": 0.2648, + "mean_token_accuracy": 0.9031056582927703, + "step": 4880 + }, + { + "epoch": 2.3440499040307103, + "grad_norm": 0.48526237862405125, + "learning_rate": 1.2148951297547104e-05, + "loss": 0.2633, + "mean_token_accuracy": 0.9035713195800781, + "step": 4885 + }, + { + "epoch": 2.3464491362763917, + "grad_norm": 0.4446519439579417, + "learning_rate": 1.2104514752932812e-05, + "loss": 0.261, + "mean_token_accuracy": 0.9043677151203156, + "step": 4890 + }, + { + "epoch": 2.348848368522073, + "grad_norm": 0.460184455442555, + "learning_rate": 1.2060078208318522e-05, + "loss": 0.2618, + "mean_token_accuracy": 0.9043756604194642, + "step": 4895 + }, + { + "epoch": 2.3512476007677545, + "grad_norm": 0.46259357140850754, + "learning_rate": 1.2015641663704232e-05, + "loss": 0.2641, + "mean_token_accuracy": 0.9031814455986023, + "step": 4900 + }, + { + "epoch": 2.3536468330134355, + "grad_norm": 0.44949250560949733, + "learning_rate": 1.1971205119089939e-05, + "loss": 0.2573, + "mean_token_accuracy": 0.9055706620216369, + "step": 4905 + }, + { + "epoch": 2.3560460652591173, + "grad_norm": 0.4763595046478618, + "learning_rate": 1.192676857447565e-05, + "loss": 0.2703, + "mean_token_accuracy": 0.9010188162326813, + "step": 4910 + }, + { + "epoch": 2.3584452975047983, + "grad_norm": 0.4747501461498077, + "learning_rate": 1.1882332029861358e-05, + "loss": 0.2751, + "mean_token_accuracy": 0.8994937121868134, + "step": 4915 + }, + { + "epoch": 2.36084452975048, + "grad_norm": 0.434688069222164, + "learning_rate": 1.1837895485247068e-05, + "loss": 0.2611, + "mean_token_accuracy": 0.9040064930915832, + "step": 4920 + }, + { + "epoch": 2.363243761996161, + "grad_norm": 0.44744820965947996, + "learning_rate": 1.1793458940632778e-05, + "loss": 0.2663, + "mean_token_accuracy": 0.9024548411369324, + "step": 4925 + }, + { + "epoch": 2.3656429942418424, + "grad_norm": 0.45455409918577533, + "learning_rate": 1.1749022396018487e-05, + "loss": 0.2665, + "mean_token_accuracy": 0.9024829626083374, + "step": 4930 + }, + { + "epoch": 2.368042226487524, + "grad_norm": 0.4526421090000582, + "learning_rate": 1.1704585851404195e-05, + "loss": 0.267, + "mean_token_accuracy": 0.902225923538208, + "step": 4935 + }, + { + "epoch": 2.370441458733205, + "grad_norm": 0.4565988566092905, + "learning_rate": 1.1660149306789903e-05, + "loss": 0.2749, + "mean_token_accuracy": 0.8996913850307464, + "step": 4940 + }, + { + "epoch": 2.3728406909788866, + "grad_norm": 0.45706018927722675, + "learning_rate": 1.1615712762175614e-05, + "loss": 0.2646, + "mean_token_accuracy": 0.9030478537082672, + "step": 4945 + }, + { + "epoch": 2.375239923224568, + "grad_norm": 0.43642789516463637, + "learning_rate": 1.1571276217561324e-05, + "loss": 0.2688, + "mean_token_accuracy": 0.9015024363994598, + "step": 4950 + }, + { + "epoch": 2.3776391554702494, + "grad_norm": 0.47138729568799137, + "learning_rate": 1.1526839672947032e-05, + "loss": 0.2725, + "mean_token_accuracy": 0.9005305051803589, + "step": 4955 + }, + { + "epoch": 2.380038387715931, + "grad_norm": 0.46096707859577163, + "learning_rate": 1.1482403128332742e-05, + "loss": 0.2629, + "mean_token_accuracy": 0.9035275757312775, + "step": 4960 + }, + { + "epoch": 2.382437619961612, + "grad_norm": 0.46062913409984646, + "learning_rate": 1.1437966583718449e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.9022954523563385, + "step": 4965 + }, + { + "epoch": 2.3848368522072936, + "grad_norm": 0.46322200081563997, + "learning_rate": 1.139353003910416e-05, + "loss": 0.2645, + "mean_token_accuracy": 0.9036223649978637, + "step": 4970 + }, + { + "epoch": 2.387236084452975, + "grad_norm": 0.46121487559901814, + "learning_rate": 1.134909349448987e-05, + "loss": 0.2632, + "mean_token_accuracy": 0.9033205091953278, + "step": 4975 + }, + { + "epoch": 2.3896353166986564, + "grad_norm": 0.4412637352222931, + "learning_rate": 1.1304656949875578e-05, + "loss": 0.2629, + "mean_token_accuracy": 0.9037455499172211, + "step": 4980 + }, + { + "epoch": 2.3920345489443378, + "grad_norm": 0.43743475515560587, + "learning_rate": 1.1260220405261288e-05, + "loss": 0.2639, + "mean_token_accuracy": 0.9039111793041229, + "step": 4985 + }, + { + "epoch": 2.394433781190019, + "grad_norm": 0.4702348583607359, + "learning_rate": 1.1215783860646997e-05, + "loss": 0.2705, + "mean_token_accuracy": 0.9011352300643921, + "step": 4990 + }, + { + "epoch": 2.3968330134357005, + "grad_norm": 0.4498054424074278, + "learning_rate": 1.1171347316032705e-05, + "loss": 0.2697, + "mean_token_accuracy": 0.90220405459404, + "step": 4995 + }, + { + "epoch": 2.399232245681382, + "grad_norm": 0.4514753499047538, + "learning_rate": 1.1126910771418415e-05, + "loss": 0.269, + "mean_token_accuracy": 0.901602441072464, + "step": 5000 + }, + { + "epoch": 2.4016314779270633, + "grad_norm": 0.43564513806024646, + "learning_rate": 1.1082474226804124e-05, + "loss": 0.2593, + "mean_token_accuracy": 0.9050846874713898, + "step": 5005 + }, + { + "epoch": 2.4040307101727447, + "grad_norm": 0.4490319843983511, + "learning_rate": 1.1038037682189834e-05, + "loss": 0.2631, + "mean_token_accuracy": 0.9036346018314362, + "step": 5010 + }, + { + "epoch": 2.406429942418426, + "grad_norm": 0.4862626649550366, + "learning_rate": 1.0993601137575544e-05, + "loss": 0.2656, + "mean_token_accuracy": 0.9032509863376618, + "step": 5015 + }, + { + "epoch": 2.4088291746641075, + "grad_norm": 0.43251142361474554, + "learning_rate": 1.0949164592961252e-05, + "loss": 0.2622, + "mean_token_accuracy": 0.9033509850502014, + "step": 5020 + }, + { + "epoch": 2.411228406909789, + "grad_norm": 0.457498561849623, + "learning_rate": 1.0904728048346961e-05, + "loss": 0.2636, + "mean_token_accuracy": 0.9033877074718475, + "step": 5025 + }, + { + "epoch": 2.4136276391554703, + "grad_norm": 0.4506931899290913, + "learning_rate": 1.086029150373267e-05, + "loss": 0.2677, + "mean_token_accuracy": 0.9021892070770263, + "step": 5030 + }, + { + "epoch": 2.4160268714011517, + "grad_norm": 0.4557896328818507, + "learning_rate": 1.081585495911838e-05, + "loss": 0.2644, + "mean_token_accuracy": 0.902996277809143, + "step": 5035 + }, + { + "epoch": 2.418426103646833, + "grad_norm": 0.49573202206092815, + "learning_rate": 1.077141841450409e-05, + "loss": 0.2786, + "mean_token_accuracy": 0.8987530529499054, + "step": 5040 + }, + { + "epoch": 2.4208253358925145, + "grad_norm": 0.47783860783254734, + "learning_rate": 1.0726981869889798e-05, + "loss": 0.2681, + "mean_token_accuracy": 0.9018009006977081, + "step": 5045 + }, + { + "epoch": 2.423224568138196, + "grad_norm": 0.4294184963917144, + "learning_rate": 1.0682545325275507e-05, + "loss": 0.2651, + "mean_token_accuracy": 0.9027284860610962, + "step": 5050 + }, + { + "epoch": 2.4256238003838773, + "grad_norm": 0.44114115756512584, + "learning_rate": 1.0638108780661215e-05, + "loss": 0.2592, + "mean_token_accuracy": 0.905122983455658, + "step": 5055 + }, + { + "epoch": 2.4280230326295587, + "grad_norm": 0.4552986094382972, + "learning_rate": 1.0593672236046925e-05, + "loss": 0.2713, + "mean_token_accuracy": 0.900846141576767, + "step": 5060 + }, + { + "epoch": 2.43042226487524, + "grad_norm": 0.44135073807729075, + "learning_rate": 1.0549235691432635e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.9026603162288666, + "step": 5065 + }, + { + "epoch": 2.4328214971209214, + "grad_norm": 0.4658946339558394, + "learning_rate": 1.0504799146818344e-05, + "loss": 0.2617, + "mean_token_accuracy": 0.9037299215793609, + "step": 5070 + }, + { + "epoch": 2.435220729366603, + "grad_norm": 0.4600480372973933, + "learning_rate": 1.0460362602204054e-05, + "loss": 0.2659, + "mean_token_accuracy": 0.9025923550128937, + "step": 5075 + }, + { + "epoch": 2.4376199616122842, + "grad_norm": 0.46058445465120573, + "learning_rate": 1.0415926057589762e-05, + "loss": 0.2613, + "mean_token_accuracy": 0.9045198023319244, + "step": 5080 + }, + { + "epoch": 2.4400191938579656, + "grad_norm": 0.42788161382405593, + "learning_rate": 1.0371489512975471e-05, + "loss": 0.267, + "mean_token_accuracy": 0.9020305931568146, + "step": 5085 + }, + { + "epoch": 2.442418426103647, + "grad_norm": 0.44027187566248915, + "learning_rate": 1.0327052968361181e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.9023063957691193, + "step": 5090 + }, + { + "epoch": 2.4448176583493284, + "grad_norm": 0.4420597215561833, + "learning_rate": 1.028261642374689e-05, + "loss": 0.2624, + "mean_token_accuracy": 0.9036471009254455, + "step": 5095 + }, + { + "epoch": 2.4472168905950094, + "grad_norm": 0.4501730001489318, + "learning_rate": 1.02381798791326e-05, + "loss": 0.2671, + "mean_token_accuracy": 0.902330607175827, + "step": 5100 + }, + { + "epoch": 2.449616122840691, + "grad_norm": 0.44641850092775204, + "learning_rate": 1.0193743334518308e-05, + "loss": 0.2679, + "mean_token_accuracy": 0.9021657586097718, + "step": 5105 + }, + { + "epoch": 2.452015355086372, + "grad_norm": 0.4490534199420156, + "learning_rate": 1.0149306789904017e-05, + "loss": 0.2638, + "mean_token_accuracy": 0.9032556712627411, + "step": 5110 + }, + { + "epoch": 2.454414587332054, + "grad_norm": 0.44072785923447616, + "learning_rate": 1.0104870245289727e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.9025269448757172, + "step": 5115 + }, + { + "epoch": 2.456813819577735, + "grad_norm": 0.4349885463972589, + "learning_rate": 1.0060433700675435e-05, + "loss": 0.2563, + "mean_token_accuracy": 0.9060261607170105, + "step": 5120 + }, + { + "epoch": 2.4592130518234163, + "grad_norm": 0.43592396045568815, + "learning_rate": 1.0015997156061145e-05, + "loss": 0.2573, + "mean_token_accuracy": 0.9053417444229126, + "step": 5125 + }, + { + "epoch": 2.4616122840690977, + "grad_norm": 0.46686763325718933, + "learning_rate": 9.971560611446856e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9016594767570496, + "step": 5130 + }, + { + "epoch": 2.464011516314779, + "grad_norm": 0.46845074051085117, + "learning_rate": 9.927124066832564e-06, + "loss": 0.2669, + "mean_token_accuracy": 0.9023274838924408, + "step": 5135 + }, + { + "epoch": 2.4664107485604605, + "grad_norm": 0.43958699879094765, + "learning_rate": 9.882687522218272e-06, + "loss": 0.2618, + "mean_token_accuracy": 0.9038767993450165, + "step": 5140 + }, + { + "epoch": 2.468809980806142, + "grad_norm": 0.41776189535267333, + "learning_rate": 9.838250977603981e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9043119847774506, + "step": 5145 + }, + { + "epoch": 2.4712092130518233, + "grad_norm": 0.4538080255278082, + "learning_rate": 9.793814432989691e-06, + "loss": 0.2597, + "mean_token_accuracy": 0.90516517162323, + "step": 5150 + }, + { + "epoch": 2.4736084452975047, + "grad_norm": 0.45447861990535154, + "learning_rate": 9.749377888375401e-06, + "loss": 0.2608, + "mean_token_accuracy": 0.9045280575752258, + "step": 5155 + }, + { + "epoch": 2.476007677543186, + "grad_norm": 0.4444329474266233, + "learning_rate": 9.70494134376111e-06, + "loss": 0.2665, + "mean_token_accuracy": 0.9021993517875672, + "step": 5160 + }, + { + "epoch": 2.4784069097888675, + "grad_norm": 0.43102288802079947, + "learning_rate": 9.660504799146818e-06, + "loss": 0.2644, + "mean_token_accuracy": 0.9035103678703308, + "step": 5165 + }, + { + "epoch": 2.480806142034549, + "grad_norm": 0.4376707591162667, + "learning_rate": 9.616068254532528e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9035827338695526, + "step": 5170 + }, + { + "epoch": 2.4832053742802302, + "grad_norm": 0.462803097758271, + "learning_rate": 9.571631709918237e-06, + "loss": 0.2633, + "mean_token_accuracy": 0.9033353626728058, + "step": 5175 + }, + { + "epoch": 2.4856046065259116, + "grad_norm": 0.46402677328691877, + "learning_rate": 9.527195165303947e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9018805861473084, + "step": 5180 + }, + { + "epoch": 2.488003838771593, + "grad_norm": 0.4626165921167753, + "learning_rate": 9.482758620689655e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9025485992431641, + "step": 5185 + }, + { + "epoch": 2.4904030710172744, + "grad_norm": 0.4405898286968182, + "learning_rate": 9.438322076075366e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9050846874713898, + "step": 5190 + }, + { + "epoch": 2.492802303262956, + "grad_norm": 0.4491809630944021, + "learning_rate": 9.393885531461074e-06, + "loss": 0.2581, + "mean_token_accuracy": 0.9051714241504669, + "step": 5195 + }, + { + "epoch": 2.495201535508637, + "grad_norm": 0.45982750692789043, + "learning_rate": 9.349448986846782e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.9034877181053161, + "step": 5200 + }, + { + "epoch": 2.4976007677543186, + "grad_norm": 0.4469470317195643, + "learning_rate": 9.305012442232493e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.9061425805091858, + "step": 5205 + }, + { + "epoch": 2.5, + "grad_norm": 0.46660175179380375, + "learning_rate": 9.260575897618201e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9026032865047455, + "step": 5210 + }, + { + "epoch": 2.5023992322456814, + "grad_norm": 0.4504938084645796, + "learning_rate": 9.216139353003911e-06, + "loss": 0.2604, + "mean_token_accuracy": 0.9043690323829651, + "step": 5215 + }, + { + "epoch": 2.504798464491363, + "grad_norm": 0.4917779663483051, + "learning_rate": 9.171702808389621e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9041010320186615, + "step": 5220 + }, + { + "epoch": 2.507197696737044, + "grad_norm": 0.44848121064946134, + "learning_rate": 9.127266263775328e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9026572048664093, + "step": 5225 + }, + { + "epoch": 2.5095969289827256, + "grad_norm": 0.45719698912437934, + "learning_rate": 9.082829719161038e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9016813576221466, + "step": 5230 + }, + { + "epoch": 2.511996161228407, + "grad_norm": 0.45266032559301805, + "learning_rate": 9.038393174546747e-06, + "loss": 0.2664, + "mean_token_accuracy": 0.9022712409496307, + "step": 5235 + }, + { + "epoch": 2.5143953934740884, + "grad_norm": 0.4621304603744557, + "learning_rate": 8.993956629932457e-06, + "loss": 0.2701, + "mean_token_accuracy": 0.9014797747135163, + "step": 5240 + }, + { + "epoch": 2.5167946257197698, + "grad_norm": 0.42584001010410305, + "learning_rate": 8.949520085318167e-06, + "loss": 0.2581, + "mean_token_accuracy": 0.9049862444400787, + "step": 5245 + }, + { + "epoch": 2.519193857965451, + "grad_norm": 0.42966249908808063, + "learning_rate": 8.905083540703876e-06, + "loss": 0.2603, + "mean_token_accuracy": 0.9045995116233826, + "step": 5250 + }, + { + "epoch": 2.5215930902111325, + "grad_norm": 0.4319478750873697, + "learning_rate": 8.860646996089584e-06, + "loss": 0.2609, + "mean_token_accuracy": 0.9043649673461914, + "step": 5255 + }, + { + "epoch": 2.523992322456814, + "grad_norm": 0.42974453071915686, + "learning_rate": 8.816210451475294e-06, + "loss": 0.264, + "mean_token_accuracy": 0.902919715642929, + "step": 5260 + }, + { + "epoch": 2.5263915547024953, + "grad_norm": 0.4397803370543199, + "learning_rate": 8.771773906861003e-06, + "loss": 0.2568, + "mean_token_accuracy": 0.9055386304855346, + "step": 5265 + }, + { + "epoch": 2.5287907869481767, + "grad_norm": 0.43395124387170547, + "learning_rate": 8.727337362246713e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9064959287643433, + "step": 5270 + }, + { + "epoch": 2.531190019193858, + "grad_norm": 0.4568247587575843, + "learning_rate": 8.682900817632421e-06, + "loss": 0.259, + "mean_token_accuracy": 0.9051628172397613, + "step": 5275 + }, + { + "epoch": 2.5335892514395395, + "grad_norm": 0.4527876906013087, + "learning_rate": 8.638464273018131e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9071043610572815, + "step": 5280 + }, + { + "epoch": 2.535988483685221, + "grad_norm": 0.4546151972549622, + "learning_rate": 8.59402772840384e-06, + "loss": 0.258, + "mean_token_accuracy": 0.9052597045898437, + "step": 5285 + }, + { + "epoch": 2.5383877159309023, + "grad_norm": 0.4356879694330123, + "learning_rate": 8.549591183789548e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.9034970939159394, + "step": 5290 + }, + { + "epoch": 2.5407869481765832, + "grad_norm": 0.44317873246471456, + "learning_rate": 8.505154639175259e-06, + "loss": 0.2665, + "mean_token_accuracy": 0.9028915882110595, + "step": 5295 + }, + { + "epoch": 2.543186180422265, + "grad_norm": 0.4465050891994833, + "learning_rate": 8.460718094560967e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.9050167143344879, + "step": 5300 + }, + { + "epoch": 2.545585412667946, + "grad_norm": 0.41974202659893994, + "learning_rate": 8.416281549946677e-06, + "loss": 0.2541, + "mean_token_accuracy": 0.9067621350288391, + "step": 5305 + }, + { + "epoch": 2.547984644913628, + "grad_norm": 0.4539270877064396, + "learning_rate": 8.371845005332386e-06, + "loss": 0.27, + "mean_token_accuracy": 0.9010617911815644, + "step": 5310 + }, + { + "epoch": 2.550383877159309, + "grad_norm": 0.4516961601525185, + "learning_rate": 8.327408460718094e-06, + "loss": 0.255, + "mean_token_accuracy": 0.906544154882431, + "step": 5315 + }, + { + "epoch": 2.5527831094049906, + "grad_norm": 0.4313966938059142, + "learning_rate": 8.282971916103804e-06, + "loss": 0.2603, + "mean_token_accuracy": 0.9046440362930298, + "step": 5320 + }, + { + "epoch": 2.5551823416506716, + "grad_norm": 0.45263025739656687, + "learning_rate": 8.238535371489513e-06, + "loss": 0.258, + "mean_token_accuracy": 0.9054847240447998, + "step": 5325 + }, + { + "epoch": 2.5575815738963534, + "grad_norm": 0.4528658588049054, + "learning_rate": 8.194098826875223e-06, + "loss": 0.2574, + "mean_token_accuracy": 0.9053511083126068, + "step": 5330 + }, + { + "epoch": 2.5599808061420344, + "grad_norm": 0.4480117751583745, + "learning_rate": 8.149662282260933e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.9041584849357605, + "step": 5335 + }, + { + "epoch": 2.5623800383877158, + "grad_norm": 0.4432714854943306, + "learning_rate": 8.10522573764664e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9051284432411194, + "step": 5340 + }, + { + "epoch": 2.564779270633397, + "grad_norm": 0.4713748453981665, + "learning_rate": 8.06078919303235e-06, + "loss": 0.2672, + "mean_token_accuracy": 0.9021687090396882, + "step": 5345 + }, + { + "epoch": 2.5671785028790786, + "grad_norm": 0.4292455291598537, + "learning_rate": 8.016352648418058e-06, + "loss": 0.2569, + "mean_token_accuracy": 0.9057573795318603, + "step": 5350 + }, + { + "epoch": 2.56957773512476, + "grad_norm": 0.4533127233186409, + "learning_rate": 7.971916103803769e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9021807849407196, + "step": 5355 + }, + { + "epoch": 2.5719769673704413, + "grad_norm": 0.45378175639273344, + "learning_rate": 7.927479559189479e-06, + "loss": 0.2581, + "mean_token_accuracy": 0.9051651895046234, + "step": 5360 + }, + { + "epoch": 2.5743761996161227, + "grad_norm": 0.4305214724468804, + "learning_rate": 7.883043014575187e-06, + "loss": 0.2603, + "mean_token_accuracy": 0.9044901371002197, + "step": 5365 + }, + { + "epoch": 2.576775431861804, + "grad_norm": 0.4338096404785789, + "learning_rate": 7.838606469960896e-06, + "loss": 0.2531, + "mean_token_accuracy": 0.9074086010456085, + "step": 5370 + }, + { + "epoch": 2.5791746641074855, + "grad_norm": 0.4489769755708051, + "learning_rate": 7.794169925346606e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9032134890556336, + "step": 5375 + }, + { + "epoch": 2.581573896353167, + "grad_norm": 0.4329446145757721, + "learning_rate": 7.749733380732314e-06, + "loss": 0.2535, + "mean_token_accuracy": 0.9067066788673401, + "step": 5380 + }, + { + "epoch": 2.5839731285988483, + "grad_norm": 0.4669192159499125, + "learning_rate": 7.705296836118024e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9048237323760986, + "step": 5385 + }, + { + "epoch": 2.5863723608445297, + "grad_norm": 0.4382912605817621, + "learning_rate": 7.660860291503733e-06, + "loss": 0.2607, + "mean_token_accuracy": 0.9042885541915894, + "step": 5390 + }, + { + "epoch": 2.588771593090211, + "grad_norm": 0.44945986639739094, + "learning_rate": 7.616423746889442e-06, + "loss": 0.2604, + "mean_token_accuracy": 0.9042424559593201, + "step": 5395 + }, + { + "epoch": 2.5911708253358925, + "grad_norm": 0.44732804884036653, + "learning_rate": 7.571987202275152e-06, + "loss": 0.2505, + "mean_token_accuracy": 0.907641065120697, + "step": 5400 + }, + { + "epoch": 2.593570057581574, + "grad_norm": 0.45128927801380114, + "learning_rate": 7.52755065766086e-06, + "loss": 0.2513, + "mean_token_accuracy": 0.9073582708835601, + "step": 5405 + }, + { + "epoch": 2.5959692898272553, + "grad_norm": 0.4836265077335699, + "learning_rate": 7.48311411304657e-06, + "loss": 0.2526, + "mean_token_accuracy": 0.9070647537708283, + "step": 5410 + }, + { + "epoch": 2.5983685220729367, + "grad_norm": 0.4242626701014422, + "learning_rate": 7.438677568432279e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.905083417892456, + "step": 5415 + }, + { + "epoch": 2.600767754318618, + "grad_norm": 0.46111786679737476, + "learning_rate": 7.394241023817988e-06, + "loss": 0.2626, + "mean_token_accuracy": 0.9037265062332154, + "step": 5420 + }, + { + "epoch": 2.6031669865642995, + "grad_norm": 0.4339306870051137, + "learning_rate": 7.349804479203698e-06, + "loss": 0.2593, + "mean_token_accuracy": 0.9046136617660523, + "step": 5425 + }, + { + "epoch": 2.605566218809981, + "grad_norm": 0.4579505156147382, + "learning_rate": 7.3053679345894065e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9056503415107727, + "step": 5430 + }, + { + "epoch": 2.6079654510556622, + "grad_norm": 0.45657019011680416, + "learning_rate": 7.260931389975116e-06, + "loss": 0.2592, + "mean_token_accuracy": 0.9047815501689911, + "step": 5435 + }, + { + "epoch": 2.6103646833013436, + "grad_norm": 0.4304320724321448, + "learning_rate": 7.216494845360824e-06, + "loss": 0.2585, + "mean_token_accuracy": 0.9053276717662812, + "step": 5440 + }, + { + "epoch": 2.612763915547025, + "grad_norm": 0.4873685490263298, + "learning_rate": 7.1720583007465344e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9017115533351898, + "step": 5445 + }, + { + "epoch": 2.6151631477927064, + "grad_norm": 0.5584142725364183, + "learning_rate": 7.127621756132244e-06, + "loss": 0.2692, + "mean_token_accuracy": 0.901091468334198, + "step": 5450 + }, + { + "epoch": 2.617562380038388, + "grad_norm": 0.4393456506678057, + "learning_rate": 7.083185211517952e-06, + "loss": 0.2593, + "mean_token_accuracy": 0.9047893643379211, + "step": 5455 + }, + { + "epoch": 2.619961612284069, + "grad_norm": 0.46284001629573074, + "learning_rate": 7.038748666903662e-06, + "loss": 0.2621, + "mean_token_accuracy": 0.903766393661499, + "step": 5460 + }, + { + "epoch": 2.6223608445297506, + "grad_norm": 0.43277293461684013, + "learning_rate": 6.994312122289372e-06, + "loss": 0.2599, + "mean_token_accuracy": 0.9046470880508423, + "step": 5465 + }, + { + "epoch": 2.624760076775432, + "grad_norm": 0.49262225304162727, + "learning_rate": 6.94987557767508e-06, + "loss": 0.2537, + "mean_token_accuracy": 0.9066051006317138, + "step": 5470 + }, + { + "epoch": 2.6271593090211134, + "grad_norm": 0.4466173887837331, + "learning_rate": 6.9054390330607895e-06, + "loss": 0.2659, + "mean_token_accuracy": 0.9025446891784668, + "step": 5475 + }, + { + "epoch": 2.629558541266795, + "grad_norm": 0.48143348155877075, + "learning_rate": 6.861002488446498e-06, + "loss": 0.2669, + "mean_token_accuracy": 0.9022337436676026, + "step": 5480 + }, + { + "epoch": 2.631957773512476, + "grad_norm": 0.4569390008901632, + "learning_rate": 6.816565943832208e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9040025889873504, + "step": 5485 + }, + { + "epoch": 2.634357005758157, + "grad_norm": 0.4324944567222657, + "learning_rate": 6.772129399217917e-06, + "loss": 0.2615, + "mean_token_accuracy": 0.9045298337936402, + "step": 5490 + }, + { + "epoch": 2.636756238003839, + "grad_norm": 0.45413612337003467, + "learning_rate": 6.727692854603626e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.9049846887588501, + "step": 5495 + }, + { + "epoch": 2.63915547024952, + "grad_norm": 0.4402754826227816, + "learning_rate": 6.683256309989336e-06, + "loss": 0.254, + "mean_token_accuracy": 0.9072097957134246, + "step": 5500 + }, + { + "epoch": 2.6415547024952017, + "grad_norm": 0.43715463368337787, + "learning_rate": 6.6388197653750445e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9015414953231812, + "step": 5505 + }, + { + "epoch": 2.6439539347408827, + "grad_norm": 0.4827231208784812, + "learning_rate": 6.594383220760754e-06, + "loss": 0.2647, + "mean_token_accuracy": 0.9032431781291962, + "step": 5510 + }, + { + "epoch": 2.6463531669865645, + "grad_norm": 0.4402689194179168, + "learning_rate": 6.549946676146464e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.907448124885559, + "step": 5515 + }, + { + "epoch": 2.6487523992322455, + "grad_norm": 0.4516566081943414, + "learning_rate": 6.505510131532172e-06, + "loss": 0.2618, + "mean_token_accuracy": 0.9039760291576385, + "step": 5520 + }, + { + "epoch": 2.6511516314779273, + "grad_norm": 0.45641350808044095, + "learning_rate": 6.461073586917882e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9020151495933533, + "step": 5525 + }, + { + "epoch": 2.6535508637236083, + "grad_norm": 0.45820168634061464, + "learning_rate": 6.41663704230359e-06, + "loss": 0.2594, + "mean_token_accuracy": 0.9047295212745666, + "step": 5530 + }, + { + "epoch": 2.65595009596929, + "grad_norm": 0.48078712024469483, + "learning_rate": 6.3722004976892995e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9041463553905487, + "step": 5535 + }, + { + "epoch": 2.658349328214971, + "grad_norm": 0.4310848038599131, + "learning_rate": 6.32776395307501e-06, + "loss": 0.2614, + "mean_token_accuracy": 0.9039002537727356, + "step": 5540 + }, + { + "epoch": 2.6607485604606524, + "grad_norm": 0.45563225963393356, + "learning_rate": 6.283327408460718e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.905088609457016, + "step": 5545 + }, + { + "epoch": 2.663147792706334, + "grad_norm": 0.4432971692450577, + "learning_rate": 6.238890863846427e-06, + "loss": 0.2539, + "mean_token_accuracy": 0.9068746447563172, + "step": 5550 + }, + { + "epoch": 2.6655470249520152, + "grad_norm": 0.45973148608425424, + "learning_rate": 6.194454319232137e-06, + "loss": 0.2636, + "mean_token_accuracy": 0.9035470664501191, + "step": 5555 + }, + { + "epoch": 2.6679462571976966, + "grad_norm": 0.42951757627124687, + "learning_rate": 6.150017774617846e-06, + "loss": 0.2559, + "mean_token_accuracy": 0.9060870885849, + "step": 5560 + }, + { + "epoch": 2.670345489443378, + "grad_norm": 0.4359773306703161, + "learning_rate": 6.1055812300035545e-06, + "loss": 0.2538, + "mean_token_accuracy": 0.9069285571575165, + "step": 5565 + }, + { + "epoch": 2.6727447216890594, + "grad_norm": 0.43446645300714337, + "learning_rate": 6.061144685389265e-06, + "loss": 0.2485, + "mean_token_accuracy": 0.9087942957878112, + "step": 5570 + }, + { + "epoch": 2.675143953934741, + "grad_norm": 0.45104511017647375, + "learning_rate": 6.016708140774974e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9027559340000153, + "step": 5575 + }, + { + "epoch": 2.677543186180422, + "grad_norm": 0.4522396662766918, + "learning_rate": 5.9722715961606824e-06, + "loss": 0.2585, + "mean_token_accuracy": 0.9057000160217286, + "step": 5580 + }, + { + "epoch": 2.6799424184261036, + "grad_norm": 0.4290452834201096, + "learning_rate": 5.927835051546392e-06, + "loss": 0.2581, + "mean_token_accuracy": 0.9052300214767456, + "step": 5585 + }, + { + "epoch": 2.682341650671785, + "grad_norm": 0.4232161059713672, + "learning_rate": 5.883398506932102e-06, + "loss": 0.2542, + "mean_token_accuracy": 0.906181687116623, + "step": 5590 + }, + { + "epoch": 2.6847408829174664, + "grad_norm": 0.43860671983027966, + "learning_rate": 5.83896196231781e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9042408883571624, + "step": 5595 + }, + { + "epoch": 2.6871401151631478, + "grad_norm": 0.4256076226233432, + "learning_rate": 5.79452541770352e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9054723083972931, + "step": 5600 + }, + { + "epoch": 2.689539347408829, + "grad_norm": 0.45573244253579825, + "learning_rate": 5.750088873089229e-06, + "loss": 0.2511, + "mean_token_accuracy": 0.9077927350997925, + "step": 5605 + }, + { + "epoch": 2.6919385796545106, + "grad_norm": 0.44045582178642395, + "learning_rate": 5.7056523284749374e-06, + "loss": 0.2567, + "mean_token_accuracy": 0.9058355093002319, + "step": 5610 + }, + { + "epoch": 2.694337811900192, + "grad_norm": 0.47977934654072885, + "learning_rate": 5.661215783860648e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9032580137252808, + "step": 5615 + }, + { + "epoch": 2.6967370441458733, + "grad_norm": 0.4379358865890731, + "learning_rate": 5.616779239246357e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9041158795356751, + "step": 5620 + }, + { + "epoch": 2.6991362763915547, + "grad_norm": 0.4445723936789973, + "learning_rate": 5.572342694632065e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9042987108230591, + "step": 5625 + }, + { + "epoch": 2.701535508637236, + "grad_norm": 0.4325473400172011, + "learning_rate": 5.527906150017775e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9031509757041931, + "step": 5630 + }, + { + "epoch": 2.7039347408829175, + "grad_norm": 0.4145757455771518, + "learning_rate": 5.483469605403484e-06, + "loss": 0.249, + "mean_token_accuracy": 0.9083544373512268, + "step": 5635 + }, + { + "epoch": 2.706333973128599, + "grad_norm": 0.45663630022058826, + "learning_rate": 5.439033060789193e-06, + "loss": 0.2601, + "mean_token_accuracy": 0.904858124256134, + "step": 5640 + }, + { + "epoch": 2.7087332053742803, + "grad_norm": 0.43350528253109694, + "learning_rate": 5.394596516174903e-06, + "loss": 0.2486, + "mean_token_accuracy": 0.9084395945072175, + "step": 5645 + }, + { + "epoch": 2.7111324376199617, + "grad_norm": 0.43791759142392406, + "learning_rate": 5.350159971560612e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9072106003761291, + "step": 5650 + }, + { + "epoch": 2.713531669865643, + "grad_norm": 0.4479123601679622, + "learning_rate": 5.30572342694632e-06, + "loss": 0.256, + "mean_token_accuracy": 0.9059386551380157, + "step": 5655 + }, + { + "epoch": 2.7159309021113245, + "grad_norm": 0.49310385054485034, + "learning_rate": 5.2612868823320305e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9039049267768859, + "step": 5660 + }, + { + "epoch": 2.718330134357006, + "grad_norm": 0.4480477544426612, + "learning_rate": 5.216850337717739e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9020596444606781, + "step": 5665 + }, + { + "epoch": 2.7207293666026873, + "grad_norm": 0.44784864613557246, + "learning_rate": 5.172413793103448e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9044237077236176, + "step": 5670 + }, + { + "epoch": 2.7231285988483687, + "grad_norm": 0.4503609778786115, + "learning_rate": 5.127977248489158e-06, + "loss": 0.2537, + "mean_token_accuracy": 0.9068785607814789, + "step": 5675 + }, + { + "epoch": 2.72552783109405, + "grad_norm": 0.5298964325267139, + "learning_rate": 5.083540703874867e-06, + "loss": 0.26, + "mean_token_accuracy": 0.9050964057445526, + "step": 5680 + }, + { + "epoch": 2.7279270633397315, + "grad_norm": 0.44324275118851564, + "learning_rate": 5.039104159260576e-06, + "loss": 0.2575, + "mean_token_accuracy": 0.9054651856422424, + "step": 5685 + }, + { + "epoch": 2.730326295585413, + "grad_norm": 0.44159279875032903, + "learning_rate": 4.9946676146462856e-06, + "loss": 0.2568, + "mean_token_accuracy": 0.9059472501277923, + "step": 5690 + }, + { + "epoch": 2.732725527831094, + "grad_norm": 0.4500000199898264, + "learning_rate": 4.950231070031994e-06, + "loss": 0.2595, + "mean_token_accuracy": 0.9048760831356049, + "step": 5695 + }, + { + "epoch": 2.7351247600767756, + "grad_norm": 0.4399650800626675, + "learning_rate": 4.905794525417703e-06, + "loss": 0.2558, + "mean_token_accuracy": 0.9062402307987213, + "step": 5700 + }, + { + "epoch": 2.7375239923224566, + "grad_norm": 0.43485531521512283, + "learning_rate": 4.8613579808034135e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9057276964187622, + "step": 5705 + }, + { + "epoch": 2.7399232245681384, + "grad_norm": 0.4379171063459807, + "learning_rate": 4.816921436189122e-06, + "loss": 0.257, + "mean_token_accuracy": 0.9057183146476746, + "step": 5710 + }, + { + "epoch": 2.7423224568138194, + "grad_norm": 0.44593794361071326, + "learning_rate": 4.772484891574831e-06, + "loss": 0.2555, + "mean_token_accuracy": 0.9061355412006378, + "step": 5715 + }, + { + "epoch": 2.744721689059501, + "grad_norm": 0.4454660382415944, + "learning_rate": 4.7280483469605406e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9044401228427887, + "step": 5720 + }, + { + "epoch": 2.747120921305182, + "grad_norm": 0.466071797740869, + "learning_rate": 4.68361180234625e-06, + "loss": 0.2489, + "mean_token_accuracy": 0.9086145997047425, + "step": 5725 + }, + { + "epoch": 2.749520153550864, + "grad_norm": 0.4312082041812329, + "learning_rate": 4.639175257731959e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9071082472801208, + "step": 5730 + }, + { + "epoch": 2.751919385796545, + "grad_norm": 0.45469928675392296, + "learning_rate": 4.5947387131176685e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9056995809078217, + "step": 5735 + }, + { + "epoch": 2.7543186180422263, + "grad_norm": 0.4476907768840197, + "learning_rate": 4.550302168503377e-06, + "loss": 0.2521, + "mean_token_accuracy": 0.9072535812854767, + "step": 5740 + }, + { + "epoch": 2.7567178502879077, + "grad_norm": 0.42447627907739277, + "learning_rate": 4.505865623889086e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.907138729095459, + "step": 5745 + }, + { + "epoch": 2.759117082533589, + "grad_norm": 0.45012533604922167, + "learning_rate": 4.4614290792747964e-06, + "loss": 0.2509, + "mean_token_accuracy": 0.9077301681041717, + "step": 5750 + }, + { + "epoch": 2.7615163147792705, + "grad_norm": 0.4409578164982194, + "learning_rate": 4.416992534660505e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.906362110376358, + "step": 5755 + }, + { + "epoch": 2.763915547024952, + "grad_norm": 0.48147201127170397, + "learning_rate": 4.372555990046214e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9043830871582031, + "step": 5760 + }, + { + "epoch": 2.7663147792706333, + "grad_norm": 0.4497274342056709, + "learning_rate": 4.3281194454319235e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9043799638748169, + "step": 5765 + }, + { + "epoch": 2.7687140115163147, + "grad_norm": 0.4413348164130912, + "learning_rate": 4.283682900817633e-06, + "loss": 0.2536, + "mean_token_accuracy": 0.9070066928863525, + "step": 5770 + }, + { + "epoch": 2.771113243761996, + "grad_norm": 0.4573010381962229, + "learning_rate": 4.239246356203342e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9065881192684173, + "step": 5775 + }, + { + "epoch": 2.7735124760076775, + "grad_norm": 0.4370029883452461, + "learning_rate": 4.1948098115890514e-06, + "loss": 0.2448, + "mean_token_accuracy": 0.9098528385162353, + "step": 5780 + }, + { + "epoch": 2.775911708253359, + "grad_norm": 0.43272263858643206, + "learning_rate": 4.15037326697476e-06, + "loss": 0.2514, + "mean_token_accuracy": 0.9076957941055298, + "step": 5785 + }, + { + "epoch": 2.7783109404990403, + "grad_norm": 0.44030045712445476, + "learning_rate": 4.105936722360469e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.9051995396614074, + "step": 5790 + }, + { + "epoch": 2.7807101727447217, + "grad_norm": 0.42445012441746516, + "learning_rate": 4.0615001777461785e-06, + "loss": 0.2557, + "mean_token_accuracy": 0.9057902097702026, + "step": 5795 + }, + { + "epoch": 2.783109404990403, + "grad_norm": 0.44008392900477256, + "learning_rate": 4.017063633131888e-06, + "loss": 0.2571, + "mean_token_accuracy": 0.9057230174541473, + "step": 5800 + }, + { + "epoch": 2.7855086372360844, + "grad_norm": 0.46535904070186546, + "learning_rate": 3.972627088517597e-06, + "loss": 0.2621, + "mean_token_accuracy": 0.9043276190757752, + "step": 5805 + }, + { + "epoch": 2.787907869481766, + "grad_norm": 0.4181165672832623, + "learning_rate": 3.9281905439033065e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9042987048625946, + "step": 5810 + }, + { + "epoch": 2.7903071017274472, + "grad_norm": 0.4522846803133769, + "learning_rate": 3.883753999289016e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9064529299736023, + "step": 5815 + }, + { + "epoch": 2.7927063339731286, + "grad_norm": 0.44815812391112714, + "learning_rate": 3.839317454674725e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9048435807228088, + "step": 5820 + }, + { + "epoch": 2.79510556621881, + "grad_norm": 0.46421235672539135, + "learning_rate": 3.794880910060434e-06, + "loss": 0.2592, + "mean_token_accuracy": 0.9051807999610901, + "step": 5825 + }, + { + "epoch": 2.7975047984644914, + "grad_norm": 3.709167924530769, + "learning_rate": 3.750444365446143e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9065676093101501, + "step": 5830 + }, + { + "epoch": 2.799904030710173, + "grad_norm": 0.4240367546919462, + "learning_rate": 3.706007820831852e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9064964950084686, + "step": 5835 + }, + { + "epoch": 2.802303262955854, + "grad_norm": 0.4456102248393658, + "learning_rate": 3.661571276217562e-06, + "loss": 0.2539, + "mean_token_accuracy": 0.906609308719635, + "step": 5840 + }, + { + "epoch": 2.8047024952015356, + "grad_norm": 0.44600145811223735, + "learning_rate": 3.617134731603271e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.905487060546875, + "step": 5845 + }, + { + "epoch": 2.807101727447217, + "grad_norm": 0.4376857420314512, + "learning_rate": 3.5726981869889797e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9028728306293488, + "step": 5850 + }, + { + "epoch": 2.8095009596928984, + "grad_norm": 0.43309004835981246, + "learning_rate": 3.528261642374689e-06, + "loss": 0.255, + "mean_token_accuracy": 0.9064680218696595, + "step": 5855 + }, + { + "epoch": 2.8119001919385798, + "grad_norm": 0.45511351183444065, + "learning_rate": 3.483825097760398e-06, + "loss": 0.2535, + "mean_token_accuracy": 0.9068301141262054, + "step": 5860 + }, + { + "epoch": 2.814299424184261, + "grad_norm": 0.45459227347655273, + "learning_rate": 3.4393885531461076e-06, + "loss": 0.2542, + "mean_token_accuracy": 0.9065988540649415, + "step": 5865 + }, + { + "epoch": 2.8166986564299425, + "grad_norm": 0.4474470069462497, + "learning_rate": 3.394952008531817e-06, + "loss": 0.2528, + "mean_token_accuracy": 0.9072067022323609, + "step": 5870 + }, + { + "epoch": 2.819097888675624, + "grad_norm": 0.43956377189608403, + "learning_rate": 3.350515463917526e-06, + "loss": 0.2584, + "mean_token_accuracy": 0.9054464340209961, + "step": 5875 + }, + { + "epoch": 2.8214971209213053, + "grad_norm": 0.4369485147974561, + "learning_rate": 3.3060789193032347e-06, + "loss": 0.2533, + "mean_token_accuracy": 0.9070504426956176, + "step": 5880 + }, + { + "epoch": 2.8238963531669867, + "grad_norm": 0.4300540536602179, + "learning_rate": 3.2616423746889444e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9066879153251648, + "step": 5885 + }, + { + "epoch": 2.8262955854126677, + "grad_norm": 0.43603672723055475, + "learning_rate": 3.2172058300746537e-06, + "loss": 0.2585, + "mean_token_accuracy": 0.905434650182724, + "step": 5890 + }, + { + "epoch": 2.8286948176583495, + "grad_norm": 0.44584348089802384, + "learning_rate": 3.1727692854603626e-06, + "loss": 0.2431, + "mean_token_accuracy": 0.910099059343338, + "step": 5895 + }, + { + "epoch": 2.8310940499040305, + "grad_norm": 0.4255916066845973, + "learning_rate": 3.128332740846072e-06, + "loss": 0.2459, + "mean_token_accuracy": 0.9094623148441314, + "step": 5900 + }, + { + "epoch": 2.8334932821497123, + "grad_norm": 0.43813793048191685, + "learning_rate": 3.0838961962317812e-06, + "loss": 0.2567, + "mean_token_accuracy": 0.9055675387382507, + "step": 5905 + }, + { + "epoch": 2.8358925143953932, + "grad_norm": 0.4209655933645841, + "learning_rate": 3.03945965161749e-06, + "loss": 0.2541, + "mean_token_accuracy": 0.9070176243782043, + "step": 5910 + }, + { + "epoch": 2.838291746641075, + "grad_norm": 0.4428447675746139, + "learning_rate": 2.9950231070031994e-06, + "loss": 0.2586, + "mean_token_accuracy": 0.9049026608467102, + "step": 5915 + }, + { + "epoch": 2.840690978886756, + "grad_norm": 0.4404991474851633, + "learning_rate": 2.9505865623889087e-06, + "loss": 0.2584, + "mean_token_accuracy": 0.9048484981060028, + "step": 5920 + }, + { + "epoch": 2.843090211132438, + "grad_norm": 0.4440934912962013, + "learning_rate": 2.906150017774618e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9047917068004608, + "step": 5925 + }, + { + "epoch": 2.845489443378119, + "grad_norm": 0.4453569173499868, + "learning_rate": 2.861713473160327e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9035783410072327, + "step": 5930 + }, + { + "epoch": 2.8478886756238007, + "grad_norm": 0.42075498659491584, + "learning_rate": 2.8172769285460367e-06, + "loss": 0.2535, + "mean_token_accuracy": 0.907467657327652, + "step": 5935 + }, + { + "epoch": 2.8502879078694816, + "grad_norm": 0.4463099961746632, + "learning_rate": 2.7728403839317456e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.9073926568031311, + "step": 5940 + }, + { + "epoch": 2.852687140115163, + "grad_norm": 0.4270911922738369, + "learning_rate": 2.7284038393174545e-06, + "loss": 0.248, + "mean_token_accuracy": 0.9087224245071411, + "step": 5945 + }, + { + "epoch": 2.8550863723608444, + "grad_norm": 0.4420467470565063, + "learning_rate": 2.683967294703164e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9067691683769226, + "step": 5950 + }, + { + "epoch": 2.857485604606526, + "grad_norm": 0.45704237431731065, + "learning_rate": 2.639530750088873e-06, + "loss": 0.2532, + "mean_token_accuracy": 0.9069824755191803, + "step": 5955 + }, + { + "epoch": 2.859884836852207, + "grad_norm": 0.4293486256624201, + "learning_rate": 2.5950942054745824e-06, + "loss": 0.2571, + "mean_token_accuracy": 0.9057079017162323, + "step": 5960 + }, + { + "epoch": 2.8622840690978886, + "grad_norm": 0.4474552765618025, + "learning_rate": 2.5506576608602917e-06, + "loss": 0.2439, + "mean_token_accuracy": 0.9103707253932953, + "step": 5965 + }, + { + "epoch": 2.86468330134357, + "grad_norm": 0.4626661887370989, + "learning_rate": 2.506221116246001e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9054373741149903, + "step": 5970 + }, + { + "epoch": 2.8670825335892514, + "grad_norm": 0.44243947426860386, + "learning_rate": 2.46178457163171e-06, + "loss": 0.2511, + "mean_token_accuracy": 0.9077575266361236, + "step": 5975 + }, + { + "epoch": 2.8694817658349328, + "grad_norm": 0.4297876711361223, + "learning_rate": 2.417348027017419e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9047979533672332, + "step": 5980 + }, + { + "epoch": 2.871880998080614, + "grad_norm": 0.4522470971407196, + "learning_rate": 2.3729114824031285e-06, + "loss": 0.2515, + "mean_token_accuracy": 0.9076629757881165, + "step": 5985 + }, + { + "epoch": 2.8742802303262955, + "grad_norm": 0.4595793236712429, + "learning_rate": 2.3284749377888374e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.903690081834793, + "step": 5990 + }, + { + "epoch": 2.876679462571977, + "grad_norm": 0.4346630104029628, + "learning_rate": 2.2840383931745467e-06, + "loss": 0.2511, + "mean_token_accuracy": 0.9076651513576508, + "step": 5995 + }, + { + "epoch": 2.8790786948176583, + "grad_norm": 0.45090891263086474, + "learning_rate": 2.239601848560256e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9068621635437012, + "step": 6000 + }, + { + "epoch": 2.8814779270633397, + "grad_norm": 0.4124115306908796, + "learning_rate": 2.1951653039459653e-06, + "loss": 0.2496, + "mean_token_accuracy": 0.9084739625453949, + "step": 6005 + }, + { + "epoch": 2.883877159309021, + "grad_norm": 0.447834623689059, + "learning_rate": 2.1507287593316742e-06, + "loss": 0.2546, + "mean_token_accuracy": 0.906530886888504, + "step": 6010 + }, + { + "epoch": 2.8862763915547025, + "grad_norm": 0.4196910908382261, + "learning_rate": 2.106292214717384e-06, + "loss": 0.2557, + "mean_token_accuracy": 0.9063535273075104, + "step": 6015 + }, + { + "epoch": 2.888675623800384, + "grad_norm": 0.42896879529787013, + "learning_rate": 2.061855670103093e-06, + "loss": 0.2504, + "mean_token_accuracy": 0.9081020653247833, + "step": 6020 + }, + { + "epoch": 2.8910748560460653, + "grad_norm": 0.4349599465103453, + "learning_rate": 2.017419125488802e-06, + "loss": 0.2555, + "mean_token_accuracy": 0.9059644281864166, + "step": 6025 + }, + { + "epoch": 2.8934740882917467, + "grad_norm": 0.41251060268351214, + "learning_rate": 1.9729825808745115e-06, + "loss": 0.2472, + "mean_token_accuracy": 0.9089505612850189, + "step": 6030 + }, + { + "epoch": 2.895873320537428, + "grad_norm": 0.437831283605074, + "learning_rate": 1.9285460362602203e-06, + "loss": 0.252, + "mean_token_accuracy": 0.9072934269905091, + "step": 6035 + }, + { + "epoch": 2.8982725527831095, + "grad_norm": 0.45750950782775357, + "learning_rate": 1.8841094916459297e-06, + "loss": 0.2538, + "mean_token_accuracy": 0.9064003944396972, + "step": 6040 + }, + { + "epoch": 2.900671785028791, + "grad_norm": 0.44178784137868615, + "learning_rate": 1.8396729470316388e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.908105194568634, + "step": 6045 + }, + { + "epoch": 2.9030710172744723, + "grad_norm": 0.4285607000273115, + "learning_rate": 1.7952364024173483e-06, + "loss": 0.2509, + "mean_token_accuracy": 0.9079598605632782, + "step": 6050 + }, + { + "epoch": 2.9054702495201536, + "grad_norm": 0.47649370622698345, + "learning_rate": 1.7507998578030572e-06, + "loss": 0.2466, + "mean_token_accuracy": 0.9093623101711273, + "step": 6055 + }, + { + "epoch": 2.907869481765835, + "grad_norm": 0.42412747407093854, + "learning_rate": 1.7063633131887667e-06, + "loss": 0.2564, + "mean_token_accuracy": 0.906291025876999, + "step": 6060 + }, + { + "epoch": 2.9102687140115164, + "grad_norm": 0.436149863588771, + "learning_rate": 1.6619267685744758e-06, + "loss": 0.2541, + "mean_token_accuracy": 0.9071348130702972, + "step": 6065 + }, + { + "epoch": 2.912667946257198, + "grad_norm": 0.4369219283799936, + "learning_rate": 1.617490223960185e-06, + "loss": 0.2491, + "mean_token_accuracy": 0.9085583508014679, + "step": 6070 + }, + { + "epoch": 2.915067178502879, + "grad_norm": 0.437785169638069, + "learning_rate": 1.5730536793458942e-06, + "loss": 0.248, + "mean_token_accuracy": 0.9086403906345367, + "step": 6075 + }, + { + "epoch": 2.9174664107485606, + "grad_norm": 0.43690114298680655, + "learning_rate": 1.5286171347316033e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.907256692647934, + "step": 6080 + }, + { + "epoch": 2.919865642994242, + "grad_norm": 0.4494829201863112, + "learning_rate": 1.4841805901173126e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9058855175971985, + "step": 6085 + }, + { + "epoch": 2.9222648752399234, + "grad_norm": 0.4209094239321652, + "learning_rate": 1.439744045503022e-06, + "loss": 0.2517, + "mean_token_accuracy": 0.9072973370552063, + "step": 6090 + }, + { + "epoch": 2.9246641074856043, + "grad_norm": 0.4382591677968272, + "learning_rate": 1.3953075008887308e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9073457717895508, + "step": 6095 + }, + { + "epoch": 2.927063339731286, + "grad_norm": 0.42827302601483624, + "learning_rate": 1.35087095627444e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.9061714768409729, + "step": 6100 + }, + { + "epoch": 2.929462571976967, + "grad_norm": 0.447579716688551, + "learning_rate": 1.3064344116601494e-06, + "loss": 0.2553, + "mean_token_accuracy": 0.9065722763538361, + "step": 6105 + }, + { + "epoch": 2.931861804222649, + "grad_norm": 0.42827536998813603, + "learning_rate": 1.2619978670458585e-06, + "loss": 0.2476, + "mean_token_accuracy": 0.909209954738617, + "step": 6110 + }, + { + "epoch": 2.93426103646833, + "grad_norm": 0.4540116982203758, + "learning_rate": 1.2175613224315678e-06, + "loss": 0.2546, + "mean_token_accuracy": 0.9069316864013672, + "step": 6115 + }, + { + "epoch": 2.9366602687140118, + "grad_norm": 0.4374462373091291, + "learning_rate": 1.173124777817277e-06, + "loss": 0.2533, + "mean_token_accuracy": 0.9067832350730896, + "step": 6120 + }, + { + "epoch": 2.9390595009596927, + "grad_norm": 0.41988101865620747, + "learning_rate": 1.1286882332029862e-06, + "loss": 0.25, + "mean_token_accuracy": 0.9078817367553711, + "step": 6125 + }, + { + "epoch": 2.9414587332053745, + "grad_norm": 0.44991139970047894, + "learning_rate": 1.0842516885886955e-06, + "loss": 0.2526, + "mean_token_accuracy": 0.9072129487991333, + "step": 6130 + }, + { + "epoch": 2.9438579654510555, + "grad_norm": 0.4322545174476245, + "learning_rate": 1.0398151439744046e-06, + "loss": 0.2452, + "mean_token_accuracy": 0.909875613451004, + "step": 6135 + }, + { + "epoch": 2.946257197696737, + "grad_norm": 0.44606631531514007, + "learning_rate": 9.953785993601137e-07, + "loss": 0.2499, + "mean_token_accuracy": 0.9080965995788575, + "step": 6140 + }, + { + "epoch": 2.9486564299424183, + "grad_norm": 0.4174486331140099, + "learning_rate": 9.509420547458229e-07, + "loss": 0.2461, + "mean_token_accuracy": 0.9096888959407806, + "step": 6145 + }, + { + "epoch": 2.9510556621880997, + "grad_norm": 0.4314477007269236, + "learning_rate": 9.065055101315321e-07, + "loss": 0.254, + "mean_token_accuracy": 0.9069934070110321, + "step": 6150 + }, + { + "epoch": 2.953454894433781, + "grad_norm": 0.42487353340805534, + "learning_rate": 8.620689655172415e-07, + "loss": 0.242, + "mean_token_accuracy": 0.9112311661243438, + "step": 6155 + }, + { + "epoch": 2.9558541266794625, + "grad_norm": 0.45994818847778246, + "learning_rate": 8.176324209029507e-07, + "loss": 0.2509, + "mean_token_accuracy": 0.9082200407981873, + "step": 6160 + }, + { + "epoch": 2.958253358925144, + "grad_norm": 0.4334130334082495, + "learning_rate": 7.731958762886599e-07, + "loss": 0.247, + "mean_token_accuracy": 0.9093940913677215, + "step": 6165 + }, + { + "epoch": 2.9606525911708252, + "grad_norm": 0.44597082395725934, + "learning_rate": 7.28759331674369e-07, + "loss": 0.2533, + "mean_token_accuracy": 0.9066949546337127, + "step": 6170 + }, + { + "epoch": 2.9630518234165066, + "grad_norm": 0.4557245109030177, + "learning_rate": 6.843227870600783e-07, + "loss": 0.2553, + "mean_token_accuracy": 0.9065178632736206, + "step": 6175 + }, + { + "epoch": 2.965451055662188, + "grad_norm": 0.4478527557349485, + "learning_rate": 6.398862424457875e-07, + "loss": 0.2445, + "mean_token_accuracy": 0.9099810898303986, + "step": 6180 + }, + { + "epoch": 2.9678502879078694, + "grad_norm": 0.41646361122311465, + "learning_rate": 5.954496978314967e-07, + "loss": 0.2449, + "mean_token_accuracy": 0.9099904656410217, + "step": 6185 + }, + { + "epoch": 2.970249520153551, + "grad_norm": 0.40765652811684916, + "learning_rate": 5.510131532172059e-07, + "loss": 0.2459, + "mean_token_accuracy": 0.9095560610294342, + "step": 6190 + }, + { + "epoch": 2.972648752399232, + "grad_norm": 0.425191482050169, + "learning_rate": 5.06576608602915e-07, + "loss": 0.2544, + "mean_token_accuracy": 0.9066918194293976, + "step": 6195 + }, + { + "epoch": 2.9750479846449136, + "grad_norm": 0.43069330230599245, + "learning_rate": 4.6214006398862424e-07, + "loss": 0.2597, + "mean_token_accuracy": 0.9043260455131531, + "step": 6200 + }, + { + "epoch": 2.977447216890595, + "grad_norm": 0.4344718819179382, + "learning_rate": 4.177035193743335e-07, + "loss": 0.2452, + "mean_token_accuracy": 0.9098771750926972, + "step": 6205 + }, + { + "epoch": 2.9798464491362764, + "grad_norm": 0.4325317501106857, + "learning_rate": 3.7326697476004265e-07, + "loss": 0.2505, + "mean_token_accuracy": 0.907985383272171, + "step": 6210 + }, + { + "epoch": 2.982245681381958, + "grad_norm": 0.4126353892608848, + "learning_rate": 3.288304301457519e-07, + "loss": 0.2437, + "mean_token_accuracy": 0.9105557441711426, + "step": 6215 + }, + { + "epoch": 2.984644913627639, + "grad_norm": 0.4173259485121355, + "learning_rate": 2.8439388553146106e-07, + "loss": 0.2484, + "mean_token_accuracy": 0.9091325998306274, + "step": 6220 + }, + { + "epoch": 2.9870441458733206, + "grad_norm": 0.43216646266753433, + "learning_rate": 2.399573409171703e-07, + "loss": 0.2476, + "mean_token_accuracy": 0.9084903717041015, + "step": 6225 + }, + { + "epoch": 2.989443378119002, + "grad_norm": 0.44710775517043494, + "learning_rate": 1.955207963028795e-07, + "loss": 0.2503, + "mean_token_accuracy": 0.9080036342144012, + "step": 6230 + }, + { + "epoch": 2.9918426103646834, + "grad_norm": 0.45620131139782905, + "learning_rate": 1.510842516885887e-07, + "loss": 0.2535, + "mean_token_accuracy": 0.9065212190151215, + "step": 6235 + }, + { + "epoch": 2.9942418426103647, + "grad_norm": 0.4430030123252174, + "learning_rate": 1.0664770707429792e-07, + "loss": 0.2494, + "mean_token_accuracy": 0.908391398191452, + "step": 6240 + }, + { + "epoch": 2.996641074856046, + "grad_norm": 0.4180541397263229, + "learning_rate": 6.221116246000711e-08, + "loss": 0.2481, + "mean_token_accuracy": 0.9089529037475585, + "step": 6245 + }, + { + "epoch": 2.9990403071017275, + "grad_norm": 0.43586407533469335, + "learning_rate": 1.7774617845716316e-08, + "loss": 0.2476, + "mean_token_accuracy": 0.9093365132808685, + "step": 6250 + }, + { + "epoch": 3.0, + "mean_token_accuracy": 0.909225344657898, + "step": 6252, + "total_flos": 2553013234827264.0, + "train_loss": 0.4164313976191132, + "train_runtime": 41448.963, + "train_samples_per_second": 2.413, + "train_steps_per_second": 0.151 + } + ], + "logging_steps": 5, + "max_steps": 6252, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2553013234827264.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}