{ "best_metric": 0.15935022, "best_model_checkpoint": "/home/sushant/D1/MIUA/kvasir-format/training/v0-20250203-134351/checkpoint-10400", "epoch": 4.763719512195122, "eval_steps": 800, "global_step": 12500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00038109756097560977, "grad_norm": 1.696879506111145, "learning_rate": 9.999999856658455e-05, "loss": 1.2426021099090576, "memory(GiB)": 30.36, "step": 1, "token_acc": 0.6371814092953523, "train_speed(iter/s)": 0.148648 }, { "epoch": 0.0019054878048780487, "grad_norm": 0.4253880977630615, "learning_rate": 9.99999641646178e-05, "loss": 0.9356827139854431, "memory(GiB)": 62.88, "step": 5, "token_acc": 0.6943032896496389, "train_speed(iter/s)": 0.256205 }, { "epoch": 0.0038109756097560975, "grad_norm": 0.24213987588882446, "learning_rate": 9.999985665852258e-05, "loss": 0.7650547027587891, "memory(GiB)": 74.88, "step": 10, "token_acc": 0.7004583445672689, "train_speed(iter/s)": 0.293321 }, { "epoch": 0.005716463414634146, "grad_norm": 0.3122990131378174, "learning_rate": 9.99996774818684e-05, "loss": 0.7247544288635254, "memory(GiB)": 103.13, "step": 15, "token_acc": 0.7016219239373602, "train_speed(iter/s)": 0.284693 }, { "epoch": 0.007621951219512195, "grad_norm": 0.2306428849697113, "learning_rate": 9.999942663491213e-05, "loss": 0.7128955364227295, "memory(GiB)": 103.13, "step": 20, "token_acc": 0.7067496280265115, "train_speed(iter/s)": 0.289136 }, { "epoch": 0.009527439024390244, "grad_norm": 0.2907879054546356, "learning_rate": 9.999910411801334e-05, "loss": 0.7169328212738038, "memory(GiB)": 103.13, "step": 25, "token_acc": 0.6980940794809408, "train_speed(iter/s)": 0.284103 }, { "epoch": 0.011432926829268292, "grad_norm": 0.23731078207492828, "learning_rate": 9.999870993163431e-05, "loss": 0.5781228542327881, "memory(GiB)": 115.14, "step": 30, "token_acc": 0.7203408836061898, "train_speed(iter/s)": 0.278186 }, { "epoch": 0.013338414634146341, "grad_norm": 0.1967283934354782, "learning_rate": 9.999824407634009e-05, "loss": 0.7009762287139892, "memory(GiB)": 132.65, "step": 35, "token_acc": 0.706540825285338, "train_speed(iter/s)": 0.275851 }, { "epoch": 0.01524390243902439, "grad_norm": 0.4780254065990448, "learning_rate": 9.999770655279843e-05, "loss": 0.6918815612792969, "memory(GiB)": 132.65, "step": 40, "token_acc": 0.709316468012649, "train_speed(iter/s)": 0.283815 }, { "epoch": 0.01714939024390244, "grad_norm": 0.2728957235813141, "learning_rate": 9.999709736177985e-05, "loss": 0.6752120018005371, "memory(GiB)": 132.65, "step": 45, "token_acc": 0.722938989722283, "train_speed(iter/s)": 0.286427 }, { "epoch": 0.019054878048780487, "grad_norm": 0.18847467005252838, "learning_rate": 9.999641650415752e-05, "loss": 0.6873096942901611, "memory(GiB)": 132.65, "step": 50, "token_acc": 0.716307424341389, "train_speed(iter/s)": 0.283735 }, { "epoch": 0.020960365853658538, "grad_norm": 0.18952184915542603, "learning_rate": 9.999566398090745e-05, "loss": 0.648162841796875, "memory(GiB)": 144.69, "step": 55, "token_acc": 0.7134015069967707, "train_speed(iter/s)": 0.284029 }, { "epoch": 0.022865853658536585, "grad_norm": 0.9265457391738892, "learning_rate": 9.99948397931083e-05, "loss": 0.6501975059509277, "memory(GiB)": 144.69, "step": 60, "token_acc": 0.7278516057585825, "train_speed(iter/s)": 0.287907 }, { "epoch": 0.024771341463414635, "grad_norm": 0.21030771732330322, "learning_rate": 9.999394394194146e-05, "loss": 0.6837770938873291, "memory(GiB)": 144.69, "step": 65, "token_acc": 0.7203331020124913, "train_speed(iter/s)": 0.291824 }, { "epoch": 0.026676829268292682, "grad_norm": 0.4528329074382782, "learning_rate": 9.999297642869105e-05, "loss": 0.6526165008544922, "memory(GiB)": 144.69, "step": 70, "token_acc": 0.7306688089977907, "train_speed(iter/s)": 0.294291 }, { "epoch": 0.028582317073170733, "grad_norm": 0.5394845604896545, "learning_rate": 9.999193725474396e-05, "loss": 0.6295528888702393, "memory(GiB)": 144.69, "step": 75, "token_acc": 0.7206040992448759, "train_speed(iter/s)": 0.298286 }, { "epoch": 0.03048780487804878, "grad_norm": 0.6294591426849365, "learning_rate": 9.999082642158973e-05, "loss": 0.6407809257507324, "memory(GiB)": 144.69, "step": 80, "token_acc": 0.7232625884311278, "train_speed(iter/s)": 0.302149 }, { "epoch": 0.03239329268292683, "grad_norm": 0.6595126986503601, "learning_rate": 9.998964393082063e-05, "loss": 0.6469155311584472, "memory(GiB)": 144.69, "step": 85, "token_acc": 0.724197247706422, "train_speed(iter/s)": 0.296591 }, { "epoch": 0.03429878048780488, "grad_norm": 0.5030885934829712, "learning_rate": 9.998838978413168e-05, "loss": 0.6591710090637207, "memory(GiB)": 144.69, "step": 90, "token_acc": 0.7248803827751196, "train_speed(iter/s)": 0.299184 }, { "epoch": 0.036204268292682924, "grad_norm": 0.5166705250740051, "learning_rate": 9.99870639833206e-05, "loss": 0.6369208335876465, "memory(GiB)": 144.69, "step": 95, "token_acc": 0.7260526315789474, "train_speed(iter/s)": 0.302029 }, { "epoch": 0.038109756097560975, "grad_norm": 0.7396672964096069, "learning_rate": 9.99856665302878e-05, "loss": 0.6004275798797607, "memory(GiB)": 144.69, "step": 100, "token_acc": 0.7415181840371003, "train_speed(iter/s)": 0.303265 }, { "epoch": 0.040015243902439025, "grad_norm": 1.3864667415618896, "learning_rate": 9.998419742703641e-05, "loss": 0.6409744262695313, "memory(GiB)": 144.69, "step": 105, "token_acc": 0.7332049121117264, "train_speed(iter/s)": 0.305058 }, { "epoch": 0.041920731707317076, "grad_norm": 0.8232730031013489, "learning_rate": 9.998265667567226e-05, "loss": 0.5957656860351562, "memory(GiB)": 144.69, "step": 110, "token_acc": 0.7519294377067255, "train_speed(iter/s)": 0.303936 }, { "epoch": 0.04382621951219512, "grad_norm": 0.5548597574234009, "learning_rate": 9.998104427840392e-05, "loss": 0.6493963718414306, "memory(GiB)": 144.69, "step": 115, "token_acc": 0.7338171262699564, "train_speed(iter/s)": 0.304557 }, { "epoch": 0.04573170731707317, "grad_norm": 1.3648840188980103, "learning_rate": 9.997936023754257e-05, "loss": 0.6067759037017822, "memory(GiB)": 144.69, "step": 120, "token_acc": 0.7444350521273598, "train_speed(iter/s)": 0.304833 }, { "epoch": 0.04763719512195122, "grad_norm": 0.8255302309989929, "learning_rate": 9.997760455550218e-05, "loss": 0.6007946491241455, "memory(GiB)": 144.69, "step": 125, "token_acc": 0.7488651102464332, "train_speed(iter/s)": 0.305082 }, { "epoch": 0.04954268292682927, "grad_norm": 0.90826016664505, "learning_rate": 9.997577723479938e-05, "loss": 0.5391964912414551, "memory(GiB)": 144.69, "step": 130, "token_acc": 0.7569282136894825, "train_speed(iter/s)": 0.307213 }, { "epoch": 0.051448170731707314, "grad_norm": 0.7670543789863586, "learning_rate": 9.997387827805345e-05, "loss": 0.5742276668548584, "memory(GiB)": 144.69, "step": 135, "token_acc": 0.7615220987946112, "train_speed(iter/s)": 0.308648 }, { "epoch": 0.053353658536585365, "grad_norm": 1.1576682329177856, "learning_rate": 9.997190768798639e-05, "loss": 0.528875207901001, "memory(GiB)": 144.69, "step": 140, "token_acc": 0.778289221327196, "train_speed(iter/s)": 0.310545 }, { "epoch": 0.055259146341463415, "grad_norm": 0.6357249617576599, "learning_rate": 9.996986546742288e-05, "loss": 0.5997395515441895, "memory(GiB)": 144.69, "step": 145, "token_acc": 0.7545501820072803, "train_speed(iter/s)": 0.311112 }, { "epoch": 0.057164634146341466, "grad_norm": 1.2529560327529907, "learning_rate": 9.996775161929027e-05, "loss": 0.5837967395782471, "memory(GiB)": 144.69, "step": 150, "token_acc": 0.7550722522259524, "train_speed(iter/s)": 0.308966 }, { "epoch": 0.05907012195121951, "grad_norm": 0.8575006723403931, "learning_rate": 9.996556614661858e-05, "loss": 0.5776948928833008, "memory(GiB)": 144.69, "step": 155, "token_acc": 0.7643463497453311, "train_speed(iter/s)": 0.30699 }, { "epoch": 0.06097560975609756, "grad_norm": 1.1311726570129395, "learning_rate": 9.99633090525405e-05, "loss": 0.5516763687133789, "memory(GiB)": 144.69, "step": 160, "token_acc": 0.7639930755914599, "train_speed(iter/s)": 0.308398 }, { "epoch": 0.06288109756097561, "grad_norm": 1.0136122703552246, "learning_rate": 9.996098034029139e-05, "loss": 0.5646200180053711, "memory(GiB)": 144.69, "step": 165, "token_acc": 0.7690641247833622, "train_speed(iter/s)": 0.307206 }, { "epoch": 0.06478658536585366, "grad_norm": 0.7757333517074585, "learning_rate": 9.995858001320926e-05, "loss": 0.5743335247039795, "memory(GiB)": 144.69, "step": 170, "token_acc": 0.7662052296198638, "train_speed(iter/s)": 0.305379 }, { "epoch": 0.06669207317073171, "grad_norm": 2.381329298019409, "learning_rate": 9.995610807473475e-05, "loss": 0.49055495262146, "memory(GiB)": 144.69, "step": 175, "token_acc": 0.7945125510799766, "train_speed(iter/s)": 0.307077 }, { "epoch": 0.06859756097560976, "grad_norm": 1.3259377479553223, "learning_rate": 9.995356452841122e-05, "loss": 0.5513786792755127, "memory(GiB)": 144.69, "step": 180, "token_acc": 0.7623084447342681, "train_speed(iter/s)": 0.308351 }, { "epoch": 0.0705030487804878, "grad_norm": 0.5245610475540161, "learning_rate": 9.995094937788458e-05, "loss": 0.5207012176513672, "memory(GiB)": 153.57, "step": 185, "token_acc": 0.7682627378759975, "train_speed(iter/s)": 0.306603 }, { "epoch": 0.07240853658536585, "grad_norm": 0.8260420560836792, "learning_rate": 9.994826262690347e-05, "loss": 0.5617597579956055, "memory(GiB)": 153.57, "step": 190, "token_acc": 0.7690340909090909, "train_speed(iter/s)": 0.307287 }, { "epoch": 0.0743140243902439, "grad_norm": 0.9222752451896667, "learning_rate": 9.99455042793191e-05, "loss": 0.49406919479370115, "memory(GiB)": 153.57, "step": 195, "token_acc": 0.7964505613908004, "train_speed(iter/s)": 0.30861 }, { "epoch": 0.07621951219512195, "grad_norm": 0.7990961074829102, "learning_rate": 9.994267433908533e-05, "loss": 0.5644562721252442, "memory(GiB)": 153.57, "step": 200, "token_acc": 0.7681776133209991, "train_speed(iter/s)": 0.309466 }, { "epoch": 0.078125, "grad_norm": 0.7610871195793152, "learning_rate": 9.993977281025862e-05, "loss": 0.5237982749938965, "memory(GiB)": 153.57, "step": 205, "token_acc": 0.7726671953476077, "train_speed(iter/s)": 0.306487 }, { "epoch": 0.08003048780487805, "grad_norm": 2.034952163696289, "learning_rate": 9.99367996969981e-05, "loss": 0.517772626876831, "memory(GiB)": 153.57, "step": 210, "token_acc": 0.7790805307004011, "train_speed(iter/s)": 0.3041 }, { "epoch": 0.0819359756097561, "grad_norm": 0.7524840831756592, "learning_rate": 9.993375500356545e-05, "loss": 0.48874654769897463, "memory(GiB)": 153.57, "step": 215, "token_acc": 0.7960644007155635, "train_speed(iter/s)": 0.303232 }, { "epoch": 0.08384146341463415, "grad_norm": 0.701473593711853, "learning_rate": 9.9930638734325e-05, "loss": 0.5243041515350342, "memory(GiB)": 153.57, "step": 220, "token_acc": 0.7804634438673592, "train_speed(iter/s)": 0.302291 }, { "epoch": 0.0857469512195122, "grad_norm": 1.6834393739700317, "learning_rate": 9.992745089374364e-05, "loss": 0.4132716178894043, "memory(GiB)": 153.57, "step": 225, "token_acc": 0.8126286890871655, "train_speed(iter/s)": 0.302002 }, { "epoch": 0.08765243902439024, "grad_norm": 2.1764485836029053, "learning_rate": 9.992419148639087e-05, "loss": 0.4478772163391113, "memory(GiB)": 153.57, "step": 230, "token_acc": 0.803426939552594, "train_speed(iter/s)": 0.303355 }, { "epoch": 0.08955792682926829, "grad_norm": 1.1476976871490479, "learning_rate": 9.99208605169388e-05, "loss": 0.4870914459228516, "memory(GiB)": 153.57, "step": 235, "token_acc": 0.7763544168668268, "train_speed(iter/s)": 0.303923 }, { "epoch": 0.09146341463414634, "grad_norm": 0.8466851711273193, "learning_rate": 9.991745799016206e-05, "loss": 0.48582167625427247, "memory(GiB)": 153.57, "step": 240, "token_acc": 0.7961073318216175, "train_speed(iter/s)": 0.304253 }, { "epoch": 0.09336890243902439, "grad_norm": 1.5338777303695679, "learning_rate": 9.99139839109379e-05, "loss": 0.5131222248077393, "memory(GiB)": 153.57, "step": 245, "token_acc": 0.7808156755525176, "train_speed(iter/s)": 0.304442 }, { "epoch": 0.09527439024390244, "grad_norm": 1.489113450050354, "learning_rate": 9.991043828424612e-05, "loss": 0.5177230834960938, "memory(GiB)": 153.57, "step": 250, "token_acc": 0.781027998328458, "train_speed(iter/s)": 0.304976 }, { "epoch": 0.09717987804878049, "grad_norm": 1.2395884990692139, "learning_rate": 9.990682111516907e-05, "loss": 0.435179328918457, "memory(GiB)": 153.57, "step": 255, "token_acc": 0.7899662074343644, "train_speed(iter/s)": 0.306146 }, { "epoch": 0.09908536585365854, "grad_norm": 1.1467424631118774, "learning_rate": 9.990313240889167e-05, "loss": 0.4834613800048828, "memory(GiB)": 153.57, "step": 260, "token_acc": 0.7911464245175936, "train_speed(iter/s)": 0.306937 }, { "epoch": 0.10099085365853659, "grad_norm": 2.271202802658081, "learning_rate": 9.989937217070133e-05, "loss": 0.4522128105163574, "memory(GiB)": 153.57, "step": 265, "token_acc": 0.8074656188605108, "train_speed(iter/s)": 0.308034 }, { "epoch": 0.10289634146341463, "grad_norm": 1.7939646244049072, "learning_rate": 9.989554040598807e-05, "loss": 0.5214954376220703, "memory(GiB)": 153.57, "step": 270, "token_acc": 0.774122113255299, "train_speed(iter/s)": 0.308356 }, { "epoch": 0.10480182926829268, "grad_norm": 1.0914019346237183, "learning_rate": 9.989163712024439e-05, "loss": 0.46280508041381835, "memory(GiB)": 153.57, "step": 275, "token_acc": 0.8106642566651604, "train_speed(iter/s)": 0.30768 }, { "epoch": 0.10670731707317073, "grad_norm": 0.8718067407608032, "learning_rate": 9.988766231906533e-05, "loss": 0.510709285736084, "memory(GiB)": 153.57, "step": 280, "token_acc": 0.7829476248477466, "train_speed(iter/s)": 0.308121 }, { "epoch": 0.10861280487804878, "grad_norm": 1.9101378917694092, "learning_rate": 9.98836160081484e-05, "loss": 0.38759574890136717, "memory(GiB)": 153.57, "step": 285, "token_acc": 0.8107227488151659, "train_speed(iter/s)": 0.30612 }, { "epoch": 0.11051829268292683, "grad_norm": 1.4688981771469116, "learning_rate": 9.987949819329365e-05, "loss": 0.46673293113708497, "memory(GiB)": 153.57, "step": 290, "token_acc": 0.7875420005169295, "train_speed(iter/s)": 0.305227 }, { "epoch": 0.11242378048780488, "grad_norm": 1.7204208374023438, "learning_rate": 9.987530888040366e-05, "loss": 0.41396570205688477, "memory(GiB)": 153.57, "step": 295, "token_acc": 0.802451333813987, "train_speed(iter/s)": 0.305843 }, { "epoch": 0.11432926829268293, "grad_norm": 1.3491666316986084, "learning_rate": 9.98710480754834e-05, "loss": 0.5031266689300538, "memory(GiB)": 153.57, "step": 300, "token_acc": 0.7869555889294143, "train_speed(iter/s)": 0.306276 }, { "epoch": 0.11623475609756098, "grad_norm": 1.0393218994140625, "learning_rate": 9.986671578464042e-05, "loss": 0.41568660736083984, "memory(GiB)": 153.57, "step": 305, "token_acc": 0.7983990512896532, "train_speed(iter/s)": 0.306978 }, { "epoch": 0.11814024390243902, "grad_norm": 0.6684570908546448, "learning_rate": 9.986231201408467e-05, "loss": 0.49887924194335936, "memory(GiB)": 153.57, "step": 310, "token_acc": 0.7855191256830601, "train_speed(iter/s)": 0.307255 }, { "epoch": 0.12004573170731707, "grad_norm": 1.9641035795211792, "learning_rate": 9.985783677012857e-05, "loss": 0.4773710250854492, "memory(GiB)": 153.57, "step": 315, "token_acc": 0.7961414790996785, "train_speed(iter/s)": 0.308117 }, { "epoch": 0.12195121951219512, "grad_norm": 1.1398526430130005, "learning_rate": 9.985329005918702e-05, "loss": 0.539489459991455, "memory(GiB)": 153.57, "step": 320, "token_acc": 0.7856799615569438, "train_speed(iter/s)": 0.307063 }, { "epoch": 0.12385670731707317, "grad_norm": 1.4153881072998047, "learning_rate": 9.984867188777736e-05, "loss": 0.5129752159118652, "memory(GiB)": 153.57, "step": 325, "token_acc": 0.7842354333037563, "train_speed(iter/s)": 0.307287 }, { "epoch": 0.12576219512195122, "grad_norm": 0.6432765126228333, "learning_rate": 9.98439822625193e-05, "loss": 0.48185577392578127, "memory(GiB)": 153.57, "step": 330, "token_acc": 0.7848401105408607, "train_speed(iter/s)": 0.307758 }, { "epoch": 0.12766768292682926, "grad_norm": 0.47792312502861023, "learning_rate": 9.983922119013507e-05, "loss": 0.5080225944519043, "memory(GiB)": 153.57, "step": 335, "token_acc": 0.7840214637054702, "train_speed(iter/s)": 0.308034 }, { "epoch": 0.12957317073170732, "grad_norm": 1.2594367265701294, "learning_rate": 9.983438867744923e-05, "loss": 0.46195096969604493, "memory(GiB)": 153.57, "step": 340, "token_acc": 0.7945709281961472, "train_speed(iter/s)": 0.308343 }, { "epoch": 0.13147865853658536, "grad_norm": 1.1266683340072632, "learning_rate": 9.98294847313888e-05, "loss": 0.4289883613586426, "memory(GiB)": 153.57, "step": 345, "token_acc": 0.8058976020738821, "train_speed(iter/s)": 0.30899 }, { "epoch": 0.13338414634146342, "grad_norm": 3.524914026260376, "learning_rate": 9.982450935898316e-05, "loss": 0.38611726760864257, "memory(GiB)": 153.57, "step": 350, "token_acc": 0.8310904872389792, "train_speed(iter/s)": 0.309819 }, { "epoch": 0.13528963414634146, "grad_norm": 1.0945130586624146, "learning_rate": 9.981946256736409e-05, "loss": 0.4993405818939209, "memory(GiB)": 153.57, "step": 355, "token_acc": 0.8051716451181453, "train_speed(iter/s)": 0.310128 }, { "epoch": 0.13719512195121952, "grad_norm": 0.9205936789512634, "learning_rate": 9.981434436376572e-05, "loss": 0.4454987049102783, "memory(GiB)": 153.57, "step": 360, "token_acc": 0.8009535160905841, "train_speed(iter/s)": 0.310809 }, { "epoch": 0.13910060975609756, "grad_norm": 1.151134967803955, "learning_rate": 9.980915475552459e-05, "loss": 0.42605247497558596, "memory(GiB)": 153.57, "step": 365, "token_acc": 0.823049645390071, "train_speed(iter/s)": 0.310314 }, { "epoch": 0.1410060975609756, "grad_norm": 0.7890369296073914, "learning_rate": 9.980389375007955e-05, "loss": 0.40081005096435546, "memory(GiB)": 153.57, "step": 370, "token_acc": 0.8163841807909604, "train_speed(iter/s)": 0.310764 }, { "epoch": 0.14291158536585366, "grad_norm": 1.4265743494033813, "learning_rate": 9.979856135497179e-05, "loss": 0.4613976955413818, "memory(GiB)": 153.57, "step": 375, "token_acc": 0.7951704545454545, "train_speed(iter/s)": 0.31042 }, { "epoch": 0.1448170731707317, "grad_norm": 1.3474416732788086, "learning_rate": 9.979315757784488e-05, "loss": 0.4359407424926758, "memory(GiB)": 153.57, "step": 380, "token_acc": 0.8074885199576122, "train_speed(iter/s)": 0.310978 }, { "epoch": 0.14672256097560976, "grad_norm": 1.9620078802108765, "learning_rate": 9.978768242644466e-05, "loss": 0.4471263408660889, "memory(GiB)": 153.57, "step": 385, "token_acc": 0.8052341597796143, "train_speed(iter/s)": 0.311333 }, { "epoch": 0.1486280487804878, "grad_norm": 0.9120054841041565, "learning_rate": 9.97821359086193e-05, "loss": 0.47540459632873533, "memory(GiB)": 153.57, "step": 390, "token_acc": 0.8127255460588794, "train_speed(iter/s)": 0.311329 }, { "epoch": 0.15053353658536586, "grad_norm": 1.2256112098693848, "learning_rate": 9.977651803231925e-05, "loss": 0.44778943061828613, "memory(GiB)": 153.57, "step": 395, "token_acc": 0.8203125, "train_speed(iter/s)": 0.311961 }, { "epoch": 0.1524390243902439, "grad_norm": 0.8936465382575989, "learning_rate": 9.977082880559725e-05, "loss": 0.39266037940979004, "memory(GiB)": 153.57, "step": 400, "token_acc": 0.8015016424213984, "train_speed(iter/s)": 0.312057 }, { "epoch": 0.15434451219512196, "grad_norm": 1.4819918870925903, "learning_rate": 9.976506823660836e-05, "loss": 0.3923534631729126, "memory(GiB)": 153.57, "step": 405, "token_acc": 0.8365099534095722, "train_speed(iter/s)": 0.309743 }, { "epoch": 0.15625, "grad_norm": 1.4214024543762207, "learning_rate": 9.975923633360985e-05, "loss": 0.43950958251953126, "memory(GiB)": 153.57, "step": 410, "token_acc": 0.8033507278220269, "train_speed(iter/s)": 0.31005 }, { "epoch": 0.15815548780487804, "grad_norm": 1.3596444129943848, "learning_rate": 9.975333310496125e-05, "loss": 0.42273478507995604, "memory(GiB)": 153.57, "step": 415, "token_acc": 0.8087674714104193, "train_speed(iter/s)": 0.31054 }, { "epoch": 0.1600609756097561, "grad_norm": 1.1213244199752808, "learning_rate": 9.974735855912436e-05, "loss": 0.43076467514038086, "memory(GiB)": 153.57, "step": 420, "token_acc": 0.8133777671982861, "train_speed(iter/s)": 0.310586 }, { "epoch": 0.16196646341463414, "grad_norm": 1.4750664234161377, "learning_rate": 9.974131270466317e-05, "loss": 0.4879480838775635, "memory(GiB)": 153.57, "step": 425, "token_acc": 0.8067885117493473, "train_speed(iter/s)": 0.310836 }, { "epoch": 0.1638719512195122, "grad_norm": 0.7136356234550476, "learning_rate": 9.97351955502439e-05, "loss": 0.5007833957672119, "memory(GiB)": 153.57, "step": 430, "token_acc": 0.7970895675343308, "train_speed(iter/s)": 0.310795 }, { "epoch": 0.16577743902439024, "grad_norm": 1.3333110809326172, "learning_rate": 9.972900710463498e-05, "loss": 0.47020254135131834, "memory(GiB)": 153.57, "step": 435, "token_acc": 0.8029827315541601, "train_speed(iter/s)": 0.30911 }, { "epoch": 0.1676829268292683, "grad_norm": 0.9534010887145996, "learning_rate": 9.972274737670701e-05, "loss": 0.4713101863861084, "memory(GiB)": 153.57, "step": 440, "token_acc": 0.7960257159555816, "train_speed(iter/s)": 0.309153 }, { "epoch": 0.16958841463414634, "grad_norm": 1.1067560911178589, "learning_rate": 9.97164163754328e-05, "loss": 0.4506572723388672, "memory(GiB)": 153.57, "step": 445, "token_acc": 0.8136155022947476, "train_speed(iter/s)": 0.309638 }, { "epoch": 0.1714939024390244, "grad_norm": 1.3601276874542236, "learning_rate": 9.971001410988728e-05, "loss": 0.4192339420318604, "memory(GiB)": 153.57, "step": 450, "token_acc": 0.8178294573643411, "train_speed(iter/s)": 0.30906 }, { "epoch": 0.17339939024390244, "grad_norm": 1.3667134046554565, "learning_rate": 9.970354058924758e-05, "loss": 0.41399898529052737, "memory(GiB)": 153.57, "step": 455, "token_acc": 0.827444179492738, "train_speed(iter/s)": 0.308575 }, { "epoch": 0.17530487804878048, "grad_norm": 0.8167189359664917, "learning_rate": 9.969699582279292e-05, "loss": 0.45358867645263673, "memory(GiB)": 153.57, "step": 460, "token_acc": 0.8155536770921387, "train_speed(iter/s)": 0.308679 }, { "epoch": 0.17721036585365854, "grad_norm": 1.9072154760360718, "learning_rate": 9.969037981990468e-05, "loss": 0.4715822696685791, "memory(GiB)": 153.57, "step": 465, "token_acc": 0.8006810442678775, "train_speed(iter/s)": 0.308933 }, { "epoch": 0.17911585365853658, "grad_norm": 0.8909900188446045, "learning_rate": 9.968369259006634e-05, "loss": 0.46567497253417967, "memory(GiB)": 153.57, "step": 470, "token_acc": 0.8158076790038049, "train_speed(iter/s)": 0.308983 }, { "epoch": 0.18102134146341464, "grad_norm": 0.9646848440170288, "learning_rate": 9.967693414286347e-05, "loss": 0.46303772926330566, "memory(GiB)": 153.57, "step": 475, "token_acc": 0.8148549155478562, "train_speed(iter/s)": 0.309338 }, { "epoch": 0.18292682926829268, "grad_norm": 1.3290550708770752, "learning_rate": 9.967010448798375e-05, "loss": 0.3938359260559082, "memory(GiB)": 153.57, "step": 480, "token_acc": 0.8362627197039778, "train_speed(iter/s)": 0.309798 }, { "epoch": 0.18483231707317074, "grad_norm": 1.4168596267700195, "learning_rate": 9.966320363521691e-05, "loss": 0.41080598831176757, "memory(GiB)": 153.57, "step": 485, "token_acc": 0.8250447227191413, "train_speed(iter/s)": 0.310153 }, { "epoch": 0.18673780487804878, "grad_norm": 0.7716750502586365, "learning_rate": 9.965623159445471e-05, "loss": 0.4360342025756836, "memory(GiB)": 153.57, "step": 490, "token_acc": 0.8024718126626192, "train_speed(iter/s)": 0.309557 }, { "epoch": 0.18864329268292682, "grad_norm": 1.6948847770690918, "learning_rate": 9.964918837569099e-05, "loss": 0.34740898609161375, "memory(GiB)": 153.57, "step": 495, "token_acc": 0.8381171067738232, "train_speed(iter/s)": 0.310018 }, { "epoch": 0.19054878048780488, "grad_norm": 1.908328890800476, "learning_rate": 9.964207398902163e-05, "loss": 0.45706806182861326, "memory(GiB)": 153.57, "step": 500, "token_acc": 0.8060481317289424, "train_speed(iter/s)": 0.309991 }, { "epoch": 0.19245426829268292, "grad_norm": 1.374306321144104, "learning_rate": 9.963488844464447e-05, "loss": 0.3638502359390259, "memory(GiB)": 153.57, "step": 505, "token_acc": 0.8300480769230769, "train_speed(iter/s)": 0.310244 }, { "epoch": 0.19435975609756098, "grad_norm": 1.7781366109848022, "learning_rate": 9.96276317528594e-05, "loss": 0.4779669284820557, "memory(GiB)": 153.57, "step": 510, "token_acc": 0.8081134892981583, "train_speed(iter/s)": 0.310521 }, { "epoch": 0.19626524390243902, "grad_norm": 1.068763017654419, "learning_rate": 9.962030392406828e-05, "loss": 0.4296130180358887, "memory(GiB)": 153.57, "step": 515, "token_acc": 0.8282771535580524, "train_speed(iter/s)": 0.310562 }, { "epoch": 0.19817073170731708, "grad_norm": 1.3310242891311646, "learning_rate": 9.96129049687749e-05, "loss": 0.43368139266967776, "memory(GiB)": 153.57, "step": 520, "token_acc": 0.8151658767772512, "train_speed(iter/s)": 0.310946 }, { "epoch": 0.20007621951219512, "grad_norm": 0.9026315212249756, "learning_rate": 9.960543489758507e-05, "loss": 0.4287989616394043, "memory(GiB)": 153.57, "step": 525, "token_acc": 0.8304142340839589, "train_speed(iter/s)": 0.311337 }, { "epoch": 0.20198170731707318, "grad_norm": 1.421353816986084, "learning_rate": 9.959789372120649e-05, "loss": 0.34484403133392333, "memory(GiB)": 153.57, "step": 530, "token_acc": 0.8631006346328196, "train_speed(iter/s)": 0.311006 }, { "epoch": 0.20388719512195122, "grad_norm": 1.283668041229248, "learning_rate": 9.95902814504488e-05, "loss": 0.4666155815124512, "memory(GiB)": 153.57, "step": 535, "token_acc": 0.8160482789450156, "train_speed(iter/s)": 0.311127 }, { "epoch": 0.20579268292682926, "grad_norm": 2.3560991287231445, "learning_rate": 9.958259809622352e-05, "loss": 0.32131757736206057, "memory(GiB)": 153.57, "step": 540, "token_acc": 0.855686274509804, "train_speed(iter/s)": 0.311491 }, { "epoch": 0.20769817073170732, "grad_norm": 1.1170979738235474, "learning_rate": 9.957484366954413e-05, "loss": 0.44246749877929686, "memory(GiB)": 153.57, "step": 545, "token_acc": 0.8176144244105409, "train_speed(iter/s)": 0.311019 }, { "epoch": 0.20960365853658536, "grad_norm": 1.875386357307434, "learning_rate": 9.956701818152591e-05, "loss": 0.4013582706451416, "memory(GiB)": 153.57, "step": 550, "token_acc": 0.8396017699115044, "train_speed(iter/s)": 0.311178 }, { "epoch": 0.21150914634146342, "grad_norm": 1.0859744548797607, "learning_rate": 9.955912164338605e-05, "loss": 0.44288172721862795, "memory(GiB)": 153.57, "step": 555, "token_acc": 0.8239514348785872, "train_speed(iter/s)": 0.310684 }, { "epoch": 0.21341463414634146, "grad_norm": 1.5161244869232178, "learning_rate": 9.955115406644356e-05, "loss": 0.37770676612854004, "memory(GiB)": 153.57, "step": 560, "token_acc": 0.8376811594202899, "train_speed(iter/s)": 0.310971 }, { "epoch": 0.21532012195121952, "grad_norm": 1.484931468963623, "learning_rate": 9.954311546211931e-05, "loss": 0.45250706672668456, "memory(GiB)": 153.57, "step": 565, "token_acc": 0.8178203506235315, "train_speed(iter/s)": 0.311139 }, { "epoch": 0.21722560975609756, "grad_norm": 2.0869081020355225, "learning_rate": 9.953500584193592e-05, "loss": 0.4231415271759033, "memory(GiB)": 153.57, "step": 570, "token_acc": 0.8228689275893676, "train_speed(iter/s)": 0.31075 }, { "epoch": 0.2191310975609756, "grad_norm": 1.3073168992996216, "learning_rate": 9.952682521751788e-05, "loss": 0.3898351192474365, "memory(GiB)": 153.57, "step": 575, "token_acc": 0.8352674066599395, "train_speed(iter/s)": 0.311012 }, { "epoch": 0.22103658536585366, "grad_norm": 1.4631339311599731, "learning_rate": 9.95185736005914e-05, "loss": 0.34795660972595216, "memory(GiB)": 153.57, "step": 580, "token_acc": 0.8557710064635272, "train_speed(iter/s)": 0.311059 }, { "epoch": 0.2229420731707317, "grad_norm": 1.7442132234573364, "learning_rate": 9.951025100298448e-05, "loss": 0.35686917304992677, "memory(GiB)": 153.57, "step": 585, "token_acc": 0.8433300557194362, "train_speed(iter/s)": 0.310827 }, { "epoch": 0.22484756097560976, "grad_norm": 1.2351348400115967, "learning_rate": 9.950185743662685e-05, "loss": 0.4236060619354248, "memory(GiB)": 153.57, "step": 590, "token_acc": 0.8318731748018356, "train_speed(iter/s)": 0.309618 }, { "epoch": 0.2267530487804878, "grad_norm": 0.6756398677825928, "learning_rate": 9.949339291355e-05, "loss": 0.46345057487487795, "memory(GiB)": 153.57, "step": 595, "token_acc": 0.8076726342710997, "train_speed(iter/s)": 0.309051 }, { "epoch": 0.22865853658536586, "grad_norm": 1.2255867719650269, "learning_rate": 9.948485744588709e-05, "loss": 0.4371930122375488, "memory(GiB)": 153.57, "step": 600, "token_acc": 0.8255331753554502, "train_speed(iter/s)": 0.309273 }, { "epoch": 0.2305640243902439, "grad_norm": 5.449406623840332, "learning_rate": 9.9476251045873e-05, "loss": 0.32289276123046873, "memory(GiB)": 153.57, "step": 605, "token_acc": 0.8543247344461306, "train_speed(iter/s)": 0.309595 }, { "epoch": 0.23246951219512196, "grad_norm": 1.3812782764434814, "learning_rate": 9.946757372584423e-05, "loss": 0.34678101539611816, "memory(GiB)": 153.57, "step": 610, "token_acc": 0.8317467345583351, "train_speed(iter/s)": 0.30984 }, { "epoch": 0.234375, "grad_norm": 2.337050199508667, "learning_rate": 9.945882549823906e-05, "loss": 0.3779390096664429, "memory(GiB)": 153.57, "step": 615, "token_acc": 0.8173277661795407, "train_speed(iter/s)": 0.310215 }, { "epoch": 0.23628048780487804, "grad_norm": 1.5657837390899658, "learning_rate": 9.945000637559727e-05, "loss": 0.3572204351425171, "memory(GiB)": 153.57, "step": 620, "token_acc": 0.811469170261355, "train_speed(iter/s)": 0.310651 }, { "epoch": 0.2381859756097561, "grad_norm": 6.6037774085998535, "learning_rate": 9.944111637056035e-05, "loss": 0.29207763671875, "memory(GiB)": 153.57, "step": 625, "token_acc": 0.8626497005988024, "train_speed(iter/s)": 0.309734 }, { "epoch": 0.24009146341463414, "grad_norm": 2.991128444671631, "learning_rate": 9.943215549587138e-05, "loss": 0.2919323921203613, "memory(GiB)": 153.57, "step": 630, "token_acc": 0.8434617471513837, "train_speed(iter/s)": 0.310041 }, { "epoch": 0.2419969512195122, "grad_norm": 1.221647024154663, "learning_rate": 9.9423123764375e-05, "loss": 0.42578864097595215, "memory(GiB)": 153.57, "step": 635, "token_acc": 0.8268354430379747, "train_speed(iter/s)": 0.310205 }, { "epoch": 0.24390243902439024, "grad_norm": 1.2326760292053223, "learning_rate": 9.941402118901744e-05, "loss": 0.383359956741333, "memory(GiB)": 153.57, "step": 640, "token_acc": 0.8314436885865457, "train_speed(iter/s)": 0.310402 }, { "epoch": 0.2458079268292683, "grad_norm": 1.9157187938690186, "learning_rate": 9.940484778284645e-05, "loss": 0.3489060401916504, "memory(GiB)": 153.57, "step": 645, "token_acc": 0.8557730723132241, "train_speed(iter/s)": 0.310627 }, { "epoch": 0.24771341463414634, "grad_norm": 1.3391838073730469, "learning_rate": 9.939560355901136e-05, "loss": 0.3835107088088989, "memory(GiB)": 153.57, "step": 650, "token_acc": 0.8439497987212882, "train_speed(iter/s)": 0.310771 }, { "epoch": 0.2496189024390244, "grad_norm": 2.713625907897949, "learning_rate": 9.938628853076295e-05, "loss": 0.3594775676727295, "memory(GiB)": 153.57, "step": 655, "token_acc": 0.8260434056761269, "train_speed(iter/s)": 0.311076 }, { "epoch": 0.25152439024390244, "grad_norm": 1.2089886665344238, "learning_rate": 9.937690271145354e-05, "loss": 0.34580564498901367, "memory(GiB)": 153.57, "step": 660, "token_acc": 0.8448594674556213, "train_speed(iter/s)": 0.311177 }, { "epoch": 0.2534298780487805, "grad_norm": 1.0695185661315918, "learning_rate": 9.936744611453694e-05, "loss": 0.4314561367034912, "memory(GiB)": 153.57, "step": 665, "token_acc": 0.822770179156232, "train_speed(iter/s)": 0.311116 }, { "epoch": 0.2553353658536585, "grad_norm": 1.5431596040725708, "learning_rate": 9.935791875356832e-05, "loss": 0.3657357215881348, "memory(GiB)": 153.57, "step": 670, "token_acc": 0.8518620689655172, "train_speed(iter/s)": 0.310757 }, { "epoch": 0.2572408536585366, "grad_norm": 1.1308174133300781, "learning_rate": 9.934832064220438e-05, "loss": 0.3807420253753662, "memory(GiB)": 153.57, "step": 675, "token_acc": 0.8290273556231003, "train_speed(iter/s)": 0.311132 }, { "epoch": 0.25914634146341464, "grad_norm": 1.6009891033172607, "learning_rate": 9.933865179420321e-05, "loss": 0.3346934080123901, "memory(GiB)": 153.57, "step": 680, "token_acc": 0.8486238532110092, "train_speed(iter/s)": 0.311536 }, { "epoch": 0.2610518292682927, "grad_norm": 1.1436727046966553, "learning_rate": 9.932891222342425e-05, "loss": 0.3988161325454712, "memory(GiB)": 153.57, "step": 685, "token_acc": 0.8403465346534653, "train_speed(iter/s)": 0.310831 }, { "epoch": 0.2629573170731707, "grad_norm": 1.3102149963378906, "learning_rate": 9.931910194382837e-05, "loss": 0.3845363616943359, "memory(GiB)": 153.57, "step": 690, "token_acc": 0.8533511552415505, "train_speed(iter/s)": 0.310913 }, { "epoch": 0.2648628048780488, "grad_norm": 1.4337329864501953, "learning_rate": 9.930922096947776e-05, "loss": 0.35768885612487794, "memory(GiB)": 153.57, "step": 695, "token_acc": 0.8594049904030711, "train_speed(iter/s)": 0.310989 }, { "epoch": 0.26676829268292684, "grad_norm": 2.0538153648376465, "learning_rate": 9.929926931453599e-05, "loss": 0.4194492340087891, "memory(GiB)": 153.57, "step": 700, "token_acc": 0.827349737797499, "train_speed(iter/s)": 0.310499 }, { "epoch": 0.2686737804878049, "grad_norm": 1.389617681503296, "learning_rate": 9.92892469932679e-05, "loss": 0.3058375358581543, "memory(GiB)": 153.57, "step": 705, "token_acc": 0.8659734257654534, "train_speed(iter/s)": 0.310801 }, { "epoch": 0.2705792682926829, "grad_norm": 1.3852825164794922, "learning_rate": 9.927915402003964e-05, "loss": 0.3847295522689819, "memory(GiB)": 153.57, "step": 710, "token_acc": 0.8488352027610009, "train_speed(iter/s)": 0.310897 }, { "epoch": 0.27248475609756095, "grad_norm": 2.2490694522857666, "learning_rate": 9.92689904093186e-05, "loss": 0.3394075155258179, "memory(GiB)": 153.57, "step": 715, "token_acc": 0.8628652214891611, "train_speed(iter/s)": 0.31112 }, { "epoch": 0.27439024390243905, "grad_norm": 0.8888046741485596, "learning_rate": 9.92587561756735e-05, "loss": 0.3484663963317871, "memory(GiB)": 153.57, "step": 720, "token_acc": 0.833295938972403, "train_speed(iter/s)": 0.310874 }, { "epoch": 0.2762957317073171, "grad_norm": 0.8579233884811401, "learning_rate": 9.924845133377422e-05, "loss": 0.3932963848114014, "memory(GiB)": 153.57, "step": 725, "token_acc": 0.8470131885182312, "train_speed(iter/s)": 0.310909 }, { "epoch": 0.2782012195121951, "grad_norm": 1.3329341411590576, "learning_rate": 9.92380758983919e-05, "loss": 0.35467479228973386, "memory(GiB)": 153.57, "step": 730, "token_acc": 0.8615515771526002, "train_speed(iter/s)": 0.310977 }, { "epoch": 0.28010670731707316, "grad_norm": 1.3868855237960815, "learning_rate": 9.922762988439883e-05, "loss": 0.40759963989257814, "memory(GiB)": 153.57, "step": 735, "token_acc": 0.8302419354838709, "train_speed(iter/s)": 0.311105 }, { "epoch": 0.2820121951219512, "grad_norm": 1.6492371559143066, "learning_rate": 9.921711330676848e-05, "loss": 0.3404561519622803, "memory(GiB)": 153.57, "step": 740, "token_acc": 0.8474804634869307, "train_speed(iter/s)": 0.311274 }, { "epoch": 0.2839176829268293, "grad_norm": 1.4900654554367065, "learning_rate": 9.92065261805755e-05, "loss": 0.4260540962219238, "memory(GiB)": 153.57, "step": 745, "token_acc": 0.8333060422466022, "train_speed(iter/s)": 0.311285 }, { "epoch": 0.2858231707317073, "grad_norm": 2.670952320098877, "learning_rate": 9.919586852099562e-05, "loss": 0.35645380020141604, "memory(GiB)": 153.57, "step": 750, "token_acc": 0.8512443438914027, "train_speed(iter/s)": 0.310947 }, { "epoch": 0.28772865853658536, "grad_norm": 1.3680213689804077, "learning_rate": 9.918514034330568e-05, "loss": 0.36694512367248533, "memory(GiB)": 153.57, "step": 755, "token_acc": 0.8536126779478915, "train_speed(iter/s)": 0.310325 }, { "epoch": 0.2896341463414634, "grad_norm": 1.4212994575500488, "learning_rate": 9.917434166288364e-05, "loss": 0.332599401473999, "memory(GiB)": 153.57, "step": 760, "token_acc": 0.856, "train_speed(iter/s)": 0.310461 }, { "epoch": 0.2915396341463415, "grad_norm": 2.3020665645599365, "learning_rate": 9.916347249520849e-05, "loss": 0.2964786529541016, "memory(GiB)": 153.57, "step": 765, "token_acc": 0.8662068965517241, "train_speed(iter/s)": 0.310327 }, { "epoch": 0.2934451219512195, "grad_norm": 1.3500005006790161, "learning_rate": 9.915253285586024e-05, "loss": 0.36644272804260253, "memory(GiB)": 153.57, "step": 770, "token_acc": 0.8448098663926003, "train_speed(iter/s)": 0.310439 }, { "epoch": 0.29535060975609756, "grad_norm": 1.093353033065796, "learning_rate": 9.914152276051995e-05, "loss": 0.3281266689300537, "memory(GiB)": 153.57, "step": 775, "token_acc": 0.852264291017075, "train_speed(iter/s)": 0.310624 }, { "epoch": 0.2972560975609756, "grad_norm": 1.6583073139190674, "learning_rate": 9.913044222496966e-05, "loss": 0.37622694969177245, "memory(GiB)": 153.57, "step": 780, "token_acc": 0.8491657397107898, "train_speed(iter/s)": 0.310731 }, { "epoch": 0.29916158536585363, "grad_norm": 1.8219716548919678, "learning_rate": 9.911929126509238e-05, "loss": 0.2646802425384521, "memory(GiB)": 153.57, "step": 785, "token_acc": 0.8403225806451613, "train_speed(iter/s)": 0.311145 }, { "epoch": 0.3010670731707317, "grad_norm": 1.1776328086853027, "learning_rate": 9.910806989687206e-05, "loss": 0.35205371379852296, "memory(GiB)": 153.57, "step": 790, "token_acc": 0.8572727272727273, "train_speed(iter/s)": 0.311311 }, { "epoch": 0.30297256097560976, "grad_norm": 2.1954917907714844, "learning_rate": 9.909677813639359e-05, "loss": 0.3720370054244995, "memory(GiB)": 153.57, "step": 795, "token_acc": 0.844718309859155, "train_speed(iter/s)": 0.311637 }, { "epoch": 0.3048780487804878, "grad_norm": 6.0450263023376465, "learning_rate": 9.908541599984276e-05, "loss": 0.3368817329406738, "memory(GiB)": 153.57, "step": 800, "token_acc": 0.8467432950191571, "train_speed(iter/s)": 0.311811 }, { "epoch": 0.3048780487804878, "eval_loss": 0.28021880984306335, "eval_runtime": 33.446, "eval_samples_per_second": 3.169, "eval_steps_per_second": 3.169, "eval_token_acc": 0.8490319336183052, "step": 800 }, { "epoch": 0.30678353658536583, "grad_norm": 1.5186457633972168, "learning_rate": 9.90739835035062e-05, "loss": 0.30771028995513916, "memory(GiB)": 153.57, "step": 805, "token_acc": 0.8529958403398531, "train_speed(iter/s)": 0.308129 }, { "epoch": 0.3086890243902439, "grad_norm": 2.0427680015563965, "learning_rate": 9.906248066377143e-05, "loss": 0.39708528518676756, "memory(GiB)": 153.57, "step": 810, "token_acc": 0.8487591240875912, "train_speed(iter/s)": 0.308503 }, { "epoch": 0.31059451219512196, "grad_norm": 1.0382192134857178, "learning_rate": 9.905090749712684e-05, "loss": 0.31704351902008054, "memory(GiB)": 153.57, "step": 815, "token_acc": 0.8469642362073746, "train_speed(iter/s)": 0.308817 }, { "epoch": 0.3125, "grad_norm": 2.6036629676818848, "learning_rate": 9.903926402016153e-05, "loss": 0.2888906478881836, "memory(GiB)": 153.57, "step": 820, "token_acc": 0.8590357011409643, "train_speed(iter/s)": 0.30917 }, { "epoch": 0.31440548780487804, "grad_norm": 1.473454236984253, "learning_rate": 9.902755024956547e-05, "loss": 0.28834679126739504, "memory(GiB)": 153.57, "step": 825, "token_acc": 0.8703612073231074, "train_speed(iter/s)": 0.309341 }, { "epoch": 0.3163109756097561, "grad_norm": 1.2300288677215576, "learning_rate": 9.901576620212933e-05, "loss": 0.28536596298217776, "memory(GiB)": 153.57, "step": 830, "token_acc": 0.8602941176470589, "train_speed(iter/s)": 0.30966 }, { "epoch": 0.31821646341463417, "grad_norm": 1.5768479108810425, "learning_rate": 9.900391189474458e-05, "loss": 0.3324174642562866, "memory(GiB)": 153.57, "step": 835, "token_acc": 0.8645544554455445, "train_speed(iter/s)": 0.308902 }, { "epoch": 0.3201219512195122, "grad_norm": 1.060900092124939, "learning_rate": 9.899198734440335e-05, "loss": 0.34772191047668455, "memory(GiB)": 153.57, "step": 840, "token_acc": 0.8451741293532339, "train_speed(iter/s)": 0.309028 }, { "epoch": 0.32202743902439024, "grad_norm": 1.4851446151733398, "learning_rate": 9.897999256819845e-05, "loss": 0.39832630157470705, "memory(GiB)": 153.57, "step": 845, "token_acc": 0.8331556503198294, "train_speed(iter/s)": 0.308734 }, { "epoch": 0.3239329268292683, "grad_norm": 1.7241132259368896, "learning_rate": 9.896792758332341e-05, "loss": 0.31618561744689944, "memory(GiB)": 153.57, "step": 850, "token_acc": 0.8529128550794415, "train_speed(iter/s)": 0.309028 }, { "epoch": 0.32583841463414637, "grad_norm": 1.3019485473632812, "learning_rate": 9.895579240707236e-05, "loss": 0.3674612522125244, "memory(GiB)": 153.57, "step": 855, "token_acc": 0.8548450372695174, "train_speed(iter/s)": 0.30919 }, { "epoch": 0.3277439024390244, "grad_norm": 0.9063626527786255, "learning_rate": 9.894358705684002e-05, "loss": 0.41171698570251464, "memory(GiB)": 153.57, "step": 860, "token_acc": 0.8324324324324325, "train_speed(iter/s)": 0.308891 }, { "epoch": 0.32964939024390244, "grad_norm": 0.9362164735794067, "learning_rate": 9.893131155012175e-05, "loss": 0.35434119701385497, "memory(GiB)": 153.57, "step": 865, "token_acc": 0.8395114634668952, "train_speed(iter/s)": 0.309071 }, { "epoch": 0.3315548780487805, "grad_norm": 1.6629903316497803, "learning_rate": 9.891896590451344e-05, "loss": 0.3100865364074707, "memory(GiB)": 153.57, "step": 870, "token_acc": 0.8783990808119494, "train_speed(iter/s)": 0.309179 }, { "epoch": 0.3334603658536585, "grad_norm": 1.1553665399551392, "learning_rate": 9.890655013771153e-05, "loss": 0.38008410930633546, "memory(GiB)": 153.57, "step": 875, "token_acc": 0.8563350415267715, "train_speed(iter/s)": 0.309167 }, { "epoch": 0.3353658536585366, "grad_norm": 1.9387842416763306, "learning_rate": 9.889406426751296e-05, "loss": 0.3831655025482178, "memory(GiB)": 153.57, "step": 880, "token_acc": 0.8185573562122657, "train_speed(iter/s)": 0.309346 }, { "epoch": 0.33727134146341464, "grad_norm": 1.531265139579773, "learning_rate": 9.888150831181518e-05, "loss": 0.3262618541717529, "memory(GiB)": 153.57, "step": 885, "token_acc": 0.8583121648320632, "train_speed(iter/s)": 0.309558 }, { "epoch": 0.3391768292682927, "grad_norm": 1.1302748918533325, "learning_rate": 9.886888228861608e-05, "loss": 0.33860507011413576, "memory(GiB)": 153.57, "step": 890, "token_acc": 0.856843462840561, "train_speed(iter/s)": 0.30947 }, { "epoch": 0.3410823170731707, "grad_norm": 2.573133707046509, "learning_rate": 9.885618621601402e-05, "loss": 0.31659350395202634, "memory(GiB)": 153.57, "step": 895, "token_acc": 0.8636363636363636, "train_speed(iter/s)": 0.309732 }, { "epoch": 0.3429878048780488, "grad_norm": 1.1128087043762207, "learning_rate": 9.88434201122077e-05, "loss": 0.35303335189819335, "memory(GiB)": 153.57, "step": 900, "token_acc": 0.8465239355304306, "train_speed(iter/s)": 0.309551 }, { "epoch": 0.34489329268292684, "grad_norm": 1.0344233512878418, "learning_rate": 9.883058399549629e-05, "loss": 0.3345011234283447, "memory(GiB)": 153.57, "step": 905, "token_acc": 0.8483021263091082, "train_speed(iter/s)": 0.309572 }, { "epoch": 0.3467987804878049, "grad_norm": 3.962538003921509, "learning_rate": 9.881767788427925e-05, "loss": 0.291802191734314, "memory(GiB)": 153.57, "step": 910, "token_acc": 0.8690080683436165, "train_speed(iter/s)": 0.309496 }, { "epoch": 0.3487042682926829, "grad_norm": 1.197187900543213, "learning_rate": 9.880470179705639e-05, "loss": 0.28061885833740235, "memory(GiB)": 153.57, "step": 915, "token_acc": 0.8718108276291225, "train_speed(iter/s)": 0.309767 }, { "epoch": 0.35060975609756095, "grad_norm": 1.3509570360183716, "learning_rate": 9.879165575242787e-05, "loss": 0.3853476047515869, "memory(GiB)": 153.57, "step": 920, "token_acc": 0.8448235569058191, "train_speed(iter/s)": 0.309946 }, { "epoch": 0.35251524390243905, "grad_norm": 1.469258427619934, "learning_rate": 9.877853976909405e-05, "loss": 0.306730318069458, "memory(GiB)": 153.57, "step": 925, "token_acc": 0.8617731172545281, "train_speed(iter/s)": 0.310267 }, { "epoch": 0.3544207317073171, "grad_norm": 1.4270503520965576, "learning_rate": 9.876535386585561e-05, "loss": 0.3397212028503418, "memory(GiB)": 153.57, "step": 930, "token_acc": 0.87, "train_speed(iter/s)": 0.310289 }, { "epoch": 0.3563262195121951, "grad_norm": 1.4659042358398438, "learning_rate": 9.87520980616134e-05, "loss": 0.3211731672286987, "memory(GiB)": 153.57, "step": 935, "token_acc": 0.8738187882156754, "train_speed(iter/s)": 0.310437 }, { "epoch": 0.35823170731707316, "grad_norm": 1.0458544492721558, "learning_rate": 9.873877237536853e-05, "loss": 0.39742674827575686, "memory(GiB)": 153.57, "step": 940, "token_acc": 0.8440026507620941, "train_speed(iter/s)": 0.309946 }, { "epoch": 0.3601371951219512, "grad_norm": 1.86162269115448, "learning_rate": 9.87253768262222e-05, "loss": 0.29307739734649657, "memory(GiB)": 153.57, "step": 945, "token_acc": 0.857736240913811, "train_speed(iter/s)": 0.310237 }, { "epoch": 0.3620426829268293, "grad_norm": 1.8918557167053223, "learning_rate": 9.871191143337582e-05, "loss": 0.2750221252441406, "memory(GiB)": 153.57, "step": 950, "token_acc": 0.8745574102175012, "train_speed(iter/s)": 0.310335 }, { "epoch": 0.3639481707317073, "grad_norm": 0.9023714661598206, "learning_rate": 9.869837621613088e-05, "loss": 0.2827669620513916, "memory(GiB)": 153.57, "step": 955, "token_acc": 0.8716334382651276, "train_speed(iter/s)": 0.310579 }, { "epoch": 0.36585365853658536, "grad_norm": 1.7347506284713745, "learning_rate": 9.868477119388896e-05, "loss": 0.27705333232879636, "memory(GiB)": 153.57, "step": 960, "token_acc": 0.8773792093704246, "train_speed(iter/s)": 0.310776 }, { "epoch": 0.3677591463414634, "grad_norm": 0.9929677248001099, "learning_rate": 9.867109638615172e-05, "loss": 0.3268923282623291, "memory(GiB)": 153.57, "step": 965, "token_acc": 0.8566456384598651, "train_speed(iter/s)": 0.310434 }, { "epoch": 0.3696646341463415, "grad_norm": 1.3946431875228882, "learning_rate": 9.865735181252085e-05, "loss": 0.3901082515716553, "memory(GiB)": 153.57, "step": 970, "token_acc": 0.8454545454545455, "train_speed(iter/s)": 0.310524 }, { "epoch": 0.3715701219512195, "grad_norm": 2.392336845397949, "learning_rate": 9.8643537492698e-05, "loss": 0.2495579719543457, "memory(GiB)": 153.57, "step": 975, "token_acc": 0.8920968212306795, "train_speed(iter/s)": 0.310709 }, { "epoch": 0.37347560975609756, "grad_norm": 1.43564772605896, "learning_rate": 9.862965344648485e-05, "loss": 0.40538721084594725, "memory(GiB)": 153.57, "step": 980, "token_acc": 0.8362467349809122, "train_speed(iter/s)": 0.310776 }, { "epoch": 0.3753810975609756, "grad_norm": 1.4556761980056763, "learning_rate": 9.8615699693783e-05, "loss": 0.36866064071655275, "memory(GiB)": 153.57, "step": 985, "token_acc": 0.8581974776991694, "train_speed(iter/s)": 0.31099 }, { "epoch": 0.37728658536585363, "grad_norm": 1.5627130270004272, "learning_rate": 9.860167625459398e-05, "loss": 0.36900322437286376, "memory(GiB)": 153.57, "step": 990, "token_acc": 0.8496594226402855, "train_speed(iter/s)": 0.311029 }, { "epoch": 0.3791920731707317, "grad_norm": 0.8968172669410706, "learning_rate": 9.858758314901918e-05, "loss": 0.3868812084197998, "memory(GiB)": 153.57, "step": 995, "token_acc": 0.8518386924853437, "train_speed(iter/s)": 0.311 }, { "epoch": 0.38109756097560976, "grad_norm": 1.6610108613967896, "learning_rate": 9.85734203972599e-05, "loss": 0.24545371532440186, "memory(GiB)": 153.57, "step": 1000, "token_acc": 0.8892431972789115, "train_speed(iter/s)": 0.311152 }, { "epoch": 0.3830030487804878, "grad_norm": 1.901943564414978, "learning_rate": 9.855918801961721e-05, "loss": 0.3306408405303955, "memory(GiB)": 153.57, "step": 1005, "token_acc": 0.8614425162689805, "train_speed(iter/s)": 0.311274 }, { "epoch": 0.38490853658536583, "grad_norm": 1.204458236694336, "learning_rate": 9.854488603649206e-05, "loss": 0.32497172355651854, "memory(GiB)": 153.57, "step": 1010, "token_acc": 0.8586998087954111, "train_speed(iter/s)": 0.311387 }, { "epoch": 0.3868140243902439, "grad_norm": 1.1066267490386963, "learning_rate": 9.853051446838511e-05, "loss": 0.3373617649078369, "memory(GiB)": 153.57, "step": 1015, "token_acc": 0.8752805386341777, "train_speed(iter/s)": 0.311434 }, { "epoch": 0.38871951219512196, "grad_norm": 0.9805747270584106, "learning_rate": 9.851607333589677e-05, "loss": 0.3690987825393677, "memory(GiB)": 153.57, "step": 1020, "token_acc": 0.8323299888517279, "train_speed(iter/s)": 0.311621 }, { "epoch": 0.390625, "grad_norm": 1.253299593925476, "learning_rate": 9.850156265972721e-05, "loss": 0.2591085910797119, "memory(GiB)": 153.57, "step": 1025, "token_acc": 0.8738562091503268, "train_speed(iter/s)": 0.31164 }, { "epoch": 0.39253048780487804, "grad_norm": 1.443713665008545, "learning_rate": 9.848698246067623e-05, "loss": 0.31962785720825193, "memory(GiB)": 153.57, "step": 1030, "token_acc": 0.867515923566879, "train_speed(iter/s)": 0.311642 }, { "epoch": 0.3944359756097561, "grad_norm": 1.316129207611084, "learning_rate": 9.847233275964333e-05, "loss": 0.3436516284942627, "memory(GiB)": 153.57, "step": 1035, "token_acc": 0.8703322173577132, "train_speed(iter/s)": 0.311465 }, { "epoch": 0.39634146341463417, "grad_norm": 1.298034906387329, "learning_rate": 9.84576135776276e-05, "loss": 0.3288773536682129, "memory(GiB)": 153.57, "step": 1040, "token_acc": 0.8622967479674797, "train_speed(iter/s)": 0.311522 }, { "epoch": 0.3982469512195122, "grad_norm": 1.0872814655303955, "learning_rate": 9.844282493572774e-05, "loss": 0.35437164306640623, "memory(GiB)": 153.57, "step": 1045, "token_acc": 0.8520345252774353, "train_speed(iter/s)": 0.31164 }, { "epoch": 0.40015243902439024, "grad_norm": 0.9167392253875732, "learning_rate": 9.842796685514203e-05, "loss": 0.36513590812683105, "memory(GiB)": 153.57, "step": 1050, "token_acc": 0.8604350635365066, "train_speed(iter/s)": 0.311683 }, { "epoch": 0.4020579268292683, "grad_norm": 1.6010055541992188, "learning_rate": 9.841303935716826e-05, "loss": 0.34415593147277834, "memory(GiB)": 153.57, "step": 1055, "token_acc": 0.8630756578947368, "train_speed(iter/s)": 0.3118 }, { "epoch": 0.40396341463414637, "grad_norm": 1.3998205661773682, "learning_rate": 9.839804246320375e-05, "loss": 0.33754730224609375, "memory(GiB)": 153.57, "step": 1060, "token_acc": 0.8633307271305708, "train_speed(iter/s)": 0.31181 }, { "epoch": 0.4058689024390244, "grad_norm": 2.1931052207946777, "learning_rate": 9.838297619474526e-05, "loss": 0.3488780975341797, "memory(GiB)": 153.57, "step": 1065, "token_acc": 0.843069306930693, "train_speed(iter/s)": 0.311909 }, { "epoch": 0.40777439024390244, "grad_norm": 1.3281787633895874, "learning_rate": 9.836784057338899e-05, "loss": 0.3606546401977539, "memory(GiB)": 153.57, "step": 1070, "token_acc": 0.8530933633295839, "train_speed(iter/s)": 0.312052 }, { "epoch": 0.4096798780487805, "grad_norm": 1.4155291318893433, "learning_rate": 9.835263562083061e-05, "loss": 0.35638353824615476, "memory(GiB)": 153.57, "step": 1075, "token_acc": 0.8624056813138038, "train_speed(iter/s)": 0.312364 }, { "epoch": 0.4115853658536585, "grad_norm": 1.2022937536239624, "learning_rate": 9.833736135886512e-05, "loss": 0.3150071620941162, "memory(GiB)": 153.57, "step": 1080, "token_acc": 0.8564023804952966, "train_speed(iter/s)": 0.312173 }, { "epoch": 0.4134908536585366, "grad_norm": 1.4364752769470215, "learning_rate": 9.832201780938688e-05, "loss": 0.2488001585006714, "memory(GiB)": 153.57, "step": 1085, "token_acc": 0.894076840981857, "train_speed(iter/s)": 0.31232 }, { "epoch": 0.41539634146341464, "grad_norm": 1.3903812170028687, "learning_rate": 9.830660499438955e-05, "loss": 0.2756960868835449, "memory(GiB)": 153.57, "step": 1090, "token_acc": 0.8941259233204362, "train_speed(iter/s)": 0.312562 }, { "epoch": 0.4173018292682927, "grad_norm": 1.2424163818359375, "learning_rate": 9.82911229359661e-05, "loss": 0.2990188837051392, "memory(GiB)": 153.57, "step": 1095, "token_acc": 0.8817172897196262, "train_speed(iter/s)": 0.311757 }, { "epoch": 0.4192073170731707, "grad_norm": 1.3850867748260498, "learning_rate": 9.827557165630879e-05, "loss": 0.32658982276916504, "memory(GiB)": 153.57, "step": 1100, "token_acc": 0.85675430643699, "train_speed(iter/s)": 0.311916 }, { "epoch": 0.4211128048780488, "grad_norm": 0.9169235825538635, "learning_rate": 9.825995117770901e-05, "loss": 0.368768048286438, "memory(GiB)": 153.57, "step": 1105, "token_acc": 0.8560477001703578, "train_speed(iter/s)": 0.311934 }, { "epoch": 0.42301829268292684, "grad_norm": 1.4543040990829468, "learning_rate": 9.824426152255741e-05, "loss": 0.31297595500946046, "memory(GiB)": 153.57, "step": 1110, "token_acc": 0.8821344140493077, "train_speed(iter/s)": 0.312158 }, { "epoch": 0.4249237804878049, "grad_norm": 1.5258914232254028, "learning_rate": 9.822850271334378e-05, "loss": 0.3574199676513672, "memory(GiB)": 153.57, "step": 1115, "token_acc": 0.861822840409956, "train_speed(iter/s)": 0.312177 }, { "epoch": 0.4268292682926829, "grad_norm": 1.3291302919387817, "learning_rate": 9.821267477265705e-05, "loss": 0.35240879058837893, "memory(GiB)": 153.57, "step": 1120, "token_acc": 0.8610868425650883, "train_speed(iter/s)": 0.312125 }, { "epoch": 0.42873475609756095, "grad_norm": 1.3770475387573242, "learning_rate": 9.819677772318523e-05, "loss": 0.33522543907165525, "memory(GiB)": 153.57, "step": 1125, "token_acc": 0.8598810310641111, "train_speed(iter/s)": 0.312262 }, { "epoch": 0.43064024390243905, "grad_norm": 1.645121693611145, "learning_rate": 9.818081158771538e-05, "loss": 0.22432076930999756, "memory(GiB)": 153.57, "step": 1130, "token_acc": 0.9199806013579049, "train_speed(iter/s)": 0.312083 }, { "epoch": 0.4325457317073171, "grad_norm": 2.638260841369629, "learning_rate": 9.816477638913361e-05, "loss": 0.36161563396453855, "memory(GiB)": 153.57, "step": 1135, "token_acc": 0.8394062078272605, "train_speed(iter/s)": 0.311847 }, { "epoch": 0.4344512195121951, "grad_norm": 1.216697335243225, "learning_rate": 9.814867215042502e-05, "loss": 0.3425921440124512, "memory(GiB)": 153.57, "step": 1140, "token_acc": 0.8568592057761732, "train_speed(iter/s)": 0.311881 }, { "epoch": 0.43635670731707316, "grad_norm": 1.3395960330963135, "learning_rate": 9.813249889467369e-05, "loss": 0.28065948486328124, "memory(GiB)": 153.57, "step": 1145, "token_acc": 0.8610917941585535, "train_speed(iter/s)": 0.311955 }, { "epoch": 0.4382621951219512, "grad_norm": 1.4687285423278809, "learning_rate": 9.811625664506259e-05, "loss": 0.29861674308776853, "memory(GiB)": 153.57, "step": 1150, "token_acc": 0.8702989392478303, "train_speed(iter/s)": 0.312017 }, { "epoch": 0.4401676829268293, "grad_norm": 1.116400957107544, "learning_rate": 9.80999454248736e-05, "loss": 0.37688274383544923, "memory(GiB)": 153.57, "step": 1155, "token_acc": 0.8446058091286307, "train_speed(iter/s)": 0.312031 }, { "epoch": 0.4420731707317073, "grad_norm": 1.0447231531143188, "learning_rate": 9.808356525748748e-05, "loss": 0.3956377267837524, "memory(GiB)": 153.57, "step": 1160, "token_acc": 0.8456303724928367, "train_speed(iter/s)": 0.312089 }, { "epoch": 0.44397865853658536, "grad_norm": 1.3014211654663086, "learning_rate": 9.806711616638383e-05, "loss": 0.3671111106872559, "memory(GiB)": 153.57, "step": 1165, "token_acc": 0.8374970814849405, "train_speed(iter/s)": 0.312221 }, { "epoch": 0.4458841463414634, "grad_norm": 1.5234813690185547, "learning_rate": 9.805059817514101e-05, "loss": 0.32637155055999756, "memory(GiB)": 153.57, "step": 1170, "token_acc": 0.8598603419215025, "train_speed(iter/s)": 0.312085 }, { "epoch": 0.4477896341463415, "grad_norm": 2.713848114013672, "learning_rate": 9.803401130743617e-05, "loss": 0.32610931396484377, "memory(GiB)": 153.57, "step": 1175, "token_acc": 0.8436363636363636, "train_speed(iter/s)": 0.312332 }, { "epoch": 0.4496951219512195, "grad_norm": 1.0739327669143677, "learning_rate": 9.801735558704517e-05, "loss": 0.3011434078216553, "memory(GiB)": 153.57, "step": 1180, "token_acc": 0.8838643371017472, "train_speed(iter/s)": 0.312483 }, { "epoch": 0.45160060975609756, "grad_norm": 1.8548461198806763, "learning_rate": 9.80006310378426e-05, "loss": 0.2648619174957275, "memory(GiB)": 153.57, "step": 1185, "token_acc": 0.8633206363432668, "train_speed(iter/s)": 0.312539 }, { "epoch": 0.4535060975609756, "grad_norm": 0.606468915939331, "learning_rate": 9.798383768380164e-05, "loss": 0.24656496047973633, "memory(GiB)": 153.57, "step": 1190, "token_acc": 0.8996763754045307, "train_speed(iter/s)": 0.311612 }, { "epoch": 0.45541158536585363, "grad_norm": 1.2962793111801147, "learning_rate": 9.796697554899418e-05, "loss": 0.31072368621826174, "memory(GiB)": 153.57, "step": 1195, "token_acc": 0.8705941591137966, "train_speed(iter/s)": 0.31128 }, { "epoch": 0.4573170731707317, "grad_norm": 1.4565011262893677, "learning_rate": 9.795004465759065e-05, "loss": 0.3086632966995239, "memory(GiB)": 153.57, "step": 1200, "token_acc": 0.8669778296382731, "train_speed(iter/s)": 0.311294 }, { "epoch": 0.45922256097560976, "grad_norm": 1.155051827430725, "learning_rate": 9.793304503386005e-05, "loss": 0.3719020366668701, "memory(GiB)": 153.57, "step": 1205, "token_acc": 0.8519659358579453, "train_speed(iter/s)": 0.311375 }, { "epoch": 0.4611280487804878, "grad_norm": 3.1130599975585938, "learning_rate": 9.791597670216989e-05, "loss": 0.27948651313781736, "memory(GiB)": 153.57, "step": 1210, "token_acc": 0.8631334760885082, "train_speed(iter/s)": 0.311454 }, { "epoch": 0.46303353658536583, "grad_norm": 1.7017649412155151, "learning_rate": 9.789883968698619e-05, "loss": 0.3175970077514648, "memory(GiB)": 153.57, "step": 1215, "token_acc": 0.8494045332308875, "train_speed(iter/s)": 0.311609 }, { "epoch": 0.4649390243902439, "grad_norm": 1.2926058769226074, "learning_rate": 9.78816340128734e-05, "loss": 0.32635815143585206, "memory(GiB)": 153.57, "step": 1220, "token_acc": 0.8631383785605933, "train_speed(iter/s)": 0.311623 }, { "epoch": 0.46684451219512196, "grad_norm": 1.2498908042907715, "learning_rate": 9.786435970449439e-05, "loss": 0.24762344360351562, "memory(GiB)": 153.57, "step": 1225, "token_acc": 0.8947804473902237, "train_speed(iter/s)": 0.311804 }, { "epoch": 0.46875, "grad_norm": 1.9541747570037842, "learning_rate": 9.784701678661045e-05, "loss": 0.3105583190917969, "memory(GiB)": 153.57, "step": 1230, "token_acc": 0.8866652622709079, "train_speed(iter/s)": 0.311598 }, { "epoch": 0.47065548780487804, "grad_norm": 1.4577618837356567, "learning_rate": 9.782960528408115e-05, "loss": 0.2416633129119873, "memory(GiB)": 153.57, "step": 1235, "token_acc": 0.8801213960546282, "train_speed(iter/s)": 0.31189 }, { "epoch": 0.4725609756097561, "grad_norm": 1.7695592641830444, "learning_rate": 9.781212522186443e-05, "loss": 0.265671443939209, "memory(GiB)": 153.57, "step": 1240, "token_acc": 0.8566966028894963, "train_speed(iter/s)": 0.312151 }, { "epoch": 0.47446646341463417, "grad_norm": 1.0763741731643677, "learning_rate": 9.779457662501643e-05, "loss": 0.27475016117095946, "memory(GiB)": 153.57, "step": 1245, "token_acc": 0.8862177298183308, "train_speed(iter/s)": 0.312142 }, { "epoch": 0.4763719512195122, "grad_norm": 1.486527681350708, "learning_rate": 9.777695951869164e-05, "loss": 0.335015606880188, "memory(GiB)": 153.57, "step": 1250, "token_acc": 0.8671820098383697, "train_speed(iter/s)": 0.31235 }, { "epoch": 0.47827743902439024, "grad_norm": 0.9547694325447083, "learning_rate": 9.775927392814266e-05, "loss": 0.3068684577941895, "memory(GiB)": 153.57, "step": 1255, "token_acc": 0.8764990155718633, "train_speed(iter/s)": 0.312078 }, { "epoch": 0.4801829268292683, "grad_norm": 2.0388734340667725, "learning_rate": 9.774151987872027e-05, "loss": 0.34391183853149415, "memory(GiB)": 153.57, "step": 1260, "token_acc": 0.8614467283853758, "train_speed(iter/s)": 0.311748 }, { "epoch": 0.48208841463414637, "grad_norm": 1.6782687902450562, "learning_rate": 9.772369739587343e-05, "loss": 0.34048449993133545, "memory(GiB)": 153.57, "step": 1265, "token_acc": 0.848559381588194, "train_speed(iter/s)": 0.311915 }, { "epoch": 0.4839939024390244, "grad_norm": 1.3475286960601807, "learning_rate": 9.770580650514914e-05, "loss": 0.36070306301116944, "memory(GiB)": 153.57, "step": 1270, "token_acc": 0.8593304843304843, "train_speed(iter/s)": 0.311909 }, { "epoch": 0.48589939024390244, "grad_norm": 2.018991231918335, "learning_rate": 9.768784723219247e-05, "loss": 0.33952181339263915, "memory(GiB)": 153.57, "step": 1275, "token_acc": 0.8618884275139918, "train_speed(iter/s)": 0.31198 }, { "epoch": 0.4878048780487805, "grad_norm": 0.8511213660240173, "learning_rate": 9.766981960274653e-05, "loss": 0.2989319086074829, "memory(GiB)": 153.57, "step": 1280, "token_acc": 0.8763864924821296, "train_speed(iter/s)": 0.311827 }, { "epoch": 0.4897103658536585, "grad_norm": 1.042600154876709, "learning_rate": 9.76517236426524e-05, "loss": 0.26121604442596436, "memory(GiB)": 153.57, "step": 1285, "token_acc": 0.8825, "train_speed(iter/s)": 0.311914 }, { "epoch": 0.4916158536585366, "grad_norm": 1.0527515411376953, "learning_rate": 9.763355937784909e-05, "loss": 0.338307523727417, "memory(GiB)": 153.57, "step": 1290, "token_acc": 0.8713246917483402, "train_speed(iter/s)": 0.311957 }, { "epoch": 0.49352134146341464, "grad_norm": 1.2706397771835327, "learning_rate": 9.761532683437355e-05, "loss": 0.2742880582809448, "memory(GiB)": 153.57, "step": 1295, "token_acc": 0.8792172739541161, "train_speed(iter/s)": 0.311684 }, { "epoch": 0.4954268292682927, "grad_norm": 1.689902663230896, "learning_rate": 9.759702603836059e-05, "loss": 0.272379469871521, "memory(GiB)": 153.57, "step": 1300, "token_acc": 0.8790450928381963, "train_speed(iter/s)": 0.311809 }, { "epoch": 0.4973323170731707, "grad_norm": 1.4148832559585571, "learning_rate": 9.757865701604281e-05, "loss": 0.29515910148620605, "memory(GiB)": 153.57, "step": 1305, "token_acc": 0.87095485417499, "train_speed(iter/s)": 0.311681 }, { "epoch": 0.4992378048780488, "grad_norm": 1.14407479763031, "learning_rate": 9.756021979375071e-05, "loss": 0.3069546461105347, "memory(GiB)": 153.57, "step": 1310, "token_acc": 0.8470080197409007, "train_speed(iter/s)": 0.311848 }, { "epoch": 0.5011432926829268, "grad_norm": 1.4003043174743652, "learning_rate": 9.754171439791246e-05, "loss": 0.3129075527191162, "memory(GiB)": 153.57, "step": 1315, "token_acc": 0.8681318681318682, "train_speed(iter/s)": 0.311665 }, { "epoch": 0.5030487804878049, "grad_norm": 1.880160927772522, "learning_rate": 9.752314085505395e-05, "loss": 0.2947786808013916, "memory(GiB)": 153.57, "step": 1320, "token_acc": 0.8742240215924426, "train_speed(iter/s)": 0.311799 }, { "epoch": 0.504954268292683, "grad_norm": 1.6665220260620117, "learning_rate": 9.750449919179883e-05, "loss": 0.30986557006835935, "memory(GiB)": 153.57, "step": 1325, "token_acc": 0.8892331525948877, "train_speed(iter/s)": 0.311879 }, { "epoch": 0.506859756097561, "grad_norm": 1.7264877557754517, "learning_rate": 9.748578943486828e-05, "loss": 0.25479028224945066, "memory(GiB)": 153.57, "step": 1330, "token_acc": 0.9045857988165681, "train_speed(iter/s)": 0.312055 }, { "epoch": 0.508765243902439, "grad_norm": 1.917636752128601, "learning_rate": 9.746701161108123e-05, "loss": 0.28808770179748533, "memory(GiB)": 153.57, "step": 1335, "token_acc": 0.8614296636085627, "train_speed(iter/s)": 0.312115 }, { "epoch": 0.510670731707317, "grad_norm": 0.8204377293586731, "learning_rate": 9.744816574735405e-05, "loss": 0.31458215713500975, "memory(GiB)": 153.57, "step": 1340, "token_acc": 0.8775220233020744, "train_speed(iter/s)": 0.311825 }, { "epoch": 0.5125762195121951, "grad_norm": 2.273974657058716, "learning_rate": 9.742925187070069e-05, "loss": 0.2992034196853638, "memory(GiB)": 153.57, "step": 1345, "token_acc": 0.8610354223433242, "train_speed(iter/s)": 0.311304 }, { "epoch": 0.5144817073170732, "grad_norm": 1.5552994012832642, "learning_rate": 9.74102700082326e-05, "loss": 0.2865092992782593, "memory(GiB)": 153.57, "step": 1350, "token_acc": 0.8931027297353615, "train_speed(iter/s)": 0.311252 }, { "epoch": 0.5163871951219512, "grad_norm": 1.5065845251083374, "learning_rate": 9.739122018715867e-05, "loss": 0.31027472019195557, "memory(GiB)": 153.57, "step": 1355, "token_acc": 0.8730263157894737, "train_speed(iter/s)": 0.311314 }, { "epoch": 0.5182926829268293, "grad_norm": 1.096280574798584, "learning_rate": 9.737210243478521e-05, "loss": 0.3311405420303345, "memory(GiB)": 153.57, "step": 1360, "token_acc": 0.8530196458889159, "train_speed(iter/s)": 0.311391 }, { "epoch": 0.5201981707317073, "grad_norm": 1.6110566854476929, "learning_rate": 9.735291677851591e-05, "loss": 0.26511993408203127, "memory(GiB)": 153.57, "step": 1365, "token_acc": 0.8818943112907883, "train_speed(iter/s)": 0.311533 }, { "epoch": 0.5221036585365854, "grad_norm": 1.577938199043274, "learning_rate": 9.733366324585175e-05, "loss": 0.3386091232299805, "memory(GiB)": 153.57, "step": 1370, "token_acc": 0.8557594291539246, "train_speed(iter/s)": 0.311662 }, { "epoch": 0.5240091463414634, "grad_norm": 1.0601489543914795, "learning_rate": 9.731434186439105e-05, "loss": 0.32564878463745117, "memory(GiB)": 153.57, "step": 1375, "token_acc": 0.8716286307053942, "train_speed(iter/s)": 0.31176 }, { "epoch": 0.5259146341463414, "grad_norm": 1.554824709892273, "learning_rate": 9.72949526618294e-05, "loss": 0.31243460178375243, "memory(GiB)": 153.57, "step": 1380, "token_acc": 0.8688911358274352, "train_speed(iter/s)": 0.311363 }, { "epoch": 0.5278201219512195, "grad_norm": 1.173509120941162, "learning_rate": 9.727549566595954e-05, "loss": 0.2760689973831177, "memory(GiB)": 153.57, "step": 1385, "token_acc": 0.8811161328355276, "train_speed(iter/s)": 0.311412 }, { "epoch": 0.5297256097560976, "grad_norm": 1.100756049156189, "learning_rate": 9.725597090467144e-05, "loss": 0.2484744071960449, "memory(GiB)": 153.57, "step": 1390, "token_acc": 0.8801395464739596, "train_speed(iter/s)": 0.311486 }, { "epoch": 0.5316310975609756, "grad_norm": 1.4523242712020874, "learning_rate": 9.72363784059522e-05, "loss": 0.2857520580291748, "memory(GiB)": 153.57, "step": 1395, "token_acc": 0.8765182186234818, "train_speed(iter/s)": 0.310869 }, { "epoch": 0.5335365853658537, "grad_norm": 1.1975281238555908, "learning_rate": 9.721671819788602e-05, "loss": 0.3463920593261719, "memory(GiB)": 153.57, "step": 1400, "token_acc": 0.86286836935167, "train_speed(iter/s)": 0.31094 }, { "epoch": 0.5354420731707317, "grad_norm": 1.1421990394592285, "learning_rate": 9.71969903086541e-05, "loss": 0.2668483257293701, "memory(GiB)": 153.57, "step": 1405, "token_acc": 0.8880266075388027, "train_speed(iter/s)": 0.311131 }, { "epoch": 0.5373475609756098, "grad_norm": 0.9336335062980652, "learning_rate": 9.717719476653475e-05, "loss": 0.3079857110977173, "memory(GiB)": 153.57, "step": 1410, "token_acc": 0.8635057471264368, "train_speed(iter/s)": 0.311237 }, { "epoch": 0.5392530487804879, "grad_norm": 1.194987416267395, "learning_rate": 9.715733159990315e-05, "loss": 0.3367245435714722, "memory(GiB)": 153.57, "step": 1415, "token_acc": 0.8678325205986211, "train_speed(iter/s)": 0.311262 }, { "epoch": 0.5411585365853658, "grad_norm": 2.0616908073425293, "learning_rate": 9.71374008372315e-05, "loss": 0.3307985782623291, "memory(GiB)": 153.57, "step": 1420, "token_acc": 0.8654536135315223, "train_speed(iter/s)": 0.311317 }, { "epoch": 0.5430640243902439, "grad_norm": 1.0621626377105713, "learning_rate": 9.711740250708887e-05, "loss": 0.3368765115737915, "memory(GiB)": 153.57, "step": 1425, "token_acc": 0.8649394775960926, "train_speed(iter/s)": 0.311338 }, { "epoch": 0.5449695121951219, "grad_norm": 1.1878458261489868, "learning_rate": 9.709733663814113e-05, "loss": 0.32130744457244875, "memory(GiB)": 153.57, "step": 1430, "token_acc": 0.8556551923633208, "train_speed(iter/s)": 0.311432 }, { "epoch": 0.546875, "grad_norm": 1.4494432210922241, "learning_rate": 9.707720325915104e-05, "loss": 0.232450270652771, "memory(GiB)": 153.57, "step": 1435, "token_acc": 0.9092711948438275, "train_speed(iter/s)": 0.311626 }, { "epoch": 0.5487804878048781, "grad_norm": 1.5558534860610962, "learning_rate": 9.705700239897809e-05, "loss": 0.2759638786315918, "memory(GiB)": 153.57, "step": 1440, "token_acc": 0.8969610636277303, "train_speed(iter/s)": 0.311644 }, { "epoch": 0.5506859756097561, "grad_norm": 3.729151964187622, "learning_rate": 9.703673408657847e-05, "loss": 0.35018846988677976, "memory(GiB)": 153.57, "step": 1445, "token_acc": 0.8690563277249451, "train_speed(iter/s)": 0.31177 }, { "epoch": 0.5525914634146342, "grad_norm": 1.088983178138733, "learning_rate": 9.701639835100513e-05, "loss": 0.2562891960144043, "memory(GiB)": 153.57, "step": 1450, "token_acc": 0.8915433690783222, "train_speed(iter/s)": 0.311714 }, { "epoch": 0.5544969512195121, "grad_norm": 1.2130035161972046, "learning_rate": 9.69959952214076e-05, "loss": 0.2577697992324829, "memory(GiB)": 153.57, "step": 1455, "token_acc": 0.8849116447666515, "train_speed(iter/s)": 0.311793 }, { "epoch": 0.5564024390243902, "grad_norm": 0.8764408230781555, "learning_rate": 9.697552472703205e-05, "loss": 0.341878867149353, "memory(GiB)": 153.57, "step": 1460, "token_acc": 0.869355702302338, "train_speed(iter/s)": 0.311504 }, { "epoch": 0.5583079268292683, "grad_norm": 1.4719806909561157, "learning_rate": 9.695498689722118e-05, "loss": 0.25530133247375486, "memory(GiB)": 153.57, "step": 1465, "token_acc": 0.8735124107446447, "train_speed(iter/s)": 0.311579 }, { "epoch": 0.5602134146341463, "grad_norm": 1.0388938188552856, "learning_rate": 9.693438176141425e-05, "loss": 0.3078275680541992, "memory(GiB)": 153.57, "step": 1470, "token_acc": 0.8792689434364994, "train_speed(iter/s)": 0.311429 }, { "epoch": 0.5621189024390244, "grad_norm": 2.147573232650757, "learning_rate": 9.691370934914698e-05, "loss": 0.3397654056549072, "memory(GiB)": 153.57, "step": 1475, "token_acc": 0.8669218644449448, "train_speed(iter/s)": 0.311517 }, { "epoch": 0.5640243902439024, "grad_norm": 1.6442817449569702, "learning_rate": 9.68929696900515e-05, "loss": 0.3635141611099243, "memory(GiB)": 153.57, "step": 1480, "token_acc": 0.8456273764258555, "train_speed(iter/s)": 0.311556 }, { "epoch": 0.5659298780487805, "grad_norm": 1.1172783374786377, "learning_rate": 9.687216281385635e-05, "loss": 0.2884554386138916, "memory(GiB)": 153.57, "step": 1485, "token_acc": 0.8821746542905854, "train_speed(iter/s)": 0.311608 }, { "epoch": 0.5678353658536586, "grad_norm": 0.863226592540741, "learning_rate": 9.685128875038647e-05, "loss": 0.25299224853515623, "memory(GiB)": 153.57, "step": 1490, "token_acc": 0.8916094356524336, "train_speed(iter/s)": 0.311463 }, { "epoch": 0.5697408536585366, "grad_norm": 2.369429349899292, "learning_rate": 9.683034752956302e-05, "loss": 0.31259894371032715, "memory(GiB)": 153.57, "step": 1495, "token_acc": 0.8811218082879866, "train_speed(iter/s)": 0.311575 }, { "epoch": 0.5716463414634146, "grad_norm": 1.5992250442504883, "learning_rate": 9.680933918140348e-05, "loss": 0.2776714086532593, "memory(GiB)": 153.57, "step": 1500, "token_acc": 0.8852534562211981, "train_speed(iter/s)": 0.3117 }, { "epoch": 0.5735518292682927, "grad_norm": 1.1922904253005981, "learning_rate": 9.678826373602153e-05, "loss": 0.2939781188964844, "memory(GiB)": 153.57, "step": 1505, "token_acc": 0.8713798977853492, "train_speed(iter/s)": 0.311815 }, { "epoch": 0.5754573170731707, "grad_norm": 0.8227452039718628, "learning_rate": 9.676712122362706e-05, "loss": 0.29805405139923097, "memory(GiB)": 153.57, "step": 1510, "token_acc": 0.8696916812100058, "train_speed(iter/s)": 0.311912 }, { "epoch": 0.5773628048780488, "grad_norm": 1.1836766004562378, "learning_rate": 9.674591167452603e-05, "loss": 0.255311107635498, "memory(GiB)": 153.57, "step": 1515, "token_acc": 0.8832302078375377, "train_speed(iter/s)": 0.311955 }, { "epoch": 0.5792682926829268, "grad_norm": 1.229888916015625, "learning_rate": 9.672463511912055e-05, "loss": 0.34862771034240725, "memory(GiB)": 153.57, "step": 1520, "token_acc": 0.8595344387755102, "train_speed(iter/s)": 0.311965 }, { "epoch": 0.5811737804878049, "grad_norm": 1.418410062789917, "learning_rate": 9.670329158790878e-05, "loss": 0.24524290561676027, "memory(GiB)": 153.57, "step": 1525, "token_acc": 0.873972602739726, "train_speed(iter/s)": 0.311784 }, { "epoch": 0.583079268292683, "grad_norm": 0.9458048343658447, "learning_rate": 9.668188111148484e-05, "loss": 0.2745774745941162, "memory(GiB)": 153.57, "step": 1530, "token_acc": 0.8768554150632215, "train_speed(iter/s)": 0.311373 }, { "epoch": 0.584984756097561, "grad_norm": 1.4171174764633179, "learning_rate": 9.666040372053884e-05, "loss": 0.2510627269744873, "memory(GiB)": 153.57, "step": 1535, "token_acc": 0.9001858242633395, "train_speed(iter/s)": 0.311448 }, { "epoch": 0.586890243902439, "grad_norm": 1.1808751821517944, "learning_rate": 9.66388594458568e-05, "loss": 0.2654898166656494, "memory(GiB)": 153.57, "step": 1540, "token_acc": 0.8840615690168818, "train_speed(iter/s)": 0.311273 }, { "epoch": 0.588795731707317, "grad_norm": 1.881843090057373, "learning_rate": 9.661724831832062e-05, "loss": 0.2632683515548706, "memory(GiB)": 153.57, "step": 1545, "token_acc": 0.8864594309799789, "train_speed(iter/s)": 0.311362 }, { "epoch": 0.5907012195121951, "grad_norm": 0.9035945534706116, "learning_rate": 9.659557036890801e-05, "loss": 0.33440594673156737, "memory(GiB)": 153.57, "step": 1550, "token_acc": 0.8637865911237016, "train_speed(iter/s)": 0.31146 }, { "epoch": 0.5926067073170732, "grad_norm": 0.9938281774520874, "learning_rate": 9.657382562869249e-05, "loss": 0.3167550563812256, "memory(GiB)": 153.57, "step": 1555, "token_acc": 0.8725522193211488, "train_speed(iter/s)": 0.311491 }, { "epoch": 0.5945121951219512, "grad_norm": 6.01306676864624, "learning_rate": 9.655201412884327e-05, "loss": 0.2338878631591797, "memory(GiB)": 153.57, "step": 1560, "token_acc": 0.8975240715268226, "train_speed(iter/s)": 0.311669 }, { "epoch": 0.5964176829268293, "grad_norm": 1.4529238939285278, "learning_rate": 9.653013590062532e-05, "loss": 0.23712286949157715, "memory(GiB)": 153.57, "step": 1565, "token_acc": 0.8939890710382513, "train_speed(iter/s)": 0.311786 }, { "epoch": 0.5983231707317073, "grad_norm": 1.5016371011734009, "learning_rate": 9.650819097539922e-05, "loss": 0.3639110565185547, "memory(GiB)": 153.57, "step": 1570, "token_acc": 0.851959686450168, "train_speed(iter/s)": 0.311582 }, { "epoch": 0.6002286585365854, "grad_norm": 1.4246408939361572, "learning_rate": 9.648617938462117e-05, "loss": 0.33392388820648194, "memory(GiB)": 153.57, "step": 1575, "token_acc": 0.8669510461100954, "train_speed(iter/s)": 0.311613 }, { "epoch": 0.6021341463414634, "grad_norm": 1.6103848218917847, "learning_rate": 9.646410115984289e-05, "loss": 0.26432366371154786, "memory(GiB)": 153.57, "step": 1580, "token_acc": 0.8951769739401011, "train_speed(iter/s)": 0.311641 }, { "epoch": 0.6040396341463414, "grad_norm": 2.025301456451416, "learning_rate": 9.644195633271166e-05, "loss": 0.3115978717803955, "memory(GiB)": 153.57, "step": 1585, "token_acc": 0.8648798010500138, "train_speed(iter/s)": 0.311748 }, { "epoch": 0.6059451219512195, "grad_norm": 1.0957562923431396, "learning_rate": 9.641974493497024e-05, "loss": 0.24244344234466553, "memory(GiB)": 153.57, "step": 1590, "token_acc": 0.9026315789473685, "train_speed(iter/s)": 0.31185 }, { "epoch": 0.6078506097560976, "grad_norm": 1.1559940576553345, "learning_rate": 9.639746699845676e-05, "loss": 0.3340721607208252, "memory(GiB)": 153.57, "step": 1595, "token_acc": 0.8623429889771853, "train_speed(iter/s)": 0.311948 }, { "epoch": 0.6097560975609756, "grad_norm": 1.5783759355545044, "learning_rate": 9.637512255510475e-05, "loss": 0.28108949661254884, "memory(GiB)": 153.57, "step": 1600, "token_acc": 0.8772470467385721, "train_speed(iter/s)": 0.312029 }, { "epoch": 0.6097560975609756, "eval_loss": 0.23246054351329803, "eval_runtime": 34.3249, "eval_samples_per_second": 3.088, "eval_steps_per_second": 3.088, "eval_token_acc": 0.8748302740759366, "step": 1600 }, { "epoch": 0.6116615853658537, "grad_norm": 1.2525917291641235, "learning_rate": 9.635271163694309e-05, "loss": 0.30366218090057373, "memory(GiB)": 153.57, "step": 1605, "token_acc": 0.874601443195167, "train_speed(iter/s)": 0.310004 }, { "epoch": 0.6135670731707317, "grad_norm": 3.8448448181152344, "learning_rate": 9.633023427609591e-05, "loss": 0.3114325523376465, "memory(GiB)": 153.57, "step": 1610, "token_acc": 0.8608367750044892, "train_speed(iter/s)": 0.310069 }, { "epoch": 0.6154725609756098, "grad_norm": 1.189878225326538, "learning_rate": 9.630769050478264e-05, "loss": 0.28998188972473143, "memory(GiB)": 153.57, "step": 1615, "token_acc": 0.8658536585365854, "train_speed(iter/s)": 0.310168 }, { "epoch": 0.6173780487804879, "grad_norm": 1.4976190328598022, "learning_rate": 9.628508035531785e-05, "loss": 0.28848369121551515, "memory(GiB)": 153.57, "step": 1620, "token_acc": 0.8830427892234548, "train_speed(iter/s)": 0.310295 }, { "epoch": 0.6192835365853658, "grad_norm": 2.355262041091919, "learning_rate": 9.626240386011125e-05, "loss": 0.3392723560333252, "memory(GiB)": 153.57, "step": 1625, "token_acc": 0.8709232889297198, "train_speed(iter/s)": 0.310317 }, { "epoch": 0.6211890243902439, "grad_norm": 1.8870227336883545, "learning_rate": 9.623966105166772e-05, "loss": 0.26919510364532473, "memory(GiB)": 153.57, "step": 1630, "token_acc": 0.8945693678404751, "train_speed(iter/s)": 0.310299 }, { "epoch": 0.6230945121951219, "grad_norm": 1.1571450233459473, "learning_rate": 9.621685196258712e-05, "loss": 0.3129563331604004, "memory(GiB)": 153.57, "step": 1635, "token_acc": 0.8707672524646378, "train_speed(iter/s)": 0.310355 }, { "epoch": 0.625, "grad_norm": 1.5234031677246094, "learning_rate": 9.619397662556435e-05, "loss": 0.2913675785064697, "memory(GiB)": 153.57, "step": 1640, "token_acc": 0.8834123222748815, "train_speed(iter/s)": 0.310461 }, { "epoch": 0.6269054878048781, "grad_norm": 1.886027455329895, "learning_rate": 9.617103507338927e-05, "loss": 0.2501126527786255, "memory(GiB)": 153.57, "step": 1645, "token_acc": 0.8900193798449613, "train_speed(iter/s)": 0.310681 }, { "epoch": 0.6288109756097561, "grad_norm": 1.3831886053085327, "learning_rate": 9.614802733894665e-05, "loss": 0.3076192855834961, "memory(GiB)": 153.57, "step": 1650, "token_acc": 0.8727183029107055, "train_speed(iter/s)": 0.310542 }, { "epoch": 0.6307164634146342, "grad_norm": 2.619527816772461, "learning_rate": 9.612495345521612e-05, "loss": 0.3713055610656738, "memory(GiB)": 153.57, "step": 1655, "token_acc": 0.8588663030903982, "train_speed(iter/s)": 0.310561 }, { "epoch": 0.6326219512195121, "grad_norm": 1.350033164024353, "learning_rate": 9.610181345527217e-05, "loss": 0.3034797430038452, "memory(GiB)": 153.57, "step": 1660, "token_acc": 0.8719119226638024, "train_speed(iter/s)": 0.310605 }, { "epoch": 0.6345274390243902, "grad_norm": 1.3440734148025513, "learning_rate": 9.6078607372284e-05, "loss": 0.2894395351409912, "memory(GiB)": 153.57, "step": 1665, "token_acc": 0.8753722794959908, "train_speed(iter/s)": 0.310725 }, { "epoch": 0.6364329268292683, "grad_norm": 1.040054202079773, "learning_rate": 9.605533523951558e-05, "loss": 0.30828425884246824, "memory(GiB)": 153.57, "step": 1670, "token_acc": 0.8687451886066204, "train_speed(iter/s)": 0.310753 }, { "epoch": 0.6383384146341463, "grad_norm": 1.3118577003479004, "learning_rate": 9.603199709032551e-05, "loss": 0.3071297168731689, "memory(GiB)": 153.57, "step": 1675, "token_acc": 0.8619746331758269, "train_speed(iter/s)": 0.310861 }, { "epoch": 0.6402439024390244, "grad_norm": 1.5613261461257935, "learning_rate": 9.600859295816708e-05, "loss": 0.31910550594329834, "memory(GiB)": 153.57, "step": 1680, "token_acc": 0.8761452879581152, "train_speed(iter/s)": 0.310847 }, { "epoch": 0.6421493902439024, "grad_norm": 1.0604315996170044, "learning_rate": 9.598512287658815e-05, "loss": 0.22136983871459961, "memory(GiB)": 153.57, "step": 1685, "token_acc": 0.8949422670352903, "train_speed(iter/s)": 0.31081 }, { "epoch": 0.6440548780487805, "grad_norm": 1.3256491422653198, "learning_rate": 9.596158687923104e-05, "loss": 0.2686015605926514, "memory(GiB)": 153.57, "step": 1690, "token_acc": 0.8778359511343804, "train_speed(iter/s)": 0.310884 }, { "epoch": 0.6459603658536586, "grad_norm": 1.108499526977539, "learning_rate": 9.593798499983265e-05, "loss": 0.29157109260559083, "memory(GiB)": 153.57, "step": 1695, "token_acc": 0.8699858423784804, "train_speed(iter/s)": 0.310938 }, { "epoch": 0.6478658536585366, "grad_norm": 1.1435343027114868, "learning_rate": 9.591431727222424e-05, "loss": 0.2633403778076172, "memory(GiB)": 153.57, "step": 1700, "token_acc": 0.8936655553605896, "train_speed(iter/s)": 0.310945 }, { "epoch": 0.6497713414634146, "grad_norm": 2.4976935386657715, "learning_rate": 9.589058373033153e-05, "loss": 0.2663792371749878, "memory(GiB)": 153.57, "step": 1705, "token_acc": 0.8726810673443456, "train_speed(iter/s)": 0.311099 }, { "epoch": 0.6516768292682927, "grad_norm": 3.7899134159088135, "learning_rate": 9.586678440817453e-05, "loss": 0.3262809753417969, "memory(GiB)": 153.57, "step": 1710, "token_acc": 0.8706865237922267, "train_speed(iter/s)": 0.311229 }, { "epoch": 0.6535823170731707, "grad_norm": 1.436048150062561, "learning_rate": 9.584291933986753e-05, "loss": 0.2854708194732666, "memory(GiB)": 153.57, "step": 1715, "token_acc": 0.8873020800993481, "train_speed(iter/s)": 0.311158 }, { "epoch": 0.6554878048780488, "grad_norm": 1.339398980140686, "learning_rate": 9.581898855961912e-05, "loss": 0.2887399435043335, "memory(GiB)": 153.57, "step": 1720, "token_acc": 0.8714679531357684, "train_speed(iter/s)": 0.311307 }, { "epoch": 0.6573932926829268, "grad_norm": 1.136566400527954, "learning_rate": 9.579499210173202e-05, "loss": 0.3172308921813965, "memory(GiB)": 153.57, "step": 1725, "token_acc": 0.8692329465043872, "train_speed(iter/s)": 0.311411 }, { "epoch": 0.6592987804878049, "grad_norm": 0.9350283741950989, "learning_rate": 9.577093000060312e-05, "loss": 0.29151625633239747, "memory(GiB)": 153.57, "step": 1730, "token_acc": 0.8844845630559917, "train_speed(iter/s)": 0.311317 }, { "epoch": 0.661204268292683, "grad_norm": 1.086296796798706, "learning_rate": 9.574680229072341e-05, "loss": 0.2717117786407471, "memory(GiB)": 153.57, "step": 1735, "token_acc": 0.8821439147172047, "train_speed(iter/s)": 0.311427 }, { "epoch": 0.663109756097561, "grad_norm": 1.5398547649383545, "learning_rate": 9.572260900667794e-05, "loss": 0.29207592010498046, "memory(GiB)": 153.57, "step": 1740, "token_acc": 0.8932724252491694, "train_speed(iter/s)": 0.311567 }, { "epoch": 0.665015243902439, "grad_norm": 1.842373251914978, "learning_rate": 9.569835018314568e-05, "loss": 0.3018088102340698, "memory(GiB)": 153.57, "step": 1745, "token_acc": 0.8834056399132321, "train_speed(iter/s)": 0.311628 }, { "epoch": 0.666920731707317, "grad_norm": 1.1789196729660034, "learning_rate": 9.567402585489963e-05, "loss": 0.3027354717254639, "memory(GiB)": 153.57, "step": 1750, "token_acc": 0.8825764429165913, "train_speed(iter/s)": 0.311677 }, { "epoch": 0.6688262195121951, "grad_norm": 1.2546907663345337, "learning_rate": 9.564963605680668e-05, "loss": 0.3155559539794922, "memory(GiB)": 153.57, "step": 1755, "token_acc": 0.883265513733469, "train_speed(iter/s)": 0.311778 }, { "epoch": 0.6707317073170732, "grad_norm": 1.7712910175323486, "learning_rate": 9.56251808238275e-05, "loss": 0.28950986862182615, "memory(GiB)": 153.57, "step": 1760, "token_acc": 0.8860274926048373, "train_speed(iter/s)": 0.311756 }, { "epoch": 0.6726371951219512, "grad_norm": 1.482928991317749, "learning_rate": 9.560066019101661e-05, "loss": 0.30448217391967775, "memory(GiB)": 153.57, "step": 1765, "token_acc": 0.8810178817056397, "train_speed(iter/s)": 0.311578 }, { "epoch": 0.6745426829268293, "grad_norm": 1.4433684349060059, "learning_rate": 9.557607419352226e-05, "loss": 0.26116199493408204, "memory(GiB)": 153.57, "step": 1770, "token_acc": 0.8942255801403131, "train_speed(iter/s)": 0.31172 }, { "epoch": 0.6764481707317073, "grad_norm": 1.6697522401809692, "learning_rate": 9.55514228665864e-05, "loss": 0.283888840675354, "memory(GiB)": 153.57, "step": 1775, "token_acc": 0.8783081285444234, "train_speed(iter/s)": 0.311792 }, { "epoch": 0.6783536585365854, "grad_norm": 1.5842039585113525, "learning_rate": 9.552670624554461e-05, "loss": 0.23155362606048585, "memory(GiB)": 153.57, "step": 1780, "token_acc": 0.8687882496940025, "train_speed(iter/s)": 0.311887 }, { "epoch": 0.6802591463414634, "grad_norm": 1.128679633140564, "learning_rate": 9.550192436582606e-05, "loss": 0.2368324041366577, "memory(GiB)": 153.57, "step": 1785, "token_acc": 0.893722194760257, "train_speed(iter/s)": 0.311916 }, { "epoch": 0.6821646341463414, "grad_norm": 1.5874735116958618, "learning_rate": 9.54770772629535e-05, "loss": 0.25910029411315916, "memory(GiB)": 153.57, "step": 1790, "token_acc": 0.8858211101524303, "train_speed(iter/s)": 0.311996 }, { "epoch": 0.6840701219512195, "grad_norm": 1.1460316181182861, "learning_rate": 9.545216497254315e-05, "loss": 0.33331966400146484, "memory(GiB)": 153.57, "step": 1795, "token_acc": 0.8774216273335682, "train_speed(iter/s)": 0.311828 }, { "epoch": 0.6859756097560976, "grad_norm": 1.669877529144287, "learning_rate": 9.542718753030463e-05, "loss": 0.2549333095550537, "memory(GiB)": 153.57, "step": 1800, "token_acc": 0.8725895316804407, "train_speed(iter/s)": 0.311957 }, { "epoch": 0.6878810975609756, "grad_norm": 0.9983353018760681, "learning_rate": 9.540214497204102e-05, "loss": 0.30560691356658937, "memory(GiB)": 153.57, "step": 1805, "token_acc": 0.8679112363508278, "train_speed(iter/s)": 0.312113 }, { "epoch": 0.6897865853658537, "grad_norm": 0.788419246673584, "learning_rate": 9.537703733364871e-05, "loss": 0.21397290229797364, "memory(GiB)": 153.57, "step": 1810, "token_acc": 0.8909365052772659, "train_speed(iter/s)": 0.312146 }, { "epoch": 0.6916920731707317, "grad_norm": 1.567008137702942, "learning_rate": 9.535186465111735e-05, "loss": 0.2592519998550415, "memory(GiB)": 153.57, "step": 1815, "token_acc": 0.8865211810012837, "train_speed(iter/s)": 0.31225 }, { "epoch": 0.6935975609756098, "grad_norm": 2.011746406555176, "learning_rate": 9.532662696052985e-05, "loss": 0.29129414558410643, "memory(GiB)": 153.57, "step": 1820, "token_acc": 0.8738095238095238, "train_speed(iter/s)": 0.312079 }, { "epoch": 0.6955030487804879, "grad_norm": 1.2923414707183838, "learning_rate": 9.530132429806234e-05, "loss": 0.25933692455291746, "memory(GiB)": 153.57, "step": 1825, "token_acc": 0.8883326077492382, "train_speed(iter/s)": 0.312125 }, { "epoch": 0.6974085365853658, "grad_norm": 1.2152726650238037, "learning_rate": 9.527595669998399e-05, "loss": 0.18221054077148438, "memory(GiB)": 153.57, "step": 1830, "token_acc": 0.9037286063569682, "train_speed(iter/s)": 0.31202 }, { "epoch": 0.6993140243902439, "grad_norm": 0.9299729466438293, "learning_rate": 9.525052420265714e-05, "loss": 0.23019843101501464, "memory(GiB)": 153.57, "step": 1835, "token_acc": 0.9055954088952655, "train_speed(iter/s)": 0.3121 }, { "epoch": 0.7012195121951219, "grad_norm": 1.265413522720337, "learning_rate": 9.522502684253709e-05, "loss": 0.2707319736480713, "memory(GiB)": 153.57, "step": 1840, "token_acc": 0.8892497564144203, "train_speed(iter/s)": 0.31213 }, { "epoch": 0.703125, "grad_norm": 1.3256968259811401, "learning_rate": 9.519946465617218e-05, "loss": 0.29828810691833496, "memory(GiB)": 153.57, "step": 1845, "token_acc": 0.880569306930693, "train_speed(iter/s)": 0.312189 }, { "epoch": 0.7050304878048781, "grad_norm": 1.7249078750610352, "learning_rate": 9.517383768020361e-05, "loss": 0.27769858837127687, "memory(GiB)": 153.57, "step": 1850, "token_acc": 0.8865149833518313, "train_speed(iter/s)": 0.312259 }, { "epoch": 0.7069359756097561, "grad_norm": 1.1537457704544067, "learning_rate": 9.51481459513655e-05, "loss": 0.3401664733886719, "memory(GiB)": 153.57, "step": 1855, "token_acc": 0.8654970760233918, "train_speed(iter/s)": 0.312297 }, { "epoch": 0.7088414634146342, "grad_norm": 1.6422456502914429, "learning_rate": 9.512238950648474e-05, "loss": 0.24998586177825927, "memory(GiB)": 153.57, "step": 1860, "token_acc": 0.8969654199011997, "train_speed(iter/s)": 0.312399 }, { "epoch": 0.7107469512195121, "grad_norm": 1.9124877452850342, "learning_rate": 9.509656838248104e-05, "loss": 0.23788347244262695, "memory(GiB)": 153.57, "step": 1865, "token_acc": 0.897497982243745, "train_speed(iter/s)": 0.312318 }, { "epoch": 0.7126524390243902, "grad_norm": 1.3361539840698242, "learning_rate": 9.507068261636679e-05, "loss": 0.2780283451080322, "memory(GiB)": 153.57, "step": 1870, "token_acc": 0.8854686633388796, "train_speed(iter/s)": 0.312394 }, { "epoch": 0.7145579268292683, "grad_norm": 1.4094849824905396, "learning_rate": 9.504473224524704e-05, "loss": 0.21094465255737305, "memory(GiB)": 153.57, "step": 1875, "token_acc": 0.9104519774011299, "train_speed(iter/s)": 0.312489 }, { "epoch": 0.7164634146341463, "grad_norm": 1.5078719854354858, "learning_rate": 9.501871730631942e-05, "loss": 0.24983351230621337, "memory(GiB)": 153.57, "step": 1880, "token_acc": 0.8937276568085798, "train_speed(iter/s)": 0.312616 }, { "epoch": 0.7183689024390244, "grad_norm": 0.779574453830719, "learning_rate": 9.49926378368742e-05, "loss": 0.2737581729888916, "memory(GiB)": 153.57, "step": 1885, "token_acc": 0.9021956087824351, "train_speed(iter/s)": 0.312618 }, { "epoch": 0.7202743902439024, "grad_norm": 1.418870210647583, "learning_rate": 9.496649387429404e-05, "loss": 0.17911324501037598, "memory(GiB)": 153.57, "step": 1890, "token_acc": 0.9156880457693466, "train_speed(iter/s)": 0.312732 }, { "epoch": 0.7221798780487805, "grad_norm": 1.286210536956787, "learning_rate": 9.49402854560541e-05, "loss": 0.2388221263885498, "memory(GiB)": 153.57, "step": 1895, "token_acc": 0.9016746411483254, "train_speed(iter/s)": 0.312803 }, { "epoch": 0.7240853658536586, "grad_norm": 1.3607137203216553, "learning_rate": 9.491401261972195e-05, "loss": 0.24465394020080566, "memory(GiB)": 153.57, "step": 1900, "token_acc": 0.906951871657754, "train_speed(iter/s)": 0.312893 }, { "epoch": 0.7259908536585366, "grad_norm": 1.0990049839019775, "learning_rate": 9.488767540295747e-05, "loss": 0.30425970554351806, "memory(GiB)": 153.57, "step": 1905, "token_acc": 0.8739007798241247, "train_speed(iter/s)": 0.312953 }, { "epoch": 0.7278963414634146, "grad_norm": 1.0542446374893188, "learning_rate": 9.486127384351282e-05, "loss": 0.3175650596618652, "memory(GiB)": 153.57, "step": 1910, "token_acc": 0.873272884283247, "train_speed(iter/s)": 0.313019 }, { "epoch": 0.7298018292682927, "grad_norm": 1.0548063516616821, "learning_rate": 9.48348079792324e-05, "loss": 0.28133995532989503, "memory(GiB)": 153.57, "step": 1915, "token_acc": 0.8801748521470815, "train_speed(iter/s)": 0.312855 }, { "epoch": 0.7317073170731707, "grad_norm": 1.0727624893188477, "learning_rate": 9.480827784805278e-05, "loss": 0.25853610038757324, "memory(GiB)": 153.57, "step": 1920, "token_acc": 0.879682679594535, "train_speed(iter/s)": 0.312945 }, { "epoch": 0.7336128048780488, "grad_norm": 1.111700415611267, "learning_rate": 9.478168348800267e-05, "loss": 0.25476689338684083, "memory(GiB)": 153.57, "step": 1925, "token_acc": 0.8913331128018525, "train_speed(iter/s)": 0.312931 }, { "epoch": 0.7355182926829268, "grad_norm": 1.7454627752304077, "learning_rate": 9.475502493720283e-05, "loss": 0.22988641262054443, "memory(GiB)": 153.57, "step": 1930, "token_acc": 0.9015300117693213, "train_speed(iter/s)": 0.313074 }, { "epoch": 0.7374237804878049, "grad_norm": 1.185491919517517, "learning_rate": 9.472830223386603e-05, "loss": 0.267413854598999, "memory(GiB)": 153.57, "step": 1935, "token_acc": 0.8834080717488789, "train_speed(iter/s)": 0.313103 }, { "epoch": 0.739329268292683, "grad_norm": 1.1870551109313965, "learning_rate": 9.470151541629699e-05, "loss": 0.2324226140975952, "memory(GiB)": 153.57, "step": 1940, "token_acc": 0.8942866365397031, "train_speed(iter/s)": 0.313139 }, { "epoch": 0.741234756097561, "grad_norm": 1.3034225702285767, "learning_rate": 9.467466452289237e-05, "loss": 0.31863765716552733, "memory(GiB)": 153.57, "step": 1945, "token_acc": 0.8783288541412168, "train_speed(iter/s)": 0.313185 }, { "epoch": 0.743140243902439, "grad_norm": 1.2649426460266113, "learning_rate": 9.464774959214063e-05, "loss": 0.23670883178710939, "memory(GiB)": 153.57, "step": 1950, "token_acc": 0.8771220695230396, "train_speed(iter/s)": 0.313337 }, { "epoch": 0.745045731707317, "grad_norm": 1.1393760442733765, "learning_rate": 9.462077066262206e-05, "loss": 0.3195551156997681, "memory(GiB)": 153.57, "step": 1955, "token_acc": 0.8727823453050627, "train_speed(iter/s)": 0.313383 }, { "epoch": 0.7469512195121951, "grad_norm": 1.1845899820327759, "learning_rate": 9.459372777300864e-05, "loss": 0.2751941680908203, "memory(GiB)": 153.57, "step": 1960, "token_acc": 0.8919177427068389, "train_speed(iter/s)": 0.313434 }, { "epoch": 0.7488567073170732, "grad_norm": 1.5352745056152344, "learning_rate": 9.456662096206408e-05, "loss": 0.2893501043319702, "memory(GiB)": 153.57, "step": 1965, "token_acc": 0.8774193548387097, "train_speed(iter/s)": 0.313512 }, { "epoch": 0.7507621951219512, "grad_norm": 0.9482070207595825, "learning_rate": 9.45394502686437e-05, "loss": 0.1899315595626831, "memory(GiB)": 153.57, "step": 1970, "token_acc": 0.9166033434650456, "train_speed(iter/s)": 0.313585 }, { "epoch": 0.7526676829268293, "grad_norm": 1.413886547088623, "learning_rate": 9.451221573169438e-05, "loss": 0.26173012256622313, "memory(GiB)": 153.57, "step": 1975, "token_acc": 0.8874353513842409, "train_speed(iter/s)": 0.313459 }, { "epoch": 0.7545731707317073, "grad_norm": 1.7136046886444092, "learning_rate": 9.448491739025454e-05, "loss": 0.2626305103302002, "memory(GiB)": 153.57, "step": 1980, "token_acc": 0.9048473967684022, "train_speed(iter/s)": 0.313527 }, { "epoch": 0.7564786585365854, "grad_norm": 0.8602431416511536, "learning_rate": 9.445755528345402e-05, "loss": 0.2829297065734863, "memory(GiB)": 153.57, "step": 1985, "token_acc": 0.877088799889518, "train_speed(iter/s)": 0.313548 }, { "epoch": 0.7583841463414634, "grad_norm": 1.0818045139312744, "learning_rate": 9.44301294505141e-05, "loss": 0.2595368385314941, "memory(GiB)": 153.57, "step": 1990, "token_acc": 0.8809038710872383, "train_speed(iter/s)": 0.313541 }, { "epoch": 0.7602896341463414, "grad_norm": 1.1734464168548584, "learning_rate": 9.440263993074736e-05, "loss": 0.24076690673828124, "memory(GiB)": 153.57, "step": 1995, "token_acc": 0.8891471066697653, "train_speed(iter/s)": 0.313585 }, { "epoch": 0.7621951219512195, "grad_norm": 1.243065357208252, "learning_rate": 9.437508676355773e-05, "loss": 0.2625396728515625, "memory(GiB)": 153.57, "step": 2000, "token_acc": 0.8913184672006927, "train_speed(iter/s)": 0.313629 }, { "epoch": 0.7641006097560976, "grad_norm": 0.6917441487312317, "learning_rate": 9.43474699884403e-05, "loss": 0.29215116500854493, "memory(GiB)": 153.57, "step": 2005, "token_acc": 0.8675373134328358, "train_speed(iter/s)": 0.313686 }, { "epoch": 0.7660060975609756, "grad_norm": 0.9860313534736633, "learning_rate": 9.431978964498143e-05, "loss": 0.22160229682922364, "memory(GiB)": 153.57, "step": 2010, "token_acc": 0.9010582010582011, "train_speed(iter/s)": 0.313744 }, { "epoch": 0.7679115853658537, "grad_norm": 1.4229819774627686, "learning_rate": 9.429204577285852e-05, "loss": 0.22371079921722412, "memory(GiB)": 153.57, "step": 2015, "token_acc": 0.9091507570770243, "train_speed(iter/s)": 0.313832 }, { "epoch": 0.7698170731707317, "grad_norm": 1.678576111793518, "learning_rate": 9.426423841184005e-05, "loss": 0.29344701766967773, "memory(GiB)": 153.57, "step": 2020, "token_acc": 0.8827444956477215, "train_speed(iter/s)": 0.313886 }, { "epoch": 0.7717225609756098, "grad_norm": 1.0253329277038574, "learning_rate": 9.423636760178553e-05, "loss": 0.24502973556518554, "memory(GiB)": 153.57, "step": 2025, "token_acc": 0.8980091883614089, "train_speed(iter/s)": 0.313691 }, { "epoch": 0.7736280487804879, "grad_norm": 1.0854699611663818, "learning_rate": 9.420843338264542e-05, "loss": 0.23021321296691893, "memory(GiB)": 153.57, "step": 2030, "token_acc": 0.9060272197018795, "train_speed(iter/s)": 0.313724 }, { "epoch": 0.7755335365853658, "grad_norm": 1.0831788778305054, "learning_rate": 9.418043579446102e-05, "loss": 0.21558291912078859, "memory(GiB)": 153.57, "step": 2035, "token_acc": 0.8868250539956803, "train_speed(iter/s)": 0.313622 }, { "epoch": 0.7774390243902439, "grad_norm": 2.5733518600463867, "learning_rate": 9.415237487736452e-05, "loss": 0.22351553440093994, "memory(GiB)": 153.57, "step": 2040, "token_acc": 0.9200901803607214, "train_speed(iter/s)": 0.3137 }, { "epoch": 0.7793445121951219, "grad_norm": 0.9895846247673035, "learning_rate": 9.412425067157888e-05, "loss": 0.2655385732650757, "memory(GiB)": 153.57, "step": 2045, "token_acc": 0.873913882988994, "train_speed(iter/s)": 0.313488 }, { "epoch": 0.78125, "grad_norm": 1.6192615032196045, "learning_rate": 9.409606321741775e-05, "loss": 0.25571722984313966, "memory(GiB)": 153.57, "step": 2050, "token_acc": 0.8776411226742352, "train_speed(iter/s)": 0.313397 }, { "epoch": 0.7831554878048781, "grad_norm": 0.7216309905052185, "learning_rate": 9.406781255528547e-05, "loss": 0.1903886914253235, "memory(GiB)": 153.57, "step": 2055, "token_acc": 0.9148338792700047, "train_speed(iter/s)": 0.313551 }, { "epoch": 0.7850609756097561, "grad_norm": 1.5867024660110474, "learning_rate": 9.403949872567695e-05, "loss": 0.2687596082687378, "memory(GiB)": 153.57, "step": 2060, "token_acc": 0.8908114558472554, "train_speed(iter/s)": 0.313562 }, { "epoch": 0.7869664634146342, "grad_norm": 2.270909309387207, "learning_rate": 9.40111217691777e-05, "loss": 0.23305611610412597, "memory(GiB)": 153.57, "step": 2065, "token_acc": 0.8936358279316441, "train_speed(iter/s)": 0.313682 }, { "epoch": 0.7888719512195121, "grad_norm": 1.470158338546753, "learning_rate": 9.398268172646365e-05, "loss": 0.2769412279129028, "memory(GiB)": 153.57, "step": 2070, "token_acc": 0.8737963532063102, "train_speed(iter/s)": 0.313386 }, { "epoch": 0.7907774390243902, "grad_norm": 1.4024021625518799, "learning_rate": 9.39541786383012e-05, "loss": 0.2864339828491211, "memory(GiB)": 153.57, "step": 2075, "token_acc": 0.8757205441549458, "train_speed(iter/s)": 0.313442 }, { "epoch": 0.7926829268292683, "grad_norm": 1.0494656562805176, "learning_rate": 9.392561254554713e-05, "loss": 0.23985681533813477, "memory(GiB)": 153.57, "step": 2080, "token_acc": 0.8985376827896513, "train_speed(iter/s)": 0.313475 }, { "epoch": 0.7945884146341463, "grad_norm": 1.0734888315200806, "learning_rate": 9.389698348914849e-05, "loss": 0.2453533172607422, "memory(GiB)": 153.57, "step": 2085, "token_acc": 0.8818897637795275, "train_speed(iter/s)": 0.313346 }, { "epoch": 0.7964939024390244, "grad_norm": 1.2278450727462769, "learning_rate": 9.386829151014262e-05, "loss": 0.2502995729446411, "memory(GiB)": 153.57, "step": 2090, "token_acc": 0.8984489318115306, "train_speed(iter/s)": 0.313414 }, { "epoch": 0.7983993902439024, "grad_norm": 1.2972345352172852, "learning_rate": 9.383953664965704e-05, "loss": 0.2828823566436768, "memory(GiB)": 153.57, "step": 2095, "token_acc": 0.8863003291972651, "train_speed(iter/s)": 0.313466 }, { "epoch": 0.8003048780487805, "grad_norm": 0.8569463491439819, "learning_rate": 9.381071894890941e-05, "loss": 0.2988056421279907, "memory(GiB)": 153.57, "step": 2100, "token_acc": 0.8759458032729193, "train_speed(iter/s)": 0.313523 }, { "epoch": 0.8022103658536586, "grad_norm": 1.3303909301757812, "learning_rate": 9.378183844920747e-05, "loss": 0.2566019058227539, "memory(GiB)": 153.57, "step": 2105, "token_acc": 0.8919829726853494, "train_speed(iter/s)": 0.313535 }, { "epoch": 0.8041158536585366, "grad_norm": 1.0876271724700928, "learning_rate": 9.375289519194894e-05, "loss": 0.32460808753967285, "memory(GiB)": 153.57, "step": 2110, "token_acc": 0.8725868725868726, "train_speed(iter/s)": 0.313607 }, { "epoch": 0.8060213414634146, "grad_norm": 1.6545137166976929, "learning_rate": 9.372388921862156e-05, "loss": 0.28569669723510743, "memory(GiB)": 153.57, "step": 2115, "token_acc": 0.8870186176995665, "train_speed(iter/s)": 0.31363 }, { "epoch": 0.8079268292682927, "grad_norm": 1.1067308187484741, "learning_rate": 9.369482057080292e-05, "loss": 0.268172550201416, "memory(GiB)": 153.57, "step": 2120, "token_acc": 0.8933823529411765, "train_speed(iter/s)": 0.313595 }, { "epoch": 0.8098323170731707, "grad_norm": 1.01986563205719, "learning_rate": 9.366568929016046e-05, "loss": 0.2749826908111572, "memory(GiB)": 153.57, "step": 2125, "token_acc": 0.8816519416478323, "train_speed(iter/s)": 0.313673 }, { "epoch": 0.8117378048780488, "grad_norm": 1.3664065599441528, "learning_rate": 9.363649541845142e-05, "loss": 0.28502697944641114, "memory(GiB)": 153.57, "step": 2130, "token_acc": 0.8858453148239961, "train_speed(iter/s)": 0.313682 }, { "epoch": 0.8136432926829268, "grad_norm": 1.2892429828643799, "learning_rate": 9.360723899752273e-05, "loss": 0.24687435626983642, "memory(GiB)": 153.57, "step": 2135, "token_acc": 0.8996655518394648, "train_speed(iter/s)": 0.313818 }, { "epoch": 0.8155487804878049, "grad_norm": 1.45743727684021, "learning_rate": 9.357792006931098e-05, "loss": 0.2536599159240723, "memory(GiB)": 153.57, "step": 2140, "token_acc": 0.8924439365804885, "train_speed(iter/s)": 0.313831 }, { "epoch": 0.817454268292683, "grad_norm": 1.162394404411316, "learning_rate": 9.35485386758424e-05, "loss": 0.2542218446731567, "memory(GiB)": 153.57, "step": 2145, "token_acc": 0.8956147430723702, "train_speed(iter/s)": 0.313895 }, { "epoch": 0.819359756097561, "grad_norm": 0.8335379362106323, "learning_rate": 9.35190948592327e-05, "loss": 0.2729935646057129, "memory(GiB)": 153.57, "step": 2150, "token_acc": 0.8918839298865813, "train_speed(iter/s)": 0.313888 }, { "epoch": 0.821265243902439, "grad_norm": 1.6468294858932495, "learning_rate": 9.348958866168712e-05, "loss": 0.27462267875671387, "memory(GiB)": 153.57, "step": 2155, "token_acc": 0.887227680478428, "train_speed(iter/s)": 0.313968 }, { "epoch": 0.823170731707317, "grad_norm": 1.0363041162490845, "learning_rate": 9.346002012550027e-05, "loss": 0.29891514778137207, "memory(GiB)": 153.57, "step": 2160, "token_acc": 0.87447581772435, "train_speed(iter/s)": 0.314042 }, { "epoch": 0.8250762195121951, "grad_norm": 0.962956964969635, "learning_rate": 9.343038929305617e-05, "loss": 0.2765654563903809, "memory(GiB)": 153.57, "step": 2165, "token_acc": 0.8814729574223246, "train_speed(iter/s)": 0.314024 }, { "epoch": 0.8269817073170732, "grad_norm": 1.3510065078735352, "learning_rate": 9.340069620682806e-05, "loss": 0.2752182722091675, "memory(GiB)": 153.57, "step": 2170, "token_acc": 0.8872699386503068, "train_speed(iter/s)": 0.314085 }, { "epoch": 0.8288871951219512, "grad_norm": 1.1155825853347778, "learning_rate": 9.337094090937852e-05, "loss": 0.24911198616027833, "memory(GiB)": 153.57, "step": 2175, "token_acc": 0.8984463812050019, "train_speed(iter/s)": 0.313895 }, { "epoch": 0.8307926829268293, "grad_norm": 1.580260992050171, "learning_rate": 9.334112344335924e-05, "loss": 0.2564751148223877, "memory(GiB)": 153.57, "step": 2180, "token_acc": 0.8815189873417721, "train_speed(iter/s)": 0.313951 }, { "epoch": 0.8326981707317073, "grad_norm": 1.73094642162323, "learning_rate": 9.331124385151099e-05, "loss": 0.2756509304046631, "memory(GiB)": 153.57, "step": 2185, "token_acc": 0.867700926524456, "train_speed(iter/s)": 0.314035 }, { "epoch": 0.8346036585365854, "grad_norm": 0.9201841354370117, "learning_rate": 9.328130217666366e-05, "loss": 0.18544576168060303, "memory(GiB)": 153.57, "step": 2190, "token_acc": 0.9115516584064048, "train_speed(iter/s)": 0.314174 }, { "epoch": 0.8365091463414634, "grad_norm": 1.5418007373809814, "learning_rate": 9.325129846173613e-05, "loss": 0.26235642433166506, "memory(GiB)": 153.57, "step": 2195, "token_acc": 0.900114372855509, "train_speed(iter/s)": 0.314323 }, { "epoch": 0.8384146341463414, "grad_norm": 1.3638105392456055, "learning_rate": 9.322123274973613e-05, "loss": 0.27008795738220215, "memory(GiB)": 153.57, "step": 2200, "token_acc": 0.8857959961868446, "train_speed(iter/s)": 0.314339 }, { "epoch": 0.8403201219512195, "grad_norm": 1.4230475425720215, "learning_rate": 9.319110508376036e-05, "loss": 0.27157442569732665, "memory(GiB)": 153.57, "step": 2205, "token_acc": 0.8980582524271845, "train_speed(iter/s)": 0.314224 }, { "epoch": 0.8422256097560976, "grad_norm": 1.1473711729049683, "learning_rate": 9.316091550699424e-05, "loss": 0.2794020175933838, "memory(GiB)": 153.57, "step": 2210, "token_acc": 0.8650378126817917, "train_speed(iter/s)": 0.314323 }, { "epoch": 0.8441310975609756, "grad_norm": 1.3989429473876953, "learning_rate": 9.3130664062712e-05, "loss": 0.23800573348999024, "memory(GiB)": 153.57, "step": 2215, "token_acc": 0.9054390330607892, "train_speed(iter/s)": 0.314433 }, { "epoch": 0.8460365853658537, "grad_norm": 1.2501412630081177, "learning_rate": 9.310035079427651e-05, "loss": 0.2912730693817139, "memory(GiB)": 153.57, "step": 2220, "token_acc": 0.8740088105726872, "train_speed(iter/s)": 0.314282 }, { "epoch": 0.8479420731707317, "grad_norm": 1.2604836225509644, "learning_rate": 9.306997574513925e-05, "loss": 0.30822076797485354, "memory(GiB)": 153.57, "step": 2225, "token_acc": 0.8752171395483498, "train_speed(iter/s)": 0.314334 }, { "epoch": 0.8498475609756098, "grad_norm": 1.050304651260376, "learning_rate": 9.303953895884033e-05, "loss": 0.20977509021759033, "memory(GiB)": 153.57, "step": 2230, "token_acc": 0.9072481572481572, "train_speed(iter/s)": 0.314416 }, { "epoch": 0.8517530487804879, "grad_norm": 1.4358839988708496, "learning_rate": 9.300904047900826e-05, "loss": 0.17531425952911378, "memory(GiB)": 153.57, "step": 2235, "token_acc": 0.9164167916041979, "train_speed(iter/s)": 0.314503 }, { "epoch": 0.8536585365853658, "grad_norm": 1.8662807941436768, "learning_rate": 9.297848034936006e-05, "loss": 0.25505714416503905, "memory(GiB)": 153.57, "step": 2240, "token_acc": 0.8919712793733682, "train_speed(iter/s)": 0.31441 }, { "epoch": 0.8555640243902439, "grad_norm": 1.1526122093200684, "learning_rate": 9.294785861370107e-05, "loss": 0.31118297576904297, "memory(GiB)": 153.57, "step": 2245, "token_acc": 0.8644026974951831, "train_speed(iter/s)": 0.314492 }, { "epoch": 0.8574695121951219, "grad_norm": 0.8138712644577026, "learning_rate": 9.291717531592494e-05, "loss": 0.30563673973083494, "memory(GiB)": 153.57, "step": 2250, "token_acc": 0.8801929777539533, "train_speed(iter/s)": 0.314591 }, { "epoch": 0.859375, "grad_norm": 0.9694454669952393, "learning_rate": 9.288643050001361e-05, "loss": 0.26598076820373534, "memory(GiB)": 153.57, "step": 2255, "token_acc": 0.8956248212753789, "train_speed(iter/s)": 0.314564 }, { "epoch": 0.8612804878048781, "grad_norm": 1.242936372756958, "learning_rate": 9.285562421003715e-05, "loss": 0.2258904457092285, "memory(GiB)": 153.57, "step": 2260, "token_acc": 0.9094496365524403, "train_speed(iter/s)": 0.314482 }, { "epoch": 0.8631859756097561, "grad_norm": 0.9733143448829651, "learning_rate": 9.282475649015377e-05, "loss": 0.2390192747116089, "memory(GiB)": 153.57, "step": 2265, "token_acc": 0.908093023255814, "train_speed(iter/s)": 0.314528 }, { "epoch": 0.8650914634146342, "grad_norm": 1.308151364326477, "learning_rate": 9.279382738460971e-05, "loss": 0.26725966930389405, "memory(GiB)": 153.57, "step": 2270, "token_acc": 0.8944723618090452, "train_speed(iter/s)": 0.314599 }, { "epoch": 0.8669969512195121, "grad_norm": 0.9255917072296143, "learning_rate": 9.276283693773926e-05, "loss": 0.28496813774108887, "memory(GiB)": 153.57, "step": 2275, "token_acc": 0.88155253178675, "train_speed(iter/s)": 0.314657 }, { "epoch": 0.8689024390243902, "grad_norm": 1.6922558546066284, "learning_rate": 9.273178519396459e-05, "loss": 0.26711699962615965, "memory(GiB)": 153.57, "step": 2280, "token_acc": 0.8852806212495588, "train_speed(iter/s)": 0.314763 }, { "epoch": 0.8708079268292683, "grad_norm": 1.6256868839263916, "learning_rate": 9.270067219779573e-05, "loss": 0.2010500431060791, "memory(GiB)": 153.57, "step": 2285, "token_acc": 0.9157782515991472, "train_speed(iter/s)": 0.314829 }, { "epoch": 0.8727134146341463, "grad_norm": 1.1766444444656372, "learning_rate": 9.266949799383053e-05, "loss": 0.24439167976379395, "memory(GiB)": 153.57, "step": 2290, "token_acc": 0.8993266546230908, "train_speed(iter/s)": 0.314805 }, { "epoch": 0.8746189024390244, "grad_norm": 1.4270025491714478, "learning_rate": 9.263826262675459e-05, "loss": 0.2677504062652588, "memory(GiB)": 153.57, "step": 2295, "token_acc": 0.8931168611533908, "train_speed(iter/s)": 0.314625 }, { "epoch": 0.8765243902439024, "grad_norm": 1.1258821487426758, "learning_rate": 9.260696614134114e-05, "loss": 0.2468870162963867, "memory(GiB)": 153.57, "step": 2300, "token_acc": 0.8948361235557651, "train_speed(iter/s)": 0.31468 }, { "epoch": 0.8784298780487805, "grad_norm": 1.5283325910568237, "learning_rate": 9.257560858245105e-05, "loss": 0.25461535453796386, "memory(GiB)": 153.57, "step": 2305, "token_acc": 0.9020114942528735, "train_speed(iter/s)": 0.314761 }, { "epoch": 0.8803353658536586, "grad_norm": 1.0486336946487427, "learning_rate": 9.254418999503271e-05, "loss": 0.25705852508544924, "memory(GiB)": 153.57, "step": 2310, "token_acc": 0.8943683409436834, "train_speed(iter/s)": 0.314841 }, { "epoch": 0.8822408536585366, "grad_norm": 0.9248655438423157, "learning_rate": 9.251271042412202e-05, "loss": 0.249931263923645, "memory(GiB)": 153.57, "step": 2315, "token_acc": 0.9087011349306431, "train_speed(iter/s)": 0.314869 }, { "epoch": 0.8841463414634146, "grad_norm": 0.9409138560295105, "learning_rate": 9.248116991484229e-05, "loss": 0.30486786365509033, "memory(GiB)": 153.57, "step": 2320, "token_acc": 0.8831168831168831, "train_speed(iter/s)": 0.314875 }, { "epoch": 0.8860518292682927, "grad_norm": 1.0848805904388428, "learning_rate": 9.244956851240411e-05, "loss": 0.24744114875793458, "memory(GiB)": 153.57, "step": 2325, "token_acc": 0.9033681765389082, "train_speed(iter/s)": 0.314948 }, { "epoch": 0.8879573170731707, "grad_norm": 1.4887359142303467, "learning_rate": 9.241790626210549e-05, "loss": 0.27587180137634276, "memory(GiB)": 153.57, "step": 2330, "token_acc": 0.8885757641448081, "train_speed(iter/s)": 0.314965 }, { "epoch": 0.8898628048780488, "grad_norm": 2.0570740699768066, "learning_rate": 9.238618320933153e-05, "loss": 0.20936784744262696, "memory(GiB)": 153.57, "step": 2335, "token_acc": 0.9060374149659864, "train_speed(iter/s)": 0.315074 }, { "epoch": 0.8917682926829268, "grad_norm": 1.6616604328155518, "learning_rate": 9.235439939955457e-05, "loss": 0.19374250173568724, "memory(GiB)": 153.57, "step": 2340, "token_acc": 0.9114529203907712, "train_speed(iter/s)": 0.315104 }, { "epoch": 0.8936737804878049, "grad_norm": 1.1072403192520142, "learning_rate": 9.232255487833398e-05, "loss": 0.30487613677978515, "memory(GiB)": 153.57, "step": 2345, "token_acc": 0.8819047619047619, "train_speed(iter/s)": 0.315139 }, { "epoch": 0.895579268292683, "grad_norm": 1.2228060960769653, "learning_rate": 9.229064969131621e-05, "loss": 0.22865872383117675, "memory(GiB)": 153.57, "step": 2350, "token_acc": 0.8995614035087719, "train_speed(iter/s)": 0.315222 }, { "epoch": 0.897484756097561, "grad_norm": 1.1726197004318237, "learning_rate": 9.225868388423463e-05, "loss": 0.2959782838821411, "memory(GiB)": 153.57, "step": 2355, "token_acc": 0.8782072642452515, "train_speed(iter/s)": 0.315055 }, { "epoch": 0.899390243902439, "grad_norm": 1.0418236255645752, "learning_rate": 9.222665750290953e-05, "loss": 0.25895066261291505, "memory(GiB)": 153.57, "step": 2360, "token_acc": 0.8892351274787536, "train_speed(iter/s)": 0.315159 }, { "epoch": 0.901295731707317, "grad_norm": 1.0100785493850708, "learning_rate": 9.219457059324799e-05, "loss": 0.23587119579315186, "memory(GiB)": 153.57, "step": 2365, "token_acc": 0.8891343643301868, "train_speed(iter/s)": 0.31503 }, { "epoch": 0.9032012195121951, "grad_norm": 0.9950655102729797, "learning_rate": 9.216242320124388e-05, "loss": 0.26404287815093996, "memory(GiB)": 153.57, "step": 2370, "token_acc": 0.8908069048574869, "train_speed(iter/s)": 0.314889 }, { "epoch": 0.9051067073170732, "grad_norm": 1.25274658203125, "learning_rate": 9.21302153729778e-05, "loss": 0.19308249950408934, "memory(GiB)": 153.57, "step": 2375, "token_acc": 0.9037828947368421, "train_speed(iter/s)": 0.314944 }, { "epoch": 0.9070121951219512, "grad_norm": 1.1500147581100464, "learning_rate": 9.20979471546169e-05, "loss": 0.2697604656219482, "memory(GiB)": 153.57, "step": 2380, "token_acc": 0.8928484848484849, "train_speed(iter/s)": 0.314995 }, { "epoch": 0.9089176829268293, "grad_norm": 2.1848814487457275, "learning_rate": 9.206561859241496e-05, "loss": 0.27335782051086427, "memory(GiB)": 153.57, "step": 2385, "token_acc": 0.8894303990252818, "train_speed(iter/s)": 0.315101 }, { "epoch": 0.9108231707317073, "grad_norm": 1.023332953453064, "learning_rate": 9.203322973271223e-05, "loss": 0.16402807235717773, "memory(GiB)": 153.57, "step": 2390, "token_acc": 0.9303361684892669, "train_speed(iter/s)": 0.314986 }, { "epoch": 0.9127286585365854, "grad_norm": 1.14621102809906, "learning_rate": 9.200078062193539e-05, "loss": 0.22965695858001708, "memory(GiB)": 153.57, "step": 2395, "token_acc": 0.9068893528183716, "train_speed(iter/s)": 0.314989 }, { "epoch": 0.9146341463414634, "grad_norm": 1.5930829048156738, "learning_rate": 9.19682713065975e-05, "loss": 0.2770206928253174, "memory(GiB)": 153.57, "step": 2400, "token_acc": 0.892449517120281, "train_speed(iter/s)": 0.31504 }, { "epoch": 0.9146341463414634, "eval_loss": 0.21461741626262665, "eval_runtime": 33.185, "eval_samples_per_second": 3.194, "eval_steps_per_second": 3.194, "eval_token_acc": 0.8893638420920291, "step": 2400 }, { "epoch": 0.9165396341463414, "grad_norm": 2.4381721019744873, "learning_rate": 9.193570183329792e-05, "loss": 0.3239989519119263, "memory(GiB)": 153.57, "step": 2405, "token_acc": 0.8853091180495086, "train_speed(iter/s)": 0.313735 }, { "epoch": 0.9184451219512195, "grad_norm": 1.3946526050567627, "learning_rate": 9.19030722487222e-05, "loss": 0.260250186920166, "memory(GiB)": 153.57, "step": 2410, "token_acc": 0.8862478777589134, "train_speed(iter/s)": 0.313621 }, { "epoch": 0.9203506097560976, "grad_norm": 1.6903891563415527, "learning_rate": 9.187038259964211e-05, "loss": 0.255747127532959, "memory(GiB)": 153.57, "step": 2415, "token_acc": 0.9059775840597758, "train_speed(iter/s)": 0.313674 }, { "epoch": 0.9222560975609756, "grad_norm": 1.324049472808838, "learning_rate": 9.183763293291549e-05, "loss": 0.23514039516448976, "memory(GiB)": 153.57, "step": 2420, "token_acc": 0.9026809651474531, "train_speed(iter/s)": 0.31375 }, { "epoch": 0.9241615853658537, "grad_norm": 1.4506843090057373, "learning_rate": 9.18048232954862e-05, "loss": 0.24096417427062988, "memory(GiB)": 153.57, "step": 2425, "token_acc": 0.8919590189382179, "train_speed(iter/s)": 0.313835 }, { "epoch": 0.9260670731707317, "grad_norm": 1.5878288745880127, "learning_rate": 9.17719537343841e-05, "loss": 0.2668702840805054, "memory(GiB)": 153.57, "step": 2430, "token_acc": 0.8954468802698144, "train_speed(iter/s)": 0.313914 }, { "epoch": 0.9279725609756098, "grad_norm": 1.6788142919540405, "learning_rate": 9.17390242967249e-05, "loss": 0.21423912048339844, "memory(GiB)": 153.57, "step": 2435, "token_acc": 0.9241316270566727, "train_speed(iter/s)": 0.314036 }, { "epoch": 0.9298780487804879, "grad_norm": 1.227168083190918, "learning_rate": 9.170603502971016e-05, "loss": 0.27565879821777345, "memory(GiB)": 153.57, "step": 2440, "token_acc": 0.8914321713565728, "train_speed(iter/s)": 0.314065 }, { "epoch": 0.9317835365853658, "grad_norm": 1.304644227027893, "learning_rate": 9.167298598062721e-05, "loss": 0.3150049924850464, "memory(GiB)": 153.57, "step": 2445, "token_acc": 0.8640406607369758, "train_speed(iter/s)": 0.314062 }, { "epoch": 0.9336890243902439, "grad_norm": 1.4278231859207153, "learning_rate": 9.163987719684907e-05, "loss": 0.25922932624816897, "memory(GiB)": 153.57, "step": 2450, "token_acc": 0.8905176362803482, "train_speed(iter/s)": 0.31408 }, { "epoch": 0.9355945121951219, "grad_norm": 1.1509865522384644, "learning_rate": 9.160670872583434e-05, "loss": 0.2804328441619873, "memory(GiB)": 153.57, "step": 2455, "token_acc": 0.8882067024814531, "train_speed(iter/s)": 0.314148 }, { "epoch": 0.9375, "grad_norm": 1.1538304090499878, "learning_rate": 9.157348061512727e-05, "loss": 0.21048405170440673, "memory(GiB)": 153.57, "step": 2460, "token_acc": 0.8927335640138409, "train_speed(iter/s)": 0.314214 }, { "epoch": 0.9394054878048781, "grad_norm": 0.5595046877861023, "learning_rate": 9.15401929123575e-05, "loss": 0.2596498966217041, "memory(GiB)": 153.57, "step": 2465, "token_acc": 0.8903137789904502, "train_speed(iter/s)": 0.314092 }, { "epoch": 0.9413109756097561, "grad_norm": 1.114023208618164, "learning_rate": 9.150684566524012e-05, "loss": 0.22133567333221435, "memory(GiB)": 153.57, "step": 2470, "token_acc": 0.8919514884233738, "train_speed(iter/s)": 0.314221 }, { "epoch": 0.9432164634146342, "grad_norm": 1.4041748046875, "learning_rate": 9.147343892157562e-05, "loss": 0.2596601486206055, "memory(GiB)": 153.57, "step": 2475, "token_acc": 0.9010349288486417, "train_speed(iter/s)": 0.314315 }, { "epoch": 0.9451219512195121, "grad_norm": 1.2525033950805664, "learning_rate": 9.143997272924973e-05, "loss": 0.2740582704544067, "memory(GiB)": 153.57, "step": 2480, "token_acc": 0.8782009192383454, "train_speed(iter/s)": 0.314406 }, { "epoch": 0.9470274390243902, "grad_norm": 1.2241204977035522, "learning_rate": 9.140644713623339e-05, "loss": 0.29618520736694337, "memory(GiB)": 153.57, "step": 2485, "token_acc": 0.8696568332329921, "train_speed(iter/s)": 0.314476 }, { "epoch": 0.9489329268292683, "grad_norm": 1.0669825077056885, "learning_rate": 9.13728621905827e-05, "loss": 0.2624840259552002, "memory(GiB)": 153.57, "step": 2490, "token_acc": 0.8718164951931185, "train_speed(iter/s)": 0.314357 }, { "epoch": 0.9508384146341463, "grad_norm": 1.5949070453643799, "learning_rate": 9.133921794043885e-05, "loss": 0.260730242729187, "memory(GiB)": 153.57, "step": 2495, "token_acc": 0.8870824711794265, "train_speed(iter/s)": 0.314119 }, { "epoch": 0.9527439024390244, "grad_norm": 1.4317415952682495, "learning_rate": 9.130551443402799e-05, "loss": 0.20654239654541015, "memory(GiB)": 153.57, "step": 2500, "token_acc": 0.9125943583631307, "train_speed(iter/s)": 0.314203 }, { "epoch": 0.9546493902439024, "grad_norm": 1.292273759841919, "learning_rate": 9.127175171966126e-05, "loss": 0.2993767261505127, "memory(GiB)": 153.57, "step": 2505, "token_acc": 0.881795195954488, "train_speed(iter/s)": 0.314234 }, { "epoch": 0.9565548780487805, "grad_norm": 1.0115107297897339, "learning_rate": 9.123792984573466e-05, "loss": 0.2730101108551025, "memory(GiB)": 153.57, "step": 2510, "token_acc": 0.8892952127659575, "train_speed(iter/s)": 0.314261 }, { "epoch": 0.9584603658536586, "grad_norm": 1.197665810585022, "learning_rate": 9.120404886072898e-05, "loss": 0.26359126567840574, "memory(GiB)": 153.57, "step": 2515, "token_acc": 0.888421052631579, "train_speed(iter/s)": 0.31399 }, { "epoch": 0.9603658536585366, "grad_norm": 1.1726064682006836, "learning_rate": 9.117010881320973e-05, "loss": 0.25237534046173093, "memory(GiB)": 153.57, "step": 2520, "token_acc": 0.8927586206896552, "train_speed(iter/s)": 0.313692 }, { "epoch": 0.9622713414634146, "grad_norm": 0.9211184978485107, "learning_rate": 9.11361097518271e-05, "loss": 0.2998049259185791, "memory(GiB)": 153.57, "step": 2525, "token_acc": 0.8806993486458691, "train_speed(iter/s)": 0.313705 }, { "epoch": 0.9641768292682927, "grad_norm": 1.2097318172454834, "learning_rate": 9.110205172531585e-05, "loss": 0.3148249626159668, "memory(GiB)": 153.57, "step": 2530, "token_acc": 0.8758835758835759, "train_speed(iter/s)": 0.313729 }, { "epoch": 0.9660823170731707, "grad_norm": 1.2977283000946045, "learning_rate": 9.106793478249531e-05, "loss": 0.2710968732833862, "memory(GiB)": 153.57, "step": 2535, "token_acc": 0.8903189703413542, "train_speed(iter/s)": 0.313812 }, { "epoch": 0.9679878048780488, "grad_norm": 1.4274362325668335, "learning_rate": 9.103375897226918e-05, "loss": 0.2515212059020996, "memory(GiB)": 153.57, "step": 2540, "token_acc": 0.8929133858267716, "train_speed(iter/s)": 0.313889 }, { "epoch": 0.9698932926829268, "grad_norm": 1.3239736557006836, "learning_rate": 9.099952434362563e-05, "loss": 0.2405371904373169, "memory(GiB)": 153.57, "step": 2545, "token_acc": 0.9018909899888765, "train_speed(iter/s)": 0.313917 }, { "epoch": 0.9717987804878049, "grad_norm": 1.2070719003677368, "learning_rate": 9.096523094563708e-05, "loss": 0.2743818283081055, "memory(GiB)": 153.57, "step": 2550, "token_acc": 0.8826174949404093, "train_speed(iter/s)": 0.313739 }, { "epoch": 0.973704268292683, "grad_norm": 1.3678795099258423, "learning_rate": 9.093087882746021e-05, "loss": 0.2657839298248291, "memory(GiB)": 153.57, "step": 2555, "token_acc": 0.8954174228675136, "train_speed(iter/s)": 0.313643 }, { "epoch": 0.975609756097561, "grad_norm": 0.8178489208221436, "learning_rate": 9.089646803833589e-05, "loss": 0.27731928825378416, "memory(GiB)": 153.57, "step": 2560, "token_acc": 0.8828809574695448, "train_speed(iter/s)": 0.31369 }, { "epoch": 0.977515243902439, "grad_norm": 0.8200597763061523, "learning_rate": 9.086199862758905e-05, "loss": 0.19488768577575682, "memory(GiB)": 153.57, "step": 2565, "token_acc": 0.9151712887438825, "train_speed(iter/s)": 0.313755 }, { "epoch": 0.979420731707317, "grad_norm": 1.083716630935669, "learning_rate": 9.082747064462867e-05, "loss": 0.27525689601898196, "memory(GiB)": 153.57, "step": 2570, "token_acc": 0.8765547993428773, "train_speed(iter/s)": 0.313631 }, { "epoch": 0.9813262195121951, "grad_norm": 1.828143835067749, "learning_rate": 9.07928841389477e-05, "loss": 0.2675456523895264, "memory(GiB)": 153.57, "step": 2575, "token_acc": 0.885562831306517, "train_speed(iter/s)": 0.313721 }, { "epoch": 0.9832317073170732, "grad_norm": 1.069172978401184, "learning_rate": 9.075823916012298e-05, "loss": 0.24997663497924805, "memory(GiB)": 153.57, "step": 2580, "token_acc": 0.8949846950788792, "train_speed(iter/s)": 0.313636 }, { "epoch": 0.9851371951219512, "grad_norm": 1.1798211336135864, "learning_rate": 9.072353575781512e-05, "loss": 0.2428279161453247, "memory(GiB)": 153.57, "step": 2585, "token_acc": 0.901111474338019, "train_speed(iter/s)": 0.313677 }, { "epoch": 0.9870426829268293, "grad_norm": 0.9670788645744324, "learning_rate": 9.068877398176852e-05, "loss": 0.20313148498535155, "memory(GiB)": 153.57, "step": 2590, "token_acc": 0.9013227513227513, "train_speed(iter/s)": 0.313737 }, { "epoch": 0.9889481707317073, "grad_norm": 1.3256622552871704, "learning_rate": 9.065395388181125e-05, "loss": 0.2606493473052979, "memory(GiB)": 153.57, "step": 2595, "token_acc": 0.897887323943662, "train_speed(iter/s)": 0.3136 }, { "epoch": 0.9908536585365854, "grad_norm": 1.1462643146514893, "learning_rate": 9.061907550785498e-05, "loss": 0.26296310424804686, "memory(GiB)": 153.57, "step": 2600, "token_acc": 0.9035244922341696, "train_speed(iter/s)": 0.313666 }, { "epoch": 0.9927591463414634, "grad_norm": 1.665973424911499, "learning_rate": 9.058413890989488e-05, "loss": 0.24817824363708496, "memory(GiB)": 153.57, "step": 2605, "token_acc": 0.8824193184382758, "train_speed(iter/s)": 0.313718 }, { "epoch": 0.9946646341463414, "grad_norm": 1.2561821937561035, "learning_rate": 9.054914413800961e-05, "loss": 0.22882814407348634, "memory(GiB)": 153.57, "step": 2610, "token_acc": 0.9121776870053613, "train_speed(iter/s)": 0.313736 }, { "epoch": 0.9965701219512195, "grad_norm": 0.9707039594650269, "learning_rate": 9.051409124236122e-05, "loss": 0.23701071739196777, "memory(GiB)": 153.57, "step": 2615, "token_acc": 0.8911758265820013, "train_speed(iter/s)": 0.313777 }, { "epoch": 0.9984756097560976, "grad_norm": 1.5666323900222778, "learning_rate": 9.047898027319507e-05, "loss": 0.21503009796142578, "memory(GiB)": 153.57, "step": 2620, "token_acc": 0.9163896701610841, "train_speed(iter/s)": 0.313833 }, { "epoch": 1.0003810975609757, "grad_norm": 1.2882250547409058, "learning_rate": 9.044381128083974e-05, "loss": 0.29809937477111814, "memory(GiB)": 153.57, "step": 2625, "token_acc": 0.873709987755816, "train_speed(iter/s)": 0.313909 }, { "epoch": 1.0022865853658536, "grad_norm": 1.3313795328140259, "learning_rate": 9.040858431570702e-05, "loss": 0.25233683586120603, "memory(GiB)": 153.57, "step": 2630, "token_acc": 0.8971812940422806, "train_speed(iter/s)": 0.313835 }, { "epoch": 1.0041920731707317, "grad_norm": 1.1467128992080688, "learning_rate": 9.037329942829178e-05, "loss": 0.2719454526901245, "memory(GiB)": 153.57, "step": 2635, "token_acc": 0.886164274322169, "train_speed(iter/s)": 0.313881 }, { "epoch": 1.0060975609756098, "grad_norm": 1.2956342697143555, "learning_rate": 9.033795666917191e-05, "loss": 0.21656830310821534, "memory(GiB)": 153.57, "step": 2640, "token_acc": 0.9089464123524069, "train_speed(iter/s)": 0.313948 }, { "epoch": 1.0080030487804879, "grad_norm": 1.354995608329773, "learning_rate": 9.030255608900826e-05, "loss": 0.22435028553009034, "memory(GiB)": 153.57, "step": 2645, "token_acc": 0.9200873362445415, "train_speed(iter/s)": 0.314042 }, { "epoch": 1.009908536585366, "grad_norm": 1.3393521308898926, "learning_rate": 9.026709773854457e-05, "loss": 0.24945261478424072, "memory(GiB)": 153.57, "step": 2650, "token_acc": 0.8998996487706974, "train_speed(iter/s)": 0.314075 }, { "epoch": 1.0118140243902438, "grad_norm": 1.3517801761627197, "learning_rate": 9.023158166860737e-05, "loss": 0.28362038135528567, "memory(GiB)": 153.57, "step": 2655, "token_acc": 0.8796655718124813, "train_speed(iter/s)": 0.314135 }, { "epoch": 1.013719512195122, "grad_norm": 1.0189573764801025, "learning_rate": 9.019600793010597e-05, "loss": 0.1706958532333374, "memory(GiB)": 153.57, "step": 2660, "token_acc": 0.9171113482746354, "train_speed(iter/s)": 0.314218 }, { "epoch": 1.015625, "grad_norm": 1.7680280208587646, "learning_rate": 9.016037657403224e-05, "loss": 0.29625513553619387, "memory(GiB)": 153.57, "step": 2665, "token_acc": 0.8620775969962453, "train_speed(iter/s)": 0.314311 }, { "epoch": 1.017530487804878, "grad_norm": 1.3570187091827393, "learning_rate": 9.012468765146079e-05, "loss": 0.2465003252029419, "memory(GiB)": 153.57, "step": 2670, "token_acc": 0.9032576505429417, "train_speed(iter/s)": 0.314002 }, { "epoch": 1.0194359756097562, "grad_norm": 1.504968523979187, "learning_rate": 9.008894121354862e-05, "loss": 0.20558226108551025, "memory(GiB)": 153.57, "step": 2675, "token_acc": 0.914204163868368, "train_speed(iter/s)": 0.313859 }, { "epoch": 1.021341463414634, "grad_norm": 0.8436688184738159, "learning_rate": 9.005313731153524e-05, "loss": 0.217226243019104, "memory(GiB)": 153.57, "step": 2680, "token_acc": 0.8838120104438643, "train_speed(iter/s)": 0.313915 }, { "epoch": 1.0232469512195121, "grad_norm": 1.5977096557617188, "learning_rate": 9.001727599674251e-05, "loss": 0.20629043579101564, "memory(GiB)": 153.57, "step": 2685, "token_acc": 0.9107675111773472, "train_speed(iter/s)": 0.313958 }, { "epoch": 1.0251524390243902, "grad_norm": 0.9728628396987915, "learning_rate": 8.998135732057458e-05, "loss": 0.21682386398315429, "memory(GiB)": 153.57, "step": 2690, "token_acc": 0.9090425531914894, "train_speed(iter/s)": 0.31403 }, { "epoch": 1.0270579268292683, "grad_norm": 1.0089056491851807, "learning_rate": 8.994538133451782e-05, "loss": 0.23635997772216796, "memory(GiB)": 153.57, "step": 2695, "token_acc": 0.8997539975399754, "train_speed(iter/s)": 0.314062 }, { "epoch": 1.0289634146341464, "grad_norm": 1.150850772857666, "learning_rate": 8.990934809014077e-05, "loss": 0.17320096492767334, "memory(GiB)": 153.57, "step": 2700, "token_acc": 0.90013633265167, "train_speed(iter/s)": 0.314166 }, { "epoch": 1.0308689024390243, "grad_norm": 0.8839222192764282, "learning_rate": 8.987325763909404e-05, "loss": 0.16326476335525514, "memory(GiB)": 153.57, "step": 2705, "token_acc": 0.9207077326343381, "train_speed(iter/s)": 0.314228 }, { "epoch": 1.0327743902439024, "grad_norm": 1.6380645036697388, "learning_rate": 8.983711003311024e-05, "loss": 0.24515087604522706, "memory(GiB)": 153.57, "step": 2710, "token_acc": 0.8993255469649614, "train_speed(iter/s)": 0.314258 }, { "epoch": 1.0346798780487805, "grad_norm": 1.088632583618164, "learning_rate": 8.980090532400387e-05, "loss": 0.24659194946289062, "memory(GiB)": 153.57, "step": 2715, "token_acc": 0.897362003990246, "train_speed(iter/s)": 0.314302 }, { "epoch": 1.0365853658536586, "grad_norm": 1.646657943725586, "learning_rate": 8.976464356367134e-05, "loss": 0.25817556381225587, "memory(GiB)": 153.57, "step": 2720, "token_acc": 0.8958115183246074, "train_speed(iter/s)": 0.314365 }, { "epoch": 1.0384908536585367, "grad_norm": 2.0975563526153564, "learning_rate": 8.972832480409079e-05, "loss": 0.2013038158416748, "memory(GiB)": 153.57, "step": 2725, "token_acc": 0.9222854640980735, "train_speed(iter/s)": 0.314419 }, { "epoch": 1.0403963414634145, "grad_norm": 1.1599847078323364, "learning_rate": 8.96919490973221e-05, "loss": 0.20942163467407227, "memory(GiB)": 153.57, "step": 2730, "token_acc": 0.9054921540656206, "train_speed(iter/s)": 0.314408 }, { "epoch": 1.0423018292682926, "grad_norm": 0.7143600583076477, "learning_rate": 8.965551649550676e-05, "loss": 0.25589134693145754, "memory(GiB)": 153.57, "step": 2735, "token_acc": 0.8911584476218267, "train_speed(iter/s)": 0.314465 }, { "epoch": 1.0442073170731707, "grad_norm": 1.5100138187408447, "learning_rate": 8.961902705086785e-05, "loss": 0.189524245262146, "memory(GiB)": 153.57, "step": 2740, "token_acc": 0.9159449821701477, "train_speed(iter/s)": 0.314348 }, { "epoch": 1.0461128048780488, "grad_norm": 1.9462368488311768, "learning_rate": 8.958248081570984e-05, "loss": 0.24768528938293458, "memory(GiB)": 153.57, "step": 2745, "token_acc": 0.9070168711656442, "train_speed(iter/s)": 0.314394 }, { "epoch": 1.048018292682927, "grad_norm": 1.1959118843078613, "learning_rate": 8.954587784241871e-05, "loss": 0.21952486038208008, "memory(GiB)": 153.57, "step": 2750, "token_acc": 0.8816938853019369, "train_speed(iter/s)": 0.31442 }, { "epoch": 1.0499237804878048, "grad_norm": 0.81492018699646, "learning_rate": 8.950921818346168e-05, "loss": 0.26012754440307617, "memory(GiB)": 153.57, "step": 2755, "token_acc": 0.8947630302276403, "train_speed(iter/s)": 0.314399 }, { "epoch": 1.0518292682926829, "grad_norm": 1.0227842330932617, "learning_rate": 8.947250189138731e-05, "loss": 0.18723193407058716, "memory(GiB)": 153.57, "step": 2760, "token_acc": 0.9179088809573798, "train_speed(iter/s)": 0.314441 }, { "epoch": 1.053734756097561, "grad_norm": 1.406761884689331, "learning_rate": 8.943572901882526e-05, "loss": 0.20757932662963868, "memory(GiB)": 153.57, "step": 2765, "token_acc": 0.8987427031881455, "train_speed(iter/s)": 0.314461 }, { "epoch": 1.055640243902439, "grad_norm": 1.4740697145462036, "learning_rate": 8.939889961848634e-05, "loss": 0.2331608533859253, "memory(GiB)": 153.57, "step": 2770, "token_acc": 0.904697583671019, "train_speed(iter/s)": 0.314465 }, { "epoch": 1.0575457317073171, "grad_norm": 1.1564308404922485, "learning_rate": 8.936201374316237e-05, "loss": 0.2693225383758545, "memory(GiB)": 153.57, "step": 2775, "token_acc": 0.8879983125922801, "train_speed(iter/s)": 0.31451 }, { "epoch": 1.0594512195121952, "grad_norm": 1.039127230644226, "learning_rate": 8.932507144572616e-05, "loss": 0.2634108543395996, "memory(GiB)": 153.57, "step": 2780, "token_acc": 0.899005593536358, "train_speed(iter/s)": 0.31453 }, { "epoch": 1.061356707317073, "grad_norm": 1.4310401678085327, "learning_rate": 8.928807277913132e-05, "loss": 0.26037750244140623, "memory(GiB)": 153.57, "step": 2785, "token_acc": 0.8885813148788927, "train_speed(iter/s)": 0.314599 }, { "epoch": 1.0632621951219512, "grad_norm": 1.544795274734497, "learning_rate": 8.925101779641232e-05, "loss": 0.2013012409210205, "memory(GiB)": 153.57, "step": 2790, "token_acc": 0.9126791620727673, "train_speed(iter/s)": 0.314504 }, { "epoch": 1.0651676829268293, "grad_norm": 0.875274658203125, "learning_rate": 8.921390655068433e-05, "loss": 0.19657191038131713, "memory(GiB)": 153.57, "step": 2795, "token_acc": 0.9141423885780436, "train_speed(iter/s)": 0.314345 }, { "epoch": 1.0670731707317074, "grad_norm": 1.3879762887954712, "learning_rate": 8.917673909514322e-05, "loss": 0.23002052307128906, "memory(GiB)": 153.57, "step": 2800, "token_acc": 0.8979926259729619, "train_speed(iter/s)": 0.314452 }, { "epoch": 1.0689786585365855, "grad_norm": 1.3197975158691406, "learning_rate": 8.913951548306532e-05, "loss": 0.22526030540466307, "memory(GiB)": 153.57, "step": 2805, "token_acc": 0.8876080691642652, "train_speed(iter/s)": 0.314545 }, { "epoch": 1.0708841463414633, "grad_norm": 1.2373971939086914, "learning_rate": 8.910223576780758e-05, "loss": 0.2628974914550781, "memory(GiB)": 153.57, "step": 2810, "token_acc": 0.8933066933066933, "train_speed(iter/s)": 0.314523 }, { "epoch": 1.0727896341463414, "grad_norm": 1.2534942626953125, "learning_rate": 8.906490000280729e-05, "loss": 0.25473973751068113, "memory(GiB)": 153.57, "step": 2815, "token_acc": 0.8951132300357568, "train_speed(iter/s)": 0.314254 }, { "epoch": 1.0746951219512195, "grad_norm": 1.4723511934280396, "learning_rate": 8.902750824158212e-05, "loss": 0.264472222328186, "memory(GiB)": 153.57, "step": 2820, "token_acc": 0.8873091100579252, "train_speed(iter/s)": 0.314308 }, { "epoch": 1.0766006097560976, "grad_norm": 1.1797399520874023, "learning_rate": 8.899006053772998e-05, "loss": 0.21063518524169922, "memory(GiB)": 153.57, "step": 2825, "token_acc": 0.9020109689213894, "train_speed(iter/s)": 0.314429 }, { "epoch": 1.0785060975609757, "grad_norm": 1.6741175651550293, "learning_rate": 8.895255694492896e-05, "loss": 0.22530629634857177, "memory(GiB)": 153.57, "step": 2830, "token_acc": 0.9183372019413378, "train_speed(iter/s)": 0.314488 }, { "epoch": 1.0804115853658536, "grad_norm": 1.0855587720870972, "learning_rate": 8.891499751693735e-05, "loss": 0.21100544929504395, "memory(GiB)": 153.57, "step": 2835, "token_acc": 0.9034894687205282, "train_speed(iter/s)": 0.314566 }, { "epoch": 1.0823170731707317, "grad_norm": 1.0574440956115723, "learning_rate": 8.887738230759333e-05, "loss": 0.21955995559692382, "memory(GiB)": 153.57, "step": 2840, "token_acc": 0.9088931851135814, "train_speed(iter/s)": 0.314594 }, { "epoch": 1.0842225609756098, "grad_norm": 1.2290796041488647, "learning_rate": 8.883971137081517e-05, "loss": 0.20180702209472656, "memory(GiB)": 153.57, "step": 2845, "token_acc": 0.9076369452219112, "train_speed(iter/s)": 0.314684 }, { "epoch": 1.0861280487804879, "grad_norm": 1.303459882736206, "learning_rate": 8.880198476060095e-05, "loss": 0.2473137617111206, "memory(GiB)": 153.57, "step": 2850, "token_acc": 0.8929317029525797, "train_speed(iter/s)": 0.314786 }, { "epoch": 1.088033536585366, "grad_norm": 1.3740986585617065, "learning_rate": 8.876420253102856e-05, "loss": 0.21946237087249756, "memory(GiB)": 153.57, "step": 2855, "token_acc": 0.9027630180658873, "train_speed(iter/s)": 0.314755 }, { "epoch": 1.0899390243902438, "grad_norm": 1.2033692598342896, "learning_rate": 8.872636473625565e-05, "loss": 0.22226312160491943, "memory(GiB)": 153.57, "step": 2860, "token_acc": 0.9131772346607417, "train_speed(iter/s)": 0.31473 }, { "epoch": 1.091844512195122, "grad_norm": 1.3270052671432495, "learning_rate": 8.868847143051946e-05, "loss": 0.1942179322242737, "memory(GiB)": 153.57, "step": 2865, "token_acc": 0.9197948239682909, "train_speed(iter/s)": 0.314775 }, { "epoch": 1.09375, "grad_norm": 1.5063115358352661, "learning_rate": 8.865052266813685e-05, "loss": 0.2597026824951172, "memory(GiB)": 153.57, "step": 2870, "token_acc": 0.8985272697847548, "train_speed(iter/s)": 0.314794 }, { "epoch": 1.095655487804878, "grad_norm": 1.0648983716964722, "learning_rate": 8.861251850350417e-05, "loss": 0.1811489224433899, "memory(GiB)": 153.57, "step": 2875, "token_acc": 0.9275399690562145, "train_speed(iter/s)": 0.314836 }, { "epoch": 1.0975609756097562, "grad_norm": 1.1470153331756592, "learning_rate": 8.857445899109715e-05, "loss": 0.1859492301940918, "memory(GiB)": 153.57, "step": 2880, "token_acc": 0.9240327591075967, "train_speed(iter/s)": 0.314903 }, { "epoch": 1.099466463414634, "grad_norm": 1.2174228429794312, "learning_rate": 8.85363441854709e-05, "loss": 0.2296205759048462, "memory(GiB)": 153.57, "step": 2885, "token_acc": 0.889633601429848, "train_speed(iter/s)": 0.315017 }, { "epoch": 1.1013719512195121, "grad_norm": 1.508866548538208, "learning_rate": 8.849817414125973e-05, "loss": 0.2820731163024902, "memory(GiB)": 153.57, "step": 2890, "token_acc": 0.8831268165825302, "train_speed(iter/s)": 0.314992 }, { "epoch": 1.1032774390243902, "grad_norm": 0.9316002130508423, "learning_rate": 8.845994891317719e-05, "loss": 0.26219098567962645, "memory(GiB)": 153.57, "step": 2895, "token_acc": 0.8848144952545298, "train_speed(iter/s)": 0.315033 }, { "epoch": 1.1051829268292683, "grad_norm": 0.9679893255233765, "learning_rate": 8.84216685560159e-05, "loss": 0.24569568634033204, "memory(GiB)": 153.57, "step": 2900, "token_acc": 0.9023295575977773, "train_speed(iter/s)": 0.314964 }, { "epoch": 1.1070884146341464, "grad_norm": 1.0409789085388184, "learning_rate": 8.838333312464753e-05, "loss": 0.17699381113052368, "memory(GiB)": 153.57, "step": 2905, "token_acc": 0.912633723892002, "train_speed(iter/s)": 0.315013 }, { "epoch": 1.1089939024390243, "grad_norm": 1.032784342765808, "learning_rate": 8.834494267402263e-05, "loss": 0.2527631282806396, "memory(GiB)": 153.57, "step": 2910, "token_acc": 0.8908130319929557, "train_speed(iter/s)": 0.31506 }, { "epoch": 1.1108993902439024, "grad_norm": 1.1286613941192627, "learning_rate": 8.83064972591707e-05, "loss": 0.19412436485290527, "memory(GiB)": 153.57, "step": 2915, "token_acc": 0.8999676270637746, "train_speed(iter/s)": 0.315133 }, { "epoch": 1.1128048780487805, "grad_norm": 1.7684135437011719, "learning_rate": 8.826799693519996e-05, "loss": 0.2014693021774292, "memory(GiB)": 153.57, "step": 2920, "token_acc": 0.9246491763270287, "train_speed(iter/s)": 0.315225 }, { "epoch": 1.1147103658536586, "grad_norm": 1.4437639713287354, "learning_rate": 8.822944175729737e-05, "loss": 0.20496664047241211, "memory(GiB)": 153.57, "step": 2925, "token_acc": 0.9154644393824348, "train_speed(iter/s)": 0.315291 }, { "epoch": 1.1166158536585367, "grad_norm": 1.9762517213821411, "learning_rate": 8.819083178072852e-05, "loss": 0.21904253959655762, "memory(GiB)": 153.57, "step": 2930, "token_acc": 0.9014483627204031, "train_speed(iter/s)": 0.315397 }, { "epoch": 1.1185213414634145, "grad_norm": 2.034135103225708, "learning_rate": 8.815216706083751e-05, "loss": 0.17686333656311035, "memory(GiB)": 153.57, "step": 2935, "token_acc": 0.9278350515463918, "train_speed(iter/s)": 0.315513 }, { "epoch": 1.1204268292682926, "grad_norm": 1.7080367803573608, "learning_rate": 8.811344765304698e-05, "loss": 0.22477998733520507, "memory(GiB)": 153.57, "step": 2940, "token_acc": 0.8962595577598678, "train_speed(iter/s)": 0.315572 }, { "epoch": 1.1223323170731707, "grad_norm": 0.7846762537956238, "learning_rate": 8.807467361285793e-05, "loss": 0.21013574600219725, "memory(GiB)": 153.57, "step": 2945, "token_acc": 0.9213953488372093, "train_speed(iter/s)": 0.315601 }, { "epoch": 1.1242378048780488, "grad_norm": 1.1570621728897095, "learning_rate": 8.80358449958496e-05, "loss": 0.2553663969039917, "memory(GiB)": 153.57, "step": 2950, "token_acc": 0.8951716541215348, "train_speed(iter/s)": 0.315528 }, { "epoch": 1.126143292682927, "grad_norm": 0.9512637257575989, "learning_rate": 8.799696185767958e-05, "loss": 0.20932090282440186, "memory(GiB)": 153.57, "step": 2955, "token_acc": 0.9135163127912999, "train_speed(iter/s)": 0.315594 }, { "epoch": 1.1280487804878048, "grad_norm": 1.198053240776062, "learning_rate": 8.795802425408352e-05, "loss": 0.2667028188705444, "memory(GiB)": 153.57, "step": 2960, "token_acc": 0.889353958143767, "train_speed(iter/s)": 0.315625 }, { "epoch": 1.1299542682926829, "grad_norm": 0.9259385466575623, "learning_rate": 8.791903224087521e-05, "loss": 0.2622497797012329, "memory(GiB)": 153.57, "step": 2965, "token_acc": 0.8880688806888068, "train_speed(iter/s)": 0.315658 }, { "epoch": 1.131859756097561, "grad_norm": 1.2207560539245605, "learning_rate": 8.787998587394637e-05, "loss": 0.2503600358963013, "memory(GiB)": 153.57, "step": 2970, "token_acc": 0.8928571428571429, "train_speed(iter/s)": 0.315169 }, { "epoch": 1.133765243902439, "grad_norm": 1.8474041223526, "learning_rate": 8.784088520926667e-05, "loss": 0.23761434555053712, "memory(GiB)": 153.57, "step": 2975, "token_acc": 0.9180679785330949, "train_speed(iter/s)": 0.315041 }, { "epoch": 1.1356707317073171, "grad_norm": 1.4863240718841553, "learning_rate": 8.780173030288359e-05, "loss": 0.2063281774520874, "memory(GiB)": 153.57, "step": 2980, "token_acc": 0.91675, "train_speed(iter/s)": 0.315093 }, { "epoch": 1.1375762195121952, "grad_norm": 0.8709113001823425, "learning_rate": 8.77625212109224e-05, "loss": 0.18892886638641357, "memory(GiB)": 153.57, "step": 2985, "token_acc": 0.9140407288317256, "train_speed(iter/s)": 0.315138 }, { "epoch": 1.139481707317073, "grad_norm": 1.1238645315170288, "learning_rate": 8.772325798958597e-05, "loss": 0.21570391654968263, "memory(GiB)": 153.57, "step": 2990, "token_acc": 0.9085258964143427, "train_speed(iter/s)": 0.315151 }, { "epoch": 1.1413871951219512, "grad_norm": 0.321491539478302, "learning_rate": 8.768394069515484e-05, "loss": 0.20554850101470948, "memory(GiB)": 153.57, "step": 2995, "token_acc": 0.9003412969283277, "train_speed(iter/s)": 0.315194 }, { "epoch": 1.1432926829268293, "grad_norm": 0.8923466801643372, "learning_rate": 8.7644569383987e-05, "loss": 0.20917270183563233, "memory(GiB)": 153.57, "step": 3000, "token_acc": 0.9020408163265307, "train_speed(iter/s)": 0.315262 }, { "epoch": 1.1451981707317074, "grad_norm": 0.9825434684753418, "learning_rate": 8.760514411251788e-05, "loss": 0.235619854927063, "memory(GiB)": 153.57, "step": 3005, "token_acc": 0.8901998097050429, "train_speed(iter/s)": 0.315271 }, { "epoch": 1.1471036585365852, "grad_norm": 0.9844560623168945, "learning_rate": 8.75656649372603e-05, "loss": 0.2417755365371704, "memory(GiB)": 153.57, "step": 3010, "token_acc": 0.8917966784096628, "train_speed(iter/s)": 0.315317 }, { "epoch": 1.1490091463414633, "grad_norm": 1.3099067211151123, "learning_rate": 8.752613191480429e-05, "loss": 0.2861191272735596, "memory(GiB)": 153.57, "step": 3015, "token_acc": 0.8828710644677661, "train_speed(iter/s)": 0.315331 }, { "epoch": 1.1509146341463414, "grad_norm": 1.9530308246612549, "learning_rate": 8.748654510181709e-05, "loss": 0.22751429080963134, "memory(GiB)": 153.57, "step": 3020, "token_acc": 0.9191748393642205, "train_speed(iter/s)": 0.315269 }, { "epoch": 1.1528201219512195, "grad_norm": 1.295230746269226, "learning_rate": 8.744690455504305e-05, "loss": 0.22434914112091064, "memory(GiB)": 153.57, "step": 3025, "token_acc": 0.8995535714285714, "train_speed(iter/s)": 0.315378 }, { "epoch": 1.1547256097560976, "grad_norm": 1.3238247632980347, "learning_rate": 8.740721033130352e-05, "loss": 0.20937294960021974, "memory(GiB)": 153.57, "step": 3030, "token_acc": 0.9173887926054304, "train_speed(iter/s)": 0.315399 }, { "epoch": 1.1566310975609757, "grad_norm": 0.8454665541648865, "learning_rate": 8.736746248749683e-05, "loss": 0.19293515682220458, "memory(GiB)": 153.57, "step": 3035, "token_acc": 0.8984823625922888, "train_speed(iter/s)": 0.31543 }, { "epoch": 1.1585365853658536, "grad_norm": 1.3990658521652222, "learning_rate": 8.732766108059813e-05, "loss": 0.20673666000366211, "memory(GiB)": 153.57, "step": 3040, "token_acc": 0.9076479076479076, "train_speed(iter/s)": 0.315497 }, { "epoch": 1.1604420731707317, "grad_norm": 1.0747017860412598, "learning_rate": 8.728780616765938e-05, "loss": 0.22226285934448242, "memory(GiB)": 153.57, "step": 3045, "token_acc": 0.9113814074717637, "train_speed(iter/s)": 0.315526 }, { "epoch": 1.1623475609756098, "grad_norm": 1.1713976860046387, "learning_rate": 8.72478978058092e-05, "loss": 0.18758519887924194, "memory(GiB)": 153.57, "step": 3050, "token_acc": 0.9209039548022598, "train_speed(iter/s)": 0.315569 }, { "epoch": 1.1642530487804879, "grad_norm": 1.0636019706726074, "learning_rate": 8.720793605225289e-05, "loss": 0.20971004962921141, "memory(GiB)": 153.57, "step": 3055, "token_acc": 0.9097888675623801, "train_speed(iter/s)": 0.315622 }, { "epoch": 1.166158536585366, "grad_norm": 1.2704837322235107, "learning_rate": 8.716792096427217e-05, "loss": 0.23917412757873535, "memory(GiB)": 153.57, "step": 3060, "token_acc": 0.8948318663958474, "train_speed(iter/s)": 0.315637 }, { "epoch": 1.1680640243902438, "grad_norm": 0.8801584243774414, "learning_rate": 8.712785259922533e-05, "loss": 0.17196304798126222, "memory(GiB)": 153.57, "step": 3065, "token_acc": 0.9105209397344228, "train_speed(iter/s)": 0.315661 }, { "epoch": 1.169969512195122, "grad_norm": 1.6203020811080933, "learning_rate": 8.708773101454697e-05, "loss": 0.25239999294281007, "memory(GiB)": 153.57, "step": 3070, "token_acc": 0.9004287756074321, "train_speed(iter/s)": 0.315446 }, { "epoch": 1.171875, "grad_norm": 0.9971458315849304, "learning_rate": 8.704755626774796e-05, "loss": 0.25118656158447267, "memory(GiB)": 153.57, "step": 3075, "token_acc": 0.900459418070444, "train_speed(iter/s)": 0.315512 }, { "epoch": 1.173780487804878, "grad_norm": 0.7995221018791199, "learning_rate": 8.700732841641542e-05, "loss": 0.21467065811157227, "memory(GiB)": 153.57, "step": 3080, "token_acc": 0.901794834379104, "train_speed(iter/s)": 0.315469 }, { "epoch": 1.1756859756097562, "grad_norm": 1.5310572385787964, "learning_rate": 8.696704751821256e-05, "loss": 0.23184475898742676, "memory(GiB)": 153.57, "step": 3085, "token_acc": 0.9058874086807047, "train_speed(iter/s)": 0.315499 }, { "epoch": 1.177591463414634, "grad_norm": 1.8125377893447876, "learning_rate": 8.692671363087863e-05, "loss": 0.23920886516571044, "memory(GiB)": 153.57, "step": 3090, "token_acc": 0.9080020387359837, "train_speed(iter/s)": 0.315556 }, { "epoch": 1.1794969512195121, "grad_norm": 1.3982175588607788, "learning_rate": 8.688632681222886e-05, "loss": 0.23753468990325927, "memory(GiB)": 153.57, "step": 3095, "token_acc": 0.8953173374613003, "train_speed(iter/s)": 0.315583 }, { "epoch": 1.1814024390243902, "grad_norm": 1.4944342374801636, "learning_rate": 8.68458871201543e-05, "loss": 0.23452892303466796, "memory(GiB)": 153.57, "step": 3100, "token_acc": 0.9125907226254338, "train_speed(iter/s)": 0.315514 }, { "epoch": 1.1833079268292683, "grad_norm": 1.5237213373184204, "learning_rate": 8.680539461262186e-05, "loss": 0.2121495246887207, "memory(GiB)": 153.57, "step": 3105, "token_acc": 0.911620294599018, "train_speed(iter/s)": 0.315516 }, { "epoch": 1.1852134146341464, "grad_norm": 1.08982515335083, "learning_rate": 8.676484934767409e-05, "loss": 0.2166914939880371, "memory(GiB)": 153.57, "step": 3110, "token_acc": 0.9127622034895119, "train_speed(iter/s)": 0.315529 }, { "epoch": 1.1871189024390243, "grad_norm": 1.2586420774459839, "learning_rate": 8.672425138342919e-05, "loss": 0.26205596923828123, "memory(GiB)": 153.57, "step": 3115, "token_acc": 0.8968937614199948, "train_speed(iter/s)": 0.315592 }, { "epoch": 1.1890243902439024, "grad_norm": 1.215455174446106, "learning_rate": 8.668360077808093e-05, "loss": 0.2003873109817505, "memory(GiB)": 153.57, "step": 3120, "token_acc": 0.9126896041435442, "train_speed(iter/s)": 0.315634 }, { "epoch": 1.1909298780487805, "grad_norm": 0.2765921652317047, "learning_rate": 8.66428975898985e-05, "loss": 0.17244263887405395, "memory(GiB)": 153.57, "step": 3125, "token_acc": 0.9072138669385674, "train_speed(iter/s)": 0.315668 }, { "epoch": 1.1928353658536586, "grad_norm": 0.8632105588912964, "learning_rate": 8.660214187722646e-05, "loss": 0.14563931226730348, "memory(GiB)": 153.57, "step": 3130, "token_acc": 0.9347915806851012, "train_speed(iter/s)": 0.315771 }, { "epoch": 1.1947408536585367, "grad_norm": 1.2007712125778198, "learning_rate": 8.656133369848468e-05, "loss": 0.2591915845870972, "memory(GiB)": 153.57, "step": 3135, "token_acc": 0.9010921079744488, "train_speed(iter/s)": 0.315797 }, { "epoch": 1.1966463414634148, "grad_norm": 1.028590202331543, "learning_rate": 8.652047311216822e-05, "loss": 0.2455207109451294, "memory(GiB)": 153.57, "step": 3140, "token_acc": 0.9000951474785919, "train_speed(iter/s)": 0.31569 }, { "epoch": 1.1985518292682926, "grad_norm": 0.8011485934257507, "learning_rate": 8.647956017684729e-05, "loss": 0.26211564540863036, "memory(GiB)": 153.57, "step": 3145, "token_acc": 0.8817356778218945, "train_speed(iter/s)": 0.315728 }, { "epoch": 1.2004573170731707, "grad_norm": 1.4865754842758179, "learning_rate": 8.64385949511671e-05, "loss": 0.19993478059768677, "memory(GiB)": 153.57, "step": 3150, "token_acc": 0.9159908504886671, "train_speed(iter/s)": 0.315777 }, { "epoch": 1.2023628048780488, "grad_norm": 0.507438063621521, "learning_rate": 8.639757749384782e-05, "loss": 0.1727904796600342, "memory(GiB)": 153.57, "step": 3155, "token_acc": 0.9315431164901664, "train_speed(iter/s)": 0.315852 }, { "epoch": 1.204268292682927, "grad_norm": 1.2299463748931885, "learning_rate": 8.635650786368452e-05, "loss": 0.26090803146362307, "memory(GiB)": 153.57, "step": 3160, "token_acc": 0.8922752539773816, "train_speed(iter/s)": 0.315857 }, { "epoch": 1.2061737804878048, "grad_norm": 1.2195637226104736, "learning_rate": 8.631538611954704e-05, "loss": 0.19421907663345336, "memory(GiB)": 153.57, "step": 3165, "token_acc": 0.9155879180151025, "train_speed(iter/s)": 0.315795 }, { "epoch": 1.2080792682926829, "grad_norm": 1.1763468980789185, "learning_rate": 8.627421232037989e-05, "loss": 0.24568688869476318, "memory(GiB)": 153.57, "step": 3170, "token_acc": 0.9005462422301752, "train_speed(iter/s)": 0.315737 }, { "epoch": 1.209984756097561, "grad_norm": 1.0562019348144531, "learning_rate": 8.623298652520224e-05, "loss": 0.23319015502929688, "memory(GiB)": 153.57, "step": 3175, "token_acc": 0.9014625498596543, "train_speed(iter/s)": 0.315693 }, { "epoch": 1.211890243902439, "grad_norm": 1.3257118463516235, "learning_rate": 8.619170879310779e-05, "loss": 0.24129800796508788, "memory(GiB)": 153.57, "step": 3180, "token_acc": 0.9125511167033659, "train_speed(iter/s)": 0.315781 }, { "epoch": 1.2137957317073171, "grad_norm": 1.0961191654205322, "learning_rate": 8.615037918326464e-05, "loss": 0.23751115798950195, "memory(GiB)": 153.57, "step": 3185, "token_acc": 0.9100975481149486, "train_speed(iter/s)": 0.31584 }, { "epoch": 1.2157012195121952, "grad_norm": 1.0639315843582153, "learning_rate": 8.61089977549153e-05, "loss": 0.24928569793701172, "memory(GiB)": 153.57, "step": 3190, "token_acc": 0.9023467070401211, "train_speed(iter/s)": 0.315839 }, { "epoch": 1.217606707317073, "grad_norm": 0.9985455870628357, "learning_rate": 8.606756456737656e-05, "loss": 0.2037409782409668, "memory(GiB)": 153.57, "step": 3195, "token_acc": 0.9075732899022801, "train_speed(iter/s)": 0.315851 }, { "epoch": 1.2195121951219512, "grad_norm": 1.4796713590621948, "learning_rate": 8.602607968003935e-05, "loss": 0.20391595363616943, "memory(GiB)": 153.57, "step": 3200, "token_acc": 0.910958904109589, "train_speed(iter/s)": 0.315648 }, { "epoch": 1.2195121951219512, "eval_loss": 0.19502298533916473, "eval_runtime": 33.1675, "eval_samples_per_second": 3.196, "eval_steps_per_second": 3.196, "eval_token_acc": 0.8978124214231833, "step": 3200 }, { "epoch": 1.2214176829268293, "grad_norm": 1.48545241355896, "learning_rate": 8.598454315236877e-05, "loss": 0.17302515506744384, "memory(GiB)": 153.57, "step": 3205, "token_acc": 0.9001602421436838, "train_speed(iter/s)": 0.314692 }, { "epoch": 1.2233231707317074, "grad_norm": 1.3973233699798584, "learning_rate": 8.59429550439039e-05, "loss": 0.2318706512451172, "memory(GiB)": 153.57, "step": 3210, "token_acc": 0.9061258624294376, "train_speed(iter/s)": 0.314735 }, { "epoch": 1.2252286585365852, "grad_norm": 1.16274893283844, "learning_rate": 8.590131541425777e-05, "loss": 0.22101926803588867, "memory(GiB)": 153.57, "step": 3215, "token_acc": 0.8996698459280997, "train_speed(iter/s)": 0.314756 }, { "epoch": 1.2271341463414633, "grad_norm": 1.8753942251205444, "learning_rate": 8.585962432311727e-05, "loss": 0.24948856830596924, "memory(GiB)": 153.57, "step": 3220, "token_acc": 0.8965396714435512, "train_speed(iter/s)": 0.314841 }, { "epoch": 1.2290396341463414, "grad_norm": 0.9715408086776733, "learning_rate": 8.581788183024305e-05, "loss": 0.2379611015319824, "memory(GiB)": 153.57, "step": 3225, "token_acc": 0.900768112933361, "train_speed(iter/s)": 0.314869 }, { "epoch": 1.2309451219512195, "grad_norm": 1.4201011657714844, "learning_rate": 8.577608799546942e-05, "loss": 0.23652877807617187, "memory(GiB)": 153.57, "step": 3230, "token_acc": 0.9009824198552223, "train_speed(iter/s)": 0.314912 }, { "epoch": 1.2328506097560976, "grad_norm": 1.4984245300292969, "learning_rate": 8.573424287870431e-05, "loss": 0.23375365734100342, "memory(GiB)": 153.57, "step": 3235, "token_acc": 0.8812272174969623, "train_speed(iter/s)": 0.315 }, { "epoch": 1.2347560975609757, "grad_norm": 1.2144943475723267, "learning_rate": 8.569234653992916e-05, "loss": 0.21292781829833984, "memory(GiB)": 153.57, "step": 3240, "token_acc": 0.9016678248783878, "train_speed(iter/s)": 0.315094 }, { "epoch": 1.2366615853658536, "grad_norm": 1.4547382593154907, "learning_rate": 8.56503990391988e-05, "loss": 0.23664486408233643, "memory(GiB)": 153.57, "step": 3245, "token_acc": 0.9050480769230769, "train_speed(iter/s)": 0.31513 }, { "epoch": 1.2385670731707317, "grad_norm": 1.3641294240951538, "learning_rate": 8.560840043664144e-05, "loss": 0.204215145111084, "memory(GiB)": 153.57, "step": 3250, "token_acc": 0.903593839132915, "train_speed(iter/s)": 0.315057 }, { "epoch": 1.2404725609756098, "grad_norm": 1.37388014793396, "learning_rate": 8.556635079245853e-05, "loss": 0.22686493396759033, "memory(GiB)": 153.57, "step": 3255, "token_acc": 0.909156279961649, "train_speed(iter/s)": 0.314963 }, { "epoch": 1.2423780487804879, "grad_norm": 1.3845572471618652, "learning_rate": 8.552425016692464e-05, "loss": 0.24329853057861328, "memory(GiB)": 153.57, "step": 3260, "token_acc": 0.8995895168929586, "train_speed(iter/s)": 0.315019 }, { "epoch": 1.244283536585366, "grad_norm": 1.2225029468536377, "learning_rate": 8.548209862038746e-05, "loss": 0.2011625051498413, "memory(GiB)": 153.57, "step": 3265, "token_acc": 0.9123123123123124, "train_speed(iter/s)": 0.315077 }, { "epoch": 1.2461890243902438, "grad_norm": 1.2933759689331055, "learning_rate": 8.543989621326768e-05, "loss": 0.1437530279159546, "memory(GiB)": 153.57, "step": 3270, "token_acc": 0.926033779848573, "train_speed(iter/s)": 0.315051 }, { "epoch": 1.248094512195122, "grad_norm": 1.8079135417938232, "learning_rate": 8.539764300605885e-05, "loss": 0.21502385139465333, "memory(GiB)": 153.57, "step": 3275, "token_acc": 0.9159519725557461, "train_speed(iter/s)": 0.31509 }, { "epoch": 1.25, "grad_norm": 1.4949060678482056, "learning_rate": 8.535533905932738e-05, "loss": 0.1945024013519287, "memory(GiB)": 153.57, "step": 3280, "token_acc": 0.9178947368421052, "train_speed(iter/s)": 0.315099 }, { "epoch": 1.251905487804878, "grad_norm": 1.2355149984359741, "learning_rate": 8.531298443371239e-05, "loss": 0.20527338981628418, "memory(GiB)": 153.57, "step": 3285, "token_acc": 0.9148121560181862, "train_speed(iter/s)": 0.314896 }, { "epoch": 1.2538109756097562, "grad_norm": 1.180343747138977, "learning_rate": 8.527057918992565e-05, "loss": 0.261825704574585, "memory(GiB)": 153.57, "step": 3290, "token_acc": 0.8966709114062216, "train_speed(iter/s)": 0.314801 }, { "epoch": 1.2557164634146343, "grad_norm": 1.1057976484298706, "learning_rate": 8.522812338875148e-05, "loss": 0.3169482946395874, "memory(GiB)": 153.57, "step": 3295, "token_acc": 0.870254110612855, "train_speed(iter/s)": 0.314871 }, { "epoch": 1.2576219512195121, "grad_norm": 0.8280823826789856, "learning_rate": 8.518561709104667e-05, "loss": 0.2041994571685791, "memory(GiB)": 153.57, "step": 3300, "token_acc": 0.8956115294728642, "train_speed(iter/s)": 0.314916 }, { "epoch": 1.2595274390243902, "grad_norm": 1.199804425239563, "learning_rate": 8.51430603577404e-05, "loss": 0.20888195037841797, "memory(GiB)": 153.57, "step": 3305, "token_acc": 0.8770983213429256, "train_speed(iter/s)": 0.314901 }, { "epoch": 1.2614329268292683, "grad_norm": 1.6345326900482178, "learning_rate": 8.510045324983417e-05, "loss": 0.20995895862579345, "memory(GiB)": 153.57, "step": 3310, "token_acc": 0.9083895853423336, "train_speed(iter/s)": 0.314923 }, { "epoch": 1.2633384146341464, "grad_norm": 1.307020902633667, "learning_rate": 8.505779582840161e-05, "loss": 0.2605759143829346, "memory(GiB)": 153.57, "step": 3315, "token_acc": 0.8886123210952085, "train_speed(iter/s)": 0.314949 }, { "epoch": 1.2652439024390243, "grad_norm": 1.3373843431472778, "learning_rate": 8.501508815458855e-05, "loss": 0.21336863040924073, "memory(GiB)": 153.57, "step": 3320, "token_acc": 0.9101326899879373, "train_speed(iter/s)": 0.314743 }, { "epoch": 1.2671493902439024, "grad_norm": 1.6532785892486572, "learning_rate": 8.497233028961282e-05, "loss": 0.2352123498916626, "memory(GiB)": 153.57, "step": 3325, "token_acc": 0.8875514729173266, "train_speed(iter/s)": 0.31477 }, { "epoch": 1.2690548780487805, "grad_norm": 1.3972206115722656, "learning_rate": 8.492952229476421e-05, "loss": 0.21713192462921144, "memory(GiB)": 153.57, "step": 3330, "token_acc": 0.908845871110022, "train_speed(iter/s)": 0.314824 }, { "epoch": 1.2709603658536586, "grad_norm": 1.5711396932601929, "learning_rate": 8.488666423140432e-05, "loss": 0.23616747856140136, "memory(GiB)": 153.57, "step": 3335, "token_acc": 0.8992486996725101, "train_speed(iter/s)": 0.314716 }, { "epoch": 1.2728658536585367, "grad_norm": 0.8584586977958679, "learning_rate": 8.484375616096658e-05, "loss": 0.2000264883041382, "memory(GiB)": 153.57, "step": 3340, "token_acc": 0.9079780498100465, "train_speed(iter/s)": 0.314653 }, { "epoch": 1.2747713414634148, "grad_norm": 0.8704352974891663, "learning_rate": 8.480079814495608e-05, "loss": 0.2548203945159912, "memory(GiB)": 153.57, "step": 3345, "token_acc": 0.8954117411340413, "train_speed(iter/s)": 0.314558 }, { "epoch": 1.2766768292682926, "grad_norm": 1.1931506395339966, "learning_rate": 8.475779024494945e-05, "loss": 0.21864681243896483, "memory(GiB)": 153.57, "step": 3350, "token_acc": 0.9163770190145165, "train_speed(iter/s)": 0.31457 }, { "epoch": 1.2785823170731707, "grad_norm": 1.5422935485839844, "learning_rate": 8.471473252259493e-05, "loss": 0.20450243949890137, "memory(GiB)": 153.57, "step": 3355, "token_acc": 0.9183537263626251, "train_speed(iter/s)": 0.314483 }, { "epoch": 1.2804878048780488, "grad_norm": 1.173396110534668, "learning_rate": 8.467162503961208e-05, "loss": 0.26073811054229734, "memory(GiB)": 153.57, "step": 3360, "token_acc": 0.8902863259298903, "train_speed(iter/s)": 0.314418 }, { "epoch": 1.282393292682927, "grad_norm": 1.732362985610962, "learning_rate": 8.462846785779186e-05, "loss": 0.23577561378479003, "memory(GiB)": 153.57, "step": 3365, "token_acc": 0.9084433270479783, "train_speed(iter/s)": 0.314337 }, { "epoch": 1.2842987804878048, "grad_norm": 1.3555549383163452, "learning_rate": 8.45852610389964e-05, "loss": 0.2383570194244385, "memory(GiB)": 153.57, "step": 3370, "token_acc": 0.902016129032258, "train_speed(iter/s)": 0.314388 }, { "epoch": 1.2862042682926829, "grad_norm": 1.3500431776046753, "learning_rate": 8.454200464515902e-05, "loss": 0.23206133842468263, "memory(GiB)": 153.57, "step": 3375, "token_acc": 0.9034789987271956, "train_speed(iter/s)": 0.314418 }, { "epoch": 1.288109756097561, "grad_norm": 1.4564005136489868, "learning_rate": 8.449869873828411e-05, "loss": 0.19974875450134277, "memory(GiB)": 153.57, "step": 3380, "token_acc": 0.9168081494057725, "train_speed(iter/s)": 0.314437 }, { "epoch": 1.290015243902439, "grad_norm": 1.351737380027771, "learning_rate": 8.4455343380447e-05, "loss": 0.23936476707458496, "memory(GiB)": 153.57, "step": 3385, "token_acc": 0.9062333510921684, "train_speed(iter/s)": 0.314362 }, { "epoch": 1.2919207317073171, "grad_norm": 1.0835955142974854, "learning_rate": 8.441193863379396e-05, "loss": 0.1884536027908325, "memory(GiB)": 153.57, "step": 3390, "token_acc": 0.9122615803814714, "train_speed(iter/s)": 0.314289 }, { "epoch": 1.2938262195121952, "grad_norm": 1.122472882270813, "learning_rate": 8.436848456054196e-05, "loss": 0.18368353843688964, "memory(GiB)": 153.57, "step": 3395, "token_acc": 0.9222674813306178, "train_speed(iter/s)": 0.31425 }, { "epoch": 1.295731707317073, "grad_norm": 1.8413636684417725, "learning_rate": 8.432498122297878e-05, "loss": 0.20047061443328856, "memory(GiB)": 153.57, "step": 3400, "token_acc": 0.9215995790581426, "train_speed(iter/s)": 0.314311 }, { "epoch": 1.2976371951219512, "grad_norm": 1.4209650754928589, "learning_rate": 8.428142868346277e-05, "loss": 0.26879990100860596, "memory(GiB)": 153.57, "step": 3405, "token_acc": 0.9077708006279435, "train_speed(iter/s)": 0.314323 }, { "epoch": 1.2995426829268293, "grad_norm": 1.6193376779556274, "learning_rate": 8.423782700442277e-05, "loss": 0.2336493492126465, "memory(GiB)": 153.57, "step": 3410, "token_acc": 0.905032021957914, "train_speed(iter/s)": 0.314365 }, { "epoch": 1.3014481707317074, "grad_norm": 1.1695619821548462, "learning_rate": 8.419417624835811e-05, "loss": 0.19947500228881837, "memory(GiB)": 153.57, "step": 3415, "token_acc": 0.9057294526816767, "train_speed(iter/s)": 0.314368 }, { "epoch": 1.3033536585365852, "grad_norm": 1.015486478805542, "learning_rate": 8.415047647783847e-05, "loss": 0.18503966331481933, "memory(GiB)": 153.57, "step": 3420, "token_acc": 0.918939393939394, "train_speed(iter/s)": 0.314402 }, { "epoch": 1.3052591463414633, "grad_norm": 1.7809827327728271, "learning_rate": 8.410672775550374e-05, "loss": 0.19084973335266114, "memory(GiB)": 153.57, "step": 3425, "token_acc": 0.9079704190632704, "train_speed(iter/s)": 0.314471 }, { "epoch": 1.3071646341463414, "grad_norm": 1.3721338510513306, "learning_rate": 8.406293014406403e-05, "loss": 0.20611088275909423, "memory(GiB)": 153.57, "step": 3430, "token_acc": 0.911121903836814, "train_speed(iter/s)": 0.314523 }, { "epoch": 1.3090701219512195, "grad_norm": 1.3483657836914062, "learning_rate": 8.40190837062995e-05, "loss": 0.23802428245544432, "memory(GiB)": 153.57, "step": 3435, "token_acc": 0.903688043222927, "train_speed(iter/s)": 0.314447 }, { "epoch": 1.3109756097560976, "grad_norm": 1.0223816633224487, "learning_rate": 8.397518850506028e-05, "loss": 0.18615604639053346, "memory(GiB)": 153.57, "step": 3440, "token_acc": 0.9184875546266388, "train_speed(iter/s)": 0.31447 }, { "epoch": 1.3128810975609757, "grad_norm": 1.4195303916931152, "learning_rate": 8.393124460326647e-05, "loss": 0.25189552307128904, "memory(GiB)": 153.57, "step": 3445, "token_acc": 0.8929471032745592, "train_speed(iter/s)": 0.314396 }, { "epoch": 1.3147865853658536, "grad_norm": 1.0998095273971558, "learning_rate": 8.388725206390788e-05, "loss": 0.28467679023742676, "memory(GiB)": 153.57, "step": 3450, "token_acc": 0.8886992794842624, "train_speed(iter/s)": 0.314305 }, { "epoch": 1.3166920731707317, "grad_norm": 1.4650319814682007, "learning_rate": 8.384321095004413e-05, "loss": 0.21221392154693602, "memory(GiB)": 153.57, "step": 3455, "token_acc": 0.9182577565632458, "train_speed(iter/s)": 0.314325 }, { "epoch": 1.3185975609756098, "grad_norm": 1.0198012590408325, "learning_rate": 8.379912132480441e-05, "loss": 0.20611848831176757, "memory(GiB)": 153.57, "step": 3460, "token_acc": 0.9014232999472852, "train_speed(iter/s)": 0.314356 }, { "epoch": 1.3205030487804879, "grad_norm": 1.0021440982818604, "learning_rate": 8.375498325138745e-05, "loss": 0.2183063507080078, "memory(GiB)": 153.57, "step": 3465, "token_acc": 0.8920574162679425, "train_speed(iter/s)": 0.314375 }, { "epoch": 1.3224085365853657, "grad_norm": 0.801670253276825, "learning_rate": 8.371079679306146e-05, "loss": 0.2080472230911255, "memory(GiB)": 153.57, "step": 3470, "token_acc": 0.9206367384333486, "train_speed(iter/s)": 0.31429 }, { "epoch": 1.3243140243902438, "grad_norm": 1.2726926803588867, "learning_rate": 8.366656201316396e-05, "loss": 0.21202797889709474, "memory(GiB)": 153.57, "step": 3475, "token_acc": 0.9141594782242795, "train_speed(iter/s)": 0.314306 }, { "epoch": 1.326219512195122, "grad_norm": 1.257169246673584, "learning_rate": 8.36222789751018e-05, "loss": 0.2151618242263794, "memory(GiB)": 153.57, "step": 3480, "token_acc": 0.9138984345169913, "train_speed(iter/s)": 0.314319 }, { "epoch": 1.328125, "grad_norm": 2.2129526138305664, "learning_rate": 8.357794774235092e-05, "loss": 0.21835768222808838, "memory(GiB)": 153.57, "step": 3485, "token_acc": 0.904019688269073, "train_speed(iter/s)": 0.314397 }, { "epoch": 1.330030487804878, "grad_norm": 2.2282683849334717, "learning_rate": 8.353356837845642e-05, "loss": 0.2870006084442139, "memory(GiB)": 153.57, "step": 3490, "token_acc": 0.8867091711623345, "train_speed(iter/s)": 0.31442 }, { "epoch": 1.3319359756097562, "grad_norm": 1.5618784427642822, "learning_rate": 8.348914094703232e-05, "loss": 0.23572986125946044, "memory(GiB)": 153.57, "step": 3495, "token_acc": 0.8983628922237381, "train_speed(iter/s)": 0.314466 }, { "epoch": 1.3338414634146343, "grad_norm": 1.076036810874939, "learning_rate": 8.344466551176164e-05, "loss": 0.16910452842712403, "memory(GiB)": 153.57, "step": 3500, "token_acc": 0.9208759124087591, "train_speed(iter/s)": 0.31451 }, { "epoch": 1.3357469512195121, "grad_norm": 1.6522314548492432, "learning_rate": 8.340014213639609e-05, "loss": 0.2389664649963379, "memory(GiB)": 153.57, "step": 3505, "token_acc": 0.8978508888299284, "train_speed(iter/s)": 0.314438 }, { "epoch": 1.3376524390243902, "grad_norm": 1.4212536811828613, "learning_rate": 8.335557088475618e-05, "loss": 0.18906698226928711, "memory(GiB)": 153.57, "step": 3510, "token_acc": 0.9134199134199135, "train_speed(iter/s)": 0.314537 }, { "epoch": 1.3395579268292683, "grad_norm": 0.8857728242874146, "learning_rate": 8.331095182073104e-05, "loss": 0.19940059185028075, "memory(GiB)": 153.57, "step": 3515, "token_acc": 0.9196857267568748, "train_speed(iter/s)": 0.314584 }, { "epoch": 1.3414634146341464, "grad_norm": 1.3112916946411133, "learning_rate": 8.326628500827826e-05, "loss": 0.21512527465820314, "memory(GiB)": 153.57, "step": 3520, "token_acc": 0.9104439312291507, "train_speed(iter/s)": 0.314476 }, { "epoch": 1.3433689024390243, "grad_norm": 1.1566346883773804, "learning_rate": 8.322157051142401e-05, "loss": 0.19938902854919432, "memory(GiB)": 153.57, "step": 3525, "token_acc": 0.9161639276055752, "train_speed(iter/s)": 0.314481 }, { "epoch": 1.3452743902439024, "grad_norm": 1.3223989009857178, "learning_rate": 8.31768083942627e-05, "loss": 0.22848639488220215, "memory(GiB)": 153.57, "step": 3530, "token_acc": 0.9069163730762099, "train_speed(iter/s)": 0.314364 }, { "epoch": 1.3471798780487805, "grad_norm": 1.6453980207443237, "learning_rate": 8.313199872095701e-05, "loss": 0.29529974460601804, "memory(GiB)": 153.57, "step": 3535, "token_acc": 0.8799426934097421, "train_speed(iter/s)": 0.314429 }, { "epoch": 1.3490853658536586, "grad_norm": 1.2242372035980225, "learning_rate": 8.308714155573785e-05, "loss": 0.2539889097213745, "memory(GiB)": 153.57, "step": 3540, "token_acc": 0.8984841525034452, "train_speed(iter/s)": 0.314462 }, { "epoch": 1.3509908536585367, "grad_norm": 1.0400673151016235, "learning_rate": 8.304223696290413e-05, "loss": 0.23513784408569335, "memory(GiB)": 153.57, "step": 3545, "token_acc": 0.9085736707867373, "train_speed(iter/s)": 0.314381 }, { "epoch": 1.3528963414634148, "grad_norm": 1.9549869298934937, "learning_rate": 8.29972850068228e-05, "loss": 0.19363155364990234, "memory(GiB)": 153.57, "step": 3550, "token_acc": 0.9079439252336449, "train_speed(iter/s)": 0.314431 }, { "epoch": 1.3548018292682926, "grad_norm": 1.629018783569336, "learning_rate": 8.295228575192869e-05, "loss": 0.2302783250808716, "memory(GiB)": 153.57, "step": 3555, "token_acc": 0.9022900763358779, "train_speed(iter/s)": 0.314485 }, { "epoch": 1.3567073170731707, "grad_norm": 1.193678855895996, "learning_rate": 8.290723926272439e-05, "loss": 0.2657754898071289, "memory(GiB)": 153.57, "step": 3560, "token_acc": 0.8904615384615384, "train_speed(iter/s)": 0.314506 }, { "epoch": 1.3586128048780488, "grad_norm": 1.9957401752471924, "learning_rate": 8.286214560378025e-05, "loss": 0.23792438507080077, "memory(GiB)": 153.57, "step": 3565, "token_acc": 0.8914782608695652, "train_speed(iter/s)": 0.314328 }, { "epoch": 1.360518292682927, "grad_norm": 1.5510872602462769, "learning_rate": 8.281700483973421e-05, "loss": 0.26185901165008546, "memory(GiB)": 153.57, "step": 3570, "token_acc": 0.8942017037647706, "train_speed(iter/s)": 0.31438 }, { "epoch": 1.3624237804878048, "grad_norm": 3.146616220474243, "learning_rate": 8.277181703529173e-05, "loss": 0.21653194427490235, "memory(GiB)": 153.57, "step": 3575, "token_acc": 0.9209419680403701, "train_speed(iter/s)": 0.314444 }, { "epoch": 1.3643292682926829, "grad_norm": 1.1549046039581299, "learning_rate": 8.272658225522569e-05, "loss": 0.18078871965408325, "memory(GiB)": 153.57, "step": 3580, "token_acc": 0.9181977252843394, "train_speed(iter/s)": 0.314466 }, { "epoch": 1.366234756097561, "grad_norm": 0.9145674109458923, "learning_rate": 8.268130056437632e-05, "loss": 0.21413426399230956, "memory(GiB)": 153.57, "step": 3585, "token_acc": 0.9184981684981685, "train_speed(iter/s)": 0.314533 }, { "epoch": 1.368140243902439, "grad_norm": 1.2122259140014648, "learning_rate": 8.263597202765109e-05, "loss": 0.23099210262298583, "memory(GiB)": 153.57, "step": 3590, "token_acc": 0.9024106400665004, "train_speed(iter/s)": 0.314465 }, { "epoch": 1.3700457317073171, "grad_norm": 0.9200612306594849, "learning_rate": 8.259059671002462e-05, "loss": 0.1929612636566162, "memory(GiB)": 153.57, "step": 3595, "token_acc": 0.9207760711398545, "train_speed(iter/s)": 0.31445 }, { "epoch": 1.3719512195121952, "grad_norm": 1.3900879621505737, "learning_rate": 8.254517467653858e-05, "loss": 0.1878268003463745, "memory(GiB)": 153.57, "step": 3600, "token_acc": 0.9074259909683894, "train_speed(iter/s)": 0.314403 }, { "epoch": 1.373856707317073, "grad_norm": 1.4762675762176514, "learning_rate": 8.249970599230159e-05, "loss": 0.2715415954589844, "memory(GiB)": 153.57, "step": 3605, "token_acc": 0.8879824286162535, "train_speed(iter/s)": 0.314455 }, { "epoch": 1.3757621951219512, "grad_norm": 2.2575302124023438, "learning_rate": 8.245419072248919e-05, "loss": 0.20396075248718262, "memory(GiB)": 153.57, "step": 3610, "token_acc": 0.9090060662622492, "train_speed(iter/s)": 0.314555 }, { "epoch": 1.3776676829268293, "grad_norm": 1.0742396116256714, "learning_rate": 8.240862893234365e-05, "loss": 0.22013986110687256, "memory(GiB)": 153.57, "step": 3615, "token_acc": 0.9075468056359776, "train_speed(iter/s)": 0.314576 }, { "epoch": 1.3795731707317074, "grad_norm": 1.6102453470230103, "learning_rate": 8.236302068717392e-05, "loss": 0.2233124256134033, "memory(GiB)": 153.57, "step": 3620, "token_acc": 0.9139888397152203, "train_speed(iter/s)": 0.314607 }, { "epoch": 1.3814786585365852, "grad_norm": 1.5646675825119019, "learning_rate": 8.231736605235559e-05, "loss": 0.23088431358337402, "memory(GiB)": 153.57, "step": 3625, "token_acc": 0.8994755244755245, "train_speed(iter/s)": 0.314639 }, { "epoch": 1.3833841463414633, "grad_norm": 1.1702961921691895, "learning_rate": 8.227166509333068e-05, "loss": 0.247345495223999, "memory(GiB)": 153.57, "step": 3630, "token_acc": 0.9009310097875388, "train_speed(iter/s)": 0.314674 }, { "epoch": 1.3852896341463414, "grad_norm": 1.3644391298294067, "learning_rate": 8.222591787560767e-05, "loss": 0.220162034034729, "memory(GiB)": 153.57, "step": 3635, "token_acc": 0.9099015990159902, "train_speed(iter/s)": 0.314722 }, { "epoch": 1.3871951219512195, "grad_norm": 1.569035291671753, "learning_rate": 8.218012446476128e-05, "loss": 0.25192253589630126, "memory(GiB)": 153.57, "step": 3640, "token_acc": 0.9069188651070185, "train_speed(iter/s)": 0.314729 }, { "epoch": 1.3891006097560976, "grad_norm": 1.6114429235458374, "learning_rate": 8.213428492643253e-05, "loss": 0.23442463874816893, "memory(GiB)": 153.57, "step": 3645, "token_acc": 0.9030193961207759, "train_speed(iter/s)": 0.314767 }, { "epoch": 1.3910060975609757, "grad_norm": 1.601576566696167, "learning_rate": 8.208839932632849e-05, "loss": 0.25686798095703123, "memory(GiB)": 153.57, "step": 3650, "token_acc": 0.9039190897597977, "train_speed(iter/s)": 0.314783 }, { "epoch": 1.3929115853658536, "grad_norm": 1.2033571004867554, "learning_rate": 8.20424677302223e-05, "loss": 0.20301945209503175, "memory(GiB)": 153.57, "step": 3655, "token_acc": 0.910941475826972, "train_speed(iter/s)": 0.314746 }, { "epoch": 1.3948170731707317, "grad_norm": 1.2314972877502441, "learning_rate": 8.199649020395298e-05, "loss": 0.20713632106781005, "memory(GiB)": 153.57, "step": 3660, "token_acc": 0.9183261183261183, "train_speed(iter/s)": 0.314789 }, { "epoch": 1.3967225609756098, "grad_norm": 1.6193104982376099, "learning_rate": 8.195046681342545e-05, "loss": 0.19522944688796998, "memory(GiB)": 153.57, "step": 3665, "token_acc": 0.9222021194107004, "train_speed(iter/s)": 0.314844 }, { "epoch": 1.3986280487804879, "grad_norm": 1.2562590837478638, "learning_rate": 8.190439762461033e-05, "loss": 0.24899492263793946, "memory(GiB)": 153.57, "step": 3670, "token_acc": 0.8965574588391818, "train_speed(iter/s)": 0.314857 }, { "epoch": 1.4005335365853657, "grad_norm": 0.8147920966148376, "learning_rate": 8.185828270354391e-05, "loss": 0.16950958967208862, "memory(GiB)": 153.57, "step": 3675, "token_acc": 0.9167797376752601, "train_speed(iter/s)": 0.314894 }, { "epoch": 1.4024390243902438, "grad_norm": 1.373261570930481, "learning_rate": 8.181212211632799e-05, "loss": 0.18469791412353515, "memory(GiB)": 153.57, "step": 3680, "token_acc": 0.9056795131845842, "train_speed(iter/s)": 0.314948 }, { "epoch": 1.404344512195122, "grad_norm": 1.3926947116851807, "learning_rate": 8.17659159291299e-05, "loss": 0.21085448265075685, "memory(GiB)": 153.57, "step": 3685, "token_acc": 0.9099699899966656, "train_speed(iter/s)": 0.314918 }, { "epoch": 1.40625, "grad_norm": 1.2614420652389526, "learning_rate": 8.171966420818228e-05, "loss": 0.2319277286529541, "memory(GiB)": 153.57, "step": 3690, "token_acc": 0.8897998553170967, "train_speed(iter/s)": 0.314978 }, { "epoch": 1.408155487804878, "grad_norm": 12.453064918518066, "learning_rate": 8.167336701978306e-05, "loss": 0.19607105255126953, "memory(GiB)": 153.57, "step": 3695, "token_acc": 0.9165687426556992, "train_speed(iter/s)": 0.315031 }, { "epoch": 1.4100609756097562, "grad_norm": 1.0853991508483887, "learning_rate": 8.162702443029531e-05, "loss": 0.21045379638671874, "memory(GiB)": 153.57, "step": 3700, "token_acc": 0.9069672963640056, "train_speed(iter/s)": 0.315034 }, { "epoch": 1.4119664634146343, "grad_norm": 1.3436912298202515, "learning_rate": 8.158063650614723e-05, "loss": 0.1848902940750122, "memory(GiB)": 153.57, "step": 3705, "token_acc": 0.9220489977728286, "train_speed(iter/s)": 0.31512 }, { "epoch": 1.4138719512195121, "grad_norm": 0.9452284574508667, "learning_rate": 8.153420331383199e-05, "loss": 0.22264461517333983, "memory(GiB)": 153.57, "step": 3710, "token_acc": 0.8878374617311439, "train_speed(iter/s)": 0.315062 }, { "epoch": 1.4157774390243902, "grad_norm": 1.0567686557769775, "learning_rate": 8.148772491990762e-05, "loss": 0.19656684398651122, "memory(GiB)": 153.57, "step": 3715, "token_acc": 0.899580492920818, "train_speed(iter/s)": 0.315108 }, { "epoch": 1.4176829268292683, "grad_norm": 0.6348177194595337, "learning_rate": 8.144120139099697e-05, "loss": 0.17831757068634033, "memory(GiB)": 153.57, "step": 3720, "token_acc": 0.8967573611628774, "train_speed(iter/s)": 0.315189 }, { "epoch": 1.4195884146341464, "grad_norm": 1.33773672580719, "learning_rate": 8.139463279378756e-05, "loss": 0.19186902046203613, "memory(GiB)": 153.57, "step": 3725, "token_acc": 0.9200674536256324, "train_speed(iter/s)": 0.315258 }, { "epoch": 1.4214939024390243, "grad_norm": 2.035773515701294, "learning_rate": 8.134801919503154e-05, "loss": 0.20443353652954102, "memory(GiB)": 153.57, "step": 3730, "token_acc": 0.9134279251490849, "train_speed(iter/s)": 0.315281 }, { "epoch": 1.4233993902439024, "grad_norm": 0.9861239194869995, "learning_rate": 8.130136066154556e-05, "loss": 0.17253400087356568, "memory(GiB)": 153.57, "step": 3735, "token_acc": 0.9290697674418604, "train_speed(iter/s)": 0.315245 }, { "epoch": 1.4253048780487805, "grad_norm": 1.112980604171753, "learning_rate": 8.125465726021069e-05, "loss": 0.23598296642303468, "memory(GiB)": 153.57, "step": 3740, "token_acc": 0.8981713185755534, "train_speed(iter/s)": 0.315241 }, { "epoch": 1.4272103658536586, "grad_norm": 1.3028579950332642, "learning_rate": 8.120790905797226e-05, "loss": 0.1924046754837036, "memory(GiB)": 153.57, "step": 3745, "token_acc": 0.9217215482380127, "train_speed(iter/s)": 0.315267 }, { "epoch": 1.4291158536585367, "grad_norm": 1.0251699686050415, "learning_rate": 8.116111612183989e-05, "loss": 0.1472646713256836, "memory(GiB)": 153.57, "step": 3750, "token_acc": 0.9256134969325154, "train_speed(iter/s)": 0.315339 }, { "epoch": 1.4310213414634148, "grad_norm": 1.3653753995895386, "learning_rate": 8.111427851888728e-05, "loss": 0.1933227300643921, "memory(GiB)": 153.57, "step": 3755, "token_acc": 0.9131018285522563, "train_speed(iter/s)": 0.315343 }, { "epoch": 1.4329268292682926, "grad_norm": 0.8043114542961121, "learning_rate": 8.106739631625217e-05, "loss": 0.20062410831451416, "memory(GiB)": 153.57, "step": 3760, "token_acc": 0.9164721141374838, "train_speed(iter/s)": 0.315263 }, { "epoch": 1.4348323170731707, "grad_norm": 1.2397761344909668, "learning_rate": 8.10204695811362e-05, "loss": 0.19017746448516845, "memory(GiB)": 153.57, "step": 3765, "token_acc": 0.9233608336951802, "train_speed(iter/s)": 0.315196 }, { "epoch": 1.4367378048780488, "grad_norm": 1.7866933345794678, "learning_rate": 8.09734983808049e-05, "loss": 0.25176544189453126, "memory(GiB)": 153.57, "step": 3770, "token_acc": 0.9080928481806776, "train_speed(iter/s)": 0.315254 }, { "epoch": 1.438643292682927, "grad_norm": 1.2205421924591064, "learning_rate": 8.092648278258751e-05, "loss": 0.23471908569335936, "memory(GiB)": 153.57, "step": 3775, "token_acc": 0.8907133243606998, "train_speed(iter/s)": 0.315198 }, { "epoch": 1.4405487804878048, "grad_norm": 1.1692094802856445, "learning_rate": 8.087942285387688e-05, "loss": 0.23550260066986084, "memory(GiB)": 153.57, "step": 3780, "token_acc": 0.8935338345864662, "train_speed(iter/s)": 0.315252 }, { "epoch": 1.4424542682926829, "grad_norm": 1.230905532836914, "learning_rate": 8.083231866212945e-05, "loss": 0.2438673496246338, "memory(GiB)": 153.57, "step": 3785, "token_acc": 0.8997369190295236, "train_speed(iter/s)": 0.315239 }, { "epoch": 1.444359756097561, "grad_norm": 0.04481758177280426, "learning_rate": 8.07851702748651e-05, "loss": 0.2020124912261963, "memory(GiB)": 153.57, "step": 3790, "token_acc": 0.9122641509433962, "train_speed(iter/s)": 0.315336 }, { "epoch": 1.446265243902439, "grad_norm": 1.637022614479065, "learning_rate": 8.073797775966701e-05, "loss": 0.23536784648895265, "memory(GiB)": 153.57, "step": 3795, "token_acc": 0.9069827033952594, "train_speed(iter/s)": 0.315194 }, { "epoch": 1.4481707317073171, "grad_norm": 0.9516097903251648, "learning_rate": 8.06907411841817e-05, "loss": 0.16901991367340088, "memory(GiB)": 153.57, "step": 3800, "token_acc": 0.9242474363215349, "train_speed(iter/s)": 0.315255 }, { "epoch": 1.4500762195121952, "grad_norm": 1.2765940427780151, "learning_rate": 8.064346061611875e-05, "loss": 0.19371250867843628, "memory(GiB)": 153.57, "step": 3805, "token_acc": 0.9274939172749391, "train_speed(iter/s)": 0.315279 }, { "epoch": 1.451981707317073, "grad_norm": 1.061185359954834, "learning_rate": 8.05961361232509e-05, "loss": 0.25471177101135256, "memory(GiB)": 153.57, "step": 3810, "token_acc": 0.8969342166076105, "train_speed(iter/s)": 0.315307 }, { "epoch": 1.4538871951219512, "grad_norm": 1.6234833002090454, "learning_rate": 8.054876777341376e-05, "loss": 0.25296869277954104, "memory(GiB)": 153.57, "step": 3815, "token_acc": 0.9055092889173607, "train_speed(iter/s)": 0.315347 }, { "epoch": 1.4557926829268293, "grad_norm": 1.3003789186477661, "learning_rate": 8.050135563450587e-05, "loss": 0.2803217887878418, "memory(GiB)": 153.57, "step": 3820, "token_acc": 0.8851332783731795, "train_speed(iter/s)": 0.315358 }, { "epoch": 1.4576981707317074, "grad_norm": 1.497312068939209, "learning_rate": 8.04538997744885e-05, "loss": 0.2067281723022461, "memory(GiB)": 153.57, "step": 3825, "token_acc": 0.9054557777024738, "train_speed(iter/s)": 0.31542 }, { "epoch": 1.4596036585365852, "grad_norm": 1.4771572351455688, "learning_rate": 8.040640026138562e-05, "loss": 0.23271255493164061, "memory(GiB)": 153.57, "step": 3830, "token_acc": 0.8901722787783869, "train_speed(iter/s)": 0.315455 }, { "epoch": 1.4615091463414633, "grad_norm": 1.6084868907928467, "learning_rate": 8.035885716328376e-05, "loss": 0.214349365234375, "memory(GiB)": 153.57, "step": 3835, "token_acc": 0.9121359223300971, "train_speed(iter/s)": 0.315491 }, { "epoch": 1.4634146341463414, "grad_norm": 1.162778615951538, "learning_rate": 8.03112705483319e-05, "loss": 0.21698243618011476, "memory(GiB)": 153.57, "step": 3840, "token_acc": 0.9117591721542804, "train_speed(iter/s)": 0.315424 }, { "epoch": 1.4653201219512195, "grad_norm": 2.1669983863830566, "learning_rate": 8.026364048474144e-05, "loss": 0.21794767379760743, "memory(GiB)": 153.57, "step": 3845, "token_acc": 0.8990352254879964, "train_speed(iter/s)": 0.315361 }, { "epoch": 1.4672256097560976, "grad_norm": 1.1001149415969849, "learning_rate": 8.021596704078605e-05, "loss": 0.21925132274627684, "memory(GiB)": 153.57, "step": 3850, "token_acc": 0.8984874927283304, "train_speed(iter/s)": 0.315396 }, { "epoch": 1.4691310975609757, "grad_norm": 0.9201654195785522, "learning_rate": 8.016825028480155e-05, "loss": 0.2025221109390259, "memory(GiB)": 153.57, "step": 3855, "token_acc": 0.9150862068965517, "train_speed(iter/s)": 0.315192 }, { "epoch": 1.4710365853658536, "grad_norm": 1.3238810300827026, "learning_rate": 8.012049028518589e-05, "loss": 0.1966274380683899, "memory(GiB)": 153.57, "step": 3860, "token_acc": 0.9194520547945205, "train_speed(iter/s)": 0.315223 }, { "epoch": 1.4729420731707317, "grad_norm": 0.7635603547096252, "learning_rate": 8.007268711039898e-05, "loss": 0.17722615003585815, "memory(GiB)": 153.57, "step": 3865, "token_acc": 0.9161307609860665, "train_speed(iter/s)": 0.315262 }, { "epoch": 1.4748475609756098, "grad_norm": 2.2394230365753174, "learning_rate": 8.002484082896257e-05, "loss": 0.22998418807983398, "memory(GiB)": 153.57, "step": 3870, "token_acc": 0.9089987839481152, "train_speed(iter/s)": 0.315282 }, { "epoch": 1.4767530487804879, "grad_norm": 0.6663928627967834, "learning_rate": 7.997695150946034e-05, "loss": 0.1498611330986023, "memory(GiB)": 153.57, "step": 3875, "token_acc": 0.9353922612708555, "train_speed(iter/s)": 0.315317 }, { "epoch": 1.4786585365853657, "grad_norm": 0.7648011445999146, "learning_rate": 7.992901922053752e-05, "loss": 0.282049822807312, "memory(GiB)": 153.57, "step": 3880, "token_acc": 0.8930367210117325, "train_speed(iter/s)": 0.315309 }, { "epoch": 1.4805640243902438, "grad_norm": 1.446054458618164, "learning_rate": 7.988104403090097e-05, "loss": 0.21687166690826415, "memory(GiB)": 153.57, "step": 3885, "token_acc": 0.9076695541939573, "train_speed(iter/s)": 0.315362 }, { "epoch": 1.482469512195122, "grad_norm": 1.4518179893493652, "learning_rate": 7.983302600931911e-05, "loss": 0.2697602272033691, "memory(GiB)": 153.57, "step": 3890, "token_acc": 0.8879781420765027, "train_speed(iter/s)": 0.315284 }, { "epoch": 1.484375, "grad_norm": 1.2767537832260132, "learning_rate": 7.978496522462167e-05, "loss": 0.20154814720153807, "memory(GiB)": 153.57, "step": 3895, "token_acc": 0.9095160158989947, "train_speed(iter/s)": 0.315108 }, { "epoch": 1.486280487804878, "grad_norm": 1.1474145650863647, "learning_rate": 7.973686174569972e-05, "loss": 0.23195149898529052, "memory(GiB)": 153.57, "step": 3900, "token_acc": 0.8928153065208903, "train_speed(iter/s)": 0.314987 }, { "epoch": 1.4881859756097562, "grad_norm": 0.8500837683677673, "learning_rate": 7.968871564150554e-05, "loss": 0.22649178504943848, "memory(GiB)": 153.57, "step": 3905, "token_acc": 0.902015402261183, "train_speed(iter/s)": 0.315019 }, { "epoch": 1.4900914634146343, "grad_norm": 1.2730180025100708, "learning_rate": 7.964052698105247e-05, "loss": 0.27593746185302737, "memory(GiB)": 153.57, "step": 3910, "token_acc": 0.8966429298067141, "train_speed(iter/s)": 0.315037 }, { "epoch": 1.4919969512195121, "grad_norm": 1.3554725646972656, "learning_rate": 7.959229583341487e-05, "loss": 0.23875155448913574, "memory(GiB)": 153.57, "step": 3915, "token_acc": 0.9014492753623189, "train_speed(iter/s)": 0.314964 }, { "epoch": 1.4939024390243902, "grad_norm": 1.6184662580490112, "learning_rate": 7.954402226772804e-05, "loss": 0.22082107067108153, "memory(GiB)": 153.57, "step": 3920, "token_acc": 0.9113891726251276, "train_speed(iter/s)": 0.314871 }, { "epoch": 1.4958079268292683, "grad_norm": 1.2876579761505127, "learning_rate": 7.9495706353188e-05, "loss": 0.22554280757904052, "memory(GiB)": 153.57, "step": 3925, "token_acc": 0.9042821158690176, "train_speed(iter/s)": 0.3149 }, { "epoch": 1.4977134146341464, "grad_norm": 1.5456023216247559, "learning_rate": 7.944734815905154e-05, "loss": 0.22120862007141112, "memory(GiB)": 153.57, "step": 3930, "token_acc": 0.911064055055585, "train_speed(iter/s)": 0.314943 }, { "epoch": 1.4996189024390243, "grad_norm": 1.2982467412948608, "learning_rate": 7.939894775463606e-05, "loss": 0.20633516311645508, "memory(GiB)": 153.57, "step": 3935, "token_acc": 0.9122906613681522, "train_speed(iter/s)": 0.314976 }, { "epoch": 1.5015243902439024, "grad_norm": 1.1865313053131104, "learning_rate": 7.93505052093194e-05, "loss": 0.22224657535552977, "memory(GiB)": 153.57, "step": 3940, "token_acc": 0.9147424511545293, "train_speed(iter/s)": 0.31475 }, { "epoch": 1.5034298780487805, "grad_norm": 1.5844517946243286, "learning_rate": 7.930202059253986e-05, "loss": 0.16369341611862182, "memory(GiB)": 153.57, "step": 3945, "token_acc": 0.9204400102327961, "train_speed(iter/s)": 0.314682 }, { "epoch": 1.5053353658536586, "grad_norm": 1.255823016166687, "learning_rate": 7.925349397379604e-05, "loss": 0.25106956958770754, "memory(GiB)": 153.57, "step": 3950, "token_acc": 0.8975010588733587, "train_speed(iter/s)": 0.31448 }, { "epoch": 1.5072408536585367, "grad_norm": 1.1193134784698486, "learning_rate": 7.920492542264673e-05, "loss": 0.2519754648208618, "memory(GiB)": 153.57, "step": 3955, "token_acc": 0.884071630537229, "train_speed(iter/s)": 0.314545 }, { "epoch": 1.5091463414634148, "grad_norm": 1.1666730642318726, "learning_rate": 7.915631500871083e-05, "loss": 0.19652124643325805, "memory(GiB)": 153.57, "step": 3960, "token_acc": 0.9229331117573744, "train_speed(iter/s)": 0.31458 }, { "epoch": 1.5110518292682928, "grad_norm": 1.1776777505874634, "learning_rate": 7.910766280166726e-05, "loss": 0.22044944763183594, "memory(GiB)": 153.57, "step": 3965, "token_acc": 0.902265100671141, "train_speed(iter/s)": 0.314572 }, { "epoch": 1.5129573170731707, "grad_norm": 0.9766071438789368, "learning_rate": 7.905896887125482e-05, "loss": 0.2082287311553955, "memory(GiB)": 153.57, "step": 3970, "token_acc": 0.9189776144336785, "train_speed(iter/s)": 0.314556 }, { "epoch": 1.5148628048780488, "grad_norm": 1.513796091079712, "learning_rate": 7.901023328727217e-05, "loss": 0.2786529064178467, "memory(GiB)": 153.57, "step": 3975, "token_acc": 0.8804545454545455, "train_speed(iter/s)": 0.314391 }, { "epoch": 1.5167682926829267, "grad_norm": 1.1218892335891724, "learning_rate": 7.896145611957759e-05, "loss": 0.24514567852020264, "memory(GiB)": 153.57, "step": 3980, "token_acc": 0.9019138755980861, "train_speed(iter/s)": 0.314426 }, { "epoch": 1.5186737804878048, "grad_norm": 0.7444071769714355, "learning_rate": 7.891263743808905e-05, "loss": 0.19276204109191894, "memory(GiB)": 153.57, "step": 3985, "token_acc": 0.9225652396384104, "train_speed(iter/s)": 0.314417 }, { "epoch": 1.5205792682926829, "grad_norm": 1.0127686262130737, "learning_rate": 7.8863777312784e-05, "loss": 0.17704445123672485, "memory(GiB)": 153.57, "step": 3990, "token_acc": 0.9191996921893035, "train_speed(iter/s)": 0.314481 }, { "epoch": 1.522484756097561, "grad_norm": 1.283420205116272, "learning_rate": 7.881487581369927e-05, "loss": 0.23565316200256348, "memory(GiB)": 153.57, "step": 3995, "token_acc": 0.9063291139240506, "train_speed(iter/s)": 0.314512 }, { "epoch": 1.524390243902439, "grad_norm": 1.428924560546875, "learning_rate": 7.876593301093104e-05, "loss": 0.1906520128250122, "memory(GiB)": 153.57, "step": 4000, "token_acc": 0.9153726708074534, "train_speed(iter/s)": 0.314543 }, { "epoch": 1.524390243902439, "eval_loss": 0.1922401338815689, "eval_runtime": 33.5142, "eval_samples_per_second": 3.163, "eval_steps_per_second": 3.163, "eval_token_acc": 0.8984158913754086, "step": 4000 }, { "epoch": 1.5262957317073171, "grad_norm": 1.0523440837860107, "learning_rate": 7.871694897463464e-05, "loss": 0.21278066635131837, "memory(GiB)": 153.57, "step": 4005, "token_acc": 0.9024209180517178, "train_speed(iter/s)": 0.313632 }, { "epoch": 1.5282012195121952, "grad_norm": 1.5046910047531128, "learning_rate": 7.866792377502457e-05, "loss": 0.18246288299560548, "memory(GiB)": 153.57, "step": 4010, "token_acc": 0.9159859976662778, "train_speed(iter/s)": 0.31366 }, { "epoch": 1.5301067073170733, "grad_norm": 1.0792149305343628, "learning_rate": 7.861885748237428e-05, "loss": 0.19503821134567262, "memory(GiB)": 153.57, "step": 4015, "token_acc": 0.914327917282127, "train_speed(iter/s)": 0.313599 }, { "epoch": 1.5320121951219512, "grad_norm": 1.3073827028274536, "learning_rate": 7.856975016701615e-05, "loss": 0.14792168140411377, "memory(GiB)": 153.57, "step": 4020, "token_acc": 0.9294165646674827, "train_speed(iter/s)": 0.313649 }, { "epoch": 1.5339176829268293, "grad_norm": 1.8213969469070435, "learning_rate": 7.852060189934135e-05, "loss": 0.2139897346496582, "memory(GiB)": 153.57, "step": 4025, "token_acc": 0.9147476727094561, "train_speed(iter/s)": 0.313596 }, { "epoch": 1.5358231707317072, "grad_norm": 1.1758160591125488, "learning_rate": 7.847141274979977e-05, "loss": 0.25045289993286135, "memory(GiB)": 153.57, "step": 4030, "token_acc": 0.9005958829902492, "train_speed(iter/s)": 0.31353 }, { "epoch": 1.5377286585365852, "grad_norm": 1.2962257862091064, "learning_rate": 7.842218278889988e-05, "loss": 0.202514123916626, "memory(GiB)": 153.57, "step": 4035, "token_acc": 0.914004914004914, "train_speed(iter/s)": 0.313542 }, { "epoch": 1.5396341463414633, "grad_norm": 2.488415479660034, "learning_rate": 7.837291208720866e-05, "loss": 0.22815463542938233, "memory(GiB)": 153.57, "step": 4040, "token_acc": 0.9067545304777594, "train_speed(iter/s)": 0.313504 }, { "epoch": 1.5415396341463414, "grad_norm": 0.8882768750190735, "learning_rate": 7.83236007153515e-05, "loss": 0.215626859664917, "memory(GiB)": 153.57, "step": 4045, "token_acc": 0.9056904098686829, "train_speed(iter/s)": 0.313548 }, { "epoch": 1.5434451219512195, "grad_norm": 0.9546802639961243, "learning_rate": 7.827424874401203e-05, "loss": 0.2520237684249878, "memory(GiB)": 153.57, "step": 4050, "token_acc": 0.9104508196721312, "train_speed(iter/s)": 0.313552 }, { "epoch": 1.5453506097560976, "grad_norm": 1.437752604484558, "learning_rate": 7.822485624393219e-05, "loss": 0.24344449043273925, "memory(GiB)": 153.57, "step": 4055, "token_acc": 0.8932905067808709, "train_speed(iter/s)": 0.313576 }, { "epoch": 1.5472560975609757, "grad_norm": 1.3167967796325684, "learning_rate": 7.81754232859119e-05, "loss": 0.2124643325805664, "memory(GiB)": 153.57, "step": 4060, "token_acc": 0.9174958540630183, "train_speed(iter/s)": 0.313594 }, { "epoch": 1.5491615853658538, "grad_norm": 1.6146668195724487, "learning_rate": 7.812594994080912e-05, "loss": 0.17162930965423584, "memory(GiB)": 153.57, "step": 4065, "token_acc": 0.9251644736842105, "train_speed(iter/s)": 0.313574 }, { "epoch": 1.5510670731707317, "grad_norm": 1.1738935708999634, "learning_rate": 7.807643627953969e-05, "loss": 0.1966196060180664, "memory(GiB)": 153.57, "step": 4070, "token_acc": 0.913339183852567, "train_speed(iter/s)": 0.313598 }, { "epoch": 1.5529725609756098, "grad_norm": 1.302384614944458, "learning_rate": 7.802688237307729e-05, "loss": 0.22035698890686034, "memory(GiB)": 153.57, "step": 4075, "token_acc": 0.9122277617423249, "train_speed(iter/s)": 0.313588 }, { "epoch": 1.5548780487804879, "grad_norm": 0.9011093378067017, "learning_rate": 7.797728829245321e-05, "loss": 0.2607485055923462, "memory(GiB)": 153.57, "step": 4080, "token_acc": 0.8999575191163977, "train_speed(iter/s)": 0.313507 }, { "epoch": 1.5567835365853657, "grad_norm": 0.9154324531555176, "learning_rate": 7.792765410875638e-05, "loss": 0.2154399871826172, "memory(GiB)": 153.57, "step": 4085, "token_acc": 0.8971410814170292, "train_speed(iter/s)": 0.313503 }, { "epoch": 1.5586890243902438, "grad_norm": 1.0424160957336426, "learning_rate": 7.787797989313317e-05, "loss": 0.22240853309631348, "memory(GiB)": 153.57, "step": 4090, "token_acc": 0.9053304904051173, "train_speed(iter/s)": 0.313491 }, { "epoch": 1.560594512195122, "grad_norm": 1.2229665517807007, "learning_rate": 7.782826571678741e-05, "loss": 0.21058838367462157, "memory(GiB)": 153.57, "step": 4095, "token_acc": 0.9010704419889503, "train_speed(iter/s)": 0.313518 }, { "epoch": 1.5625, "grad_norm": 1.5088765621185303, "learning_rate": 7.777851165098012e-05, "loss": 0.18523035049438477, "memory(GiB)": 153.57, "step": 4100, "token_acc": 0.9150525733259546, "train_speed(iter/s)": 0.313549 }, { "epoch": 1.564405487804878, "grad_norm": 1.669822096824646, "learning_rate": 7.772871776702954e-05, "loss": 0.1997946619987488, "memory(GiB)": 153.57, "step": 4105, "token_acc": 0.9102760736196319, "train_speed(iter/s)": 0.313518 }, { "epoch": 1.5663109756097562, "grad_norm": 1.8374254703521729, "learning_rate": 7.767888413631101e-05, "loss": 0.16855828762054442, "memory(GiB)": 153.57, "step": 4110, "token_acc": 0.9396120150187734, "train_speed(iter/s)": 0.313583 }, { "epoch": 1.5682164634146343, "grad_norm": 1.284151554107666, "learning_rate": 7.76290108302568e-05, "loss": 0.17358088493347168, "memory(GiB)": 153.57, "step": 4115, "token_acc": 0.9229422066549913, "train_speed(iter/s)": 0.313629 }, { "epoch": 1.5701219512195121, "grad_norm": 1.4255653619766235, "learning_rate": 7.757909792035608e-05, "loss": 0.22660915851593016, "memory(GiB)": 153.57, "step": 4120, "token_acc": 0.9068600111544897, "train_speed(iter/s)": 0.313663 }, { "epoch": 1.5720274390243902, "grad_norm": 1.5603740215301514, "learning_rate": 7.752914547815477e-05, "loss": 0.17913821935653687, "memory(GiB)": 153.57, "step": 4125, "token_acc": 0.9265548281505729, "train_speed(iter/s)": 0.313669 }, { "epoch": 1.5739329268292683, "grad_norm": 1.3568607568740845, "learning_rate": 7.747915357525545e-05, "loss": 0.19806445837020875, "memory(GiB)": 153.57, "step": 4130, "token_acc": 0.9202157864849517, "train_speed(iter/s)": 0.313705 }, { "epoch": 1.5758384146341462, "grad_norm": 1.3604662418365479, "learning_rate": 7.742912228331731e-05, "loss": 0.18948907852172853, "memory(GiB)": 153.57, "step": 4135, "token_acc": 0.9277159067030889, "train_speed(iter/s)": 0.313656 }, { "epoch": 1.5777439024390243, "grad_norm": 1.4862534999847412, "learning_rate": 7.737905167405595e-05, "loss": 0.18374404907226563, "memory(GiB)": 153.57, "step": 4140, "token_acc": 0.9066731141199227, "train_speed(iter/s)": 0.313688 }, { "epoch": 1.5796493902439024, "grad_norm": 1.4313099384307861, "learning_rate": 7.732894181924335e-05, "loss": 0.24464893341064453, "memory(GiB)": 153.57, "step": 4145, "token_acc": 0.8924157984007753, "train_speed(iter/s)": 0.313726 }, { "epoch": 1.5815548780487805, "grad_norm": 0.7852306365966797, "learning_rate": 7.727879279070773e-05, "loss": 0.14259910583496094, "memory(GiB)": 153.57, "step": 4150, "token_acc": 0.9253422888142598, "train_speed(iter/s)": 0.313727 }, { "epoch": 1.5834603658536586, "grad_norm": 1.322152853012085, "learning_rate": 7.722860466033348e-05, "loss": 0.23485851287841797, "memory(GiB)": 153.57, "step": 4155, "token_acc": 0.90744920993228, "train_speed(iter/s)": 0.313661 }, { "epoch": 1.5853658536585367, "grad_norm": 1.213621735572815, "learning_rate": 7.717837750006106e-05, "loss": 0.24328668117523194, "memory(GiB)": 153.57, "step": 4160, "token_acc": 0.8978882651697407, "train_speed(iter/s)": 0.313699 }, { "epoch": 1.5872713414634148, "grad_norm": 1.9185823202133179, "learning_rate": 7.712811138188682e-05, "loss": 0.2161430835723877, "memory(GiB)": 153.57, "step": 4165, "token_acc": 0.9208287596048299, "train_speed(iter/s)": 0.313688 }, { "epoch": 1.5891768292682928, "grad_norm": 1.0261613130569458, "learning_rate": 7.7077806377863e-05, "loss": 0.15961008071899413, "memory(GiB)": 153.57, "step": 4170, "token_acc": 0.9211678832116789, "train_speed(iter/s)": 0.313744 }, { "epoch": 1.5910823170731707, "grad_norm": 1.459675908088684, "learning_rate": 7.702746256009752e-05, "loss": 0.24461760520935058, "memory(GiB)": 153.57, "step": 4175, "token_acc": 0.8942165101334651, "train_speed(iter/s)": 0.313786 }, { "epoch": 1.5929878048780488, "grad_norm": 0.9305482506752014, "learning_rate": 7.697708000075403e-05, "loss": 0.2597900152206421, "memory(GiB)": 153.57, "step": 4180, "token_acc": 0.8953583729703133, "train_speed(iter/s)": 0.3138 }, { "epoch": 1.5948932926829267, "grad_norm": 1.113081455230713, "learning_rate": 7.692665877205165e-05, "loss": 0.19535627365112304, "memory(GiB)": 153.57, "step": 4185, "token_acc": 0.9154581919160752, "train_speed(iter/s)": 0.313841 }, { "epoch": 1.5967987804878048, "grad_norm": 1.335242748260498, "learning_rate": 7.687619894626493e-05, "loss": 0.2027146577835083, "memory(GiB)": 153.57, "step": 4190, "token_acc": 0.9079106802838843, "train_speed(iter/s)": 0.313854 }, { "epoch": 1.5987042682926829, "grad_norm": 1.3586809635162354, "learning_rate": 7.682570059572374e-05, "loss": 0.20089418888092042, "memory(GiB)": 153.57, "step": 4195, "token_acc": 0.9035951454087474, "train_speed(iter/s)": 0.313884 }, { "epoch": 1.600609756097561, "grad_norm": 1.5975453853607178, "learning_rate": 7.677516379281321e-05, "loss": 0.24886775016784668, "memory(GiB)": 153.57, "step": 4200, "token_acc": 0.8887262079062958, "train_speed(iter/s)": 0.313935 }, { "epoch": 1.602515243902439, "grad_norm": 1.5560672283172607, "learning_rate": 7.672458860997357e-05, "loss": 0.23308587074279785, "memory(GiB)": 153.57, "step": 4205, "token_acc": 0.9043082021541011, "train_speed(iter/s)": 0.313936 }, { "epoch": 1.6044207317073171, "grad_norm": 1.457554578781128, "learning_rate": 7.667397511970005e-05, "loss": 0.22916111946105958, "memory(GiB)": 153.57, "step": 4210, "token_acc": 0.9093099294441577, "train_speed(iter/s)": 0.313914 }, { "epoch": 1.6063262195121952, "grad_norm": 1.5829964876174927, "learning_rate": 7.662332339454278e-05, "loss": 0.20832257270812987, "memory(GiB)": 153.57, "step": 4215, "token_acc": 0.9160855974993989, "train_speed(iter/s)": 0.313944 }, { "epoch": 1.6082317073170733, "grad_norm": 1.199296236038208, "learning_rate": 7.657263350710676e-05, "loss": 0.14886198043823243, "memory(GiB)": 153.57, "step": 4220, "token_acc": 0.9205041202132817, "train_speed(iter/s)": 0.313955 }, { "epoch": 1.6101371951219512, "grad_norm": 1.4551962614059448, "learning_rate": 7.652190553005161e-05, "loss": 0.19618771076202393, "memory(GiB)": 153.57, "step": 4225, "token_acc": 0.9152235199355618, "train_speed(iter/s)": 0.313879 }, { "epoch": 1.6120426829268293, "grad_norm": 1.1387065649032593, "learning_rate": 7.647113953609163e-05, "loss": 0.20335180759429933, "memory(GiB)": 153.57, "step": 4230, "token_acc": 0.9203864734299517, "train_speed(iter/s)": 0.313888 }, { "epoch": 1.6139481707317072, "grad_norm": 0.9711575508117676, "learning_rate": 7.642033559799552e-05, "loss": 0.23921616077423097, "memory(GiB)": 153.57, "step": 4235, "token_acc": 0.8920080930703086, "train_speed(iter/s)": 0.313823 }, { "epoch": 1.6158536585365852, "grad_norm": 1.5528905391693115, "learning_rate": 7.636949378858646e-05, "loss": 0.15672655105590821, "memory(GiB)": 153.57, "step": 4240, "token_acc": 0.9225589225589226, "train_speed(iter/s)": 0.313874 }, { "epoch": 1.6177591463414633, "grad_norm": 1.3215019702911377, "learning_rate": 7.631861418074189e-05, "loss": 0.18401331901550294, "memory(GiB)": 153.57, "step": 4245, "token_acc": 0.928402494087293, "train_speed(iter/s)": 0.313896 }, { "epoch": 1.6196646341463414, "grad_norm": 1.1514108180999756, "learning_rate": 7.626769684739337e-05, "loss": 0.17718877792358398, "memory(GiB)": 153.57, "step": 4250, "token_acc": 0.9242264786360767, "train_speed(iter/s)": 0.313903 }, { "epoch": 1.6215701219512195, "grad_norm": 1.5144273042678833, "learning_rate": 7.621674186152661e-05, "loss": 0.2201828956604004, "memory(GiB)": 153.57, "step": 4255, "token_acc": 0.9175597691673537, "train_speed(iter/s)": 0.313921 }, { "epoch": 1.6234756097560976, "grad_norm": 0.9289644956588745, "learning_rate": 7.616574929618125e-05, "loss": 0.2057976484298706, "memory(GiB)": 153.57, "step": 4260, "token_acc": 0.906258114775383, "train_speed(iter/s)": 0.313941 }, { "epoch": 1.6253810975609757, "grad_norm": 0.9918609261512756, "learning_rate": 7.611471922445085e-05, "loss": 0.2530960559844971, "memory(GiB)": 153.57, "step": 4265, "token_acc": 0.8932092004381161, "train_speed(iter/s)": 0.313977 }, { "epoch": 1.6272865853658538, "grad_norm": 0.9149210453033447, "learning_rate": 7.606365171948267e-05, "loss": 0.2286924362182617, "memory(GiB)": 153.57, "step": 4270, "token_acc": 0.9036996735582155, "train_speed(iter/s)": 0.314016 }, { "epoch": 1.6291920731707317, "grad_norm": 1.4089398384094238, "learning_rate": 7.601254685447764e-05, "loss": 0.20997021198272706, "memory(GiB)": 153.57, "step": 4275, "token_acc": 0.914567733151804, "train_speed(iter/s)": 0.314061 }, { "epoch": 1.6310975609756098, "grad_norm": 1.0784205198287964, "learning_rate": 7.596140470269029e-05, "loss": 0.21397385597229004, "memory(GiB)": 153.57, "step": 4280, "token_acc": 0.907168971546654, "train_speed(iter/s)": 0.314057 }, { "epoch": 1.6330030487804879, "grad_norm": 1.5103622674942017, "learning_rate": 7.591022533742852e-05, "loss": 0.22283687591552734, "memory(GiB)": 153.57, "step": 4285, "token_acc": 0.913856234341877, "train_speed(iter/s)": 0.314064 }, { "epoch": 1.6349085365853657, "grad_norm": 1.2210482358932495, "learning_rate": 7.585900883205364e-05, "loss": 0.20763258934020995, "memory(GiB)": 153.57, "step": 4290, "token_acc": 0.9174871073605251, "train_speed(iter/s)": 0.314097 }, { "epoch": 1.6368140243902438, "grad_norm": 1.2070786952972412, "learning_rate": 7.580775525998017e-05, "loss": 0.20769917964935303, "memory(GiB)": 153.57, "step": 4295, "token_acc": 0.9133592736705577, "train_speed(iter/s)": 0.314125 }, { "epoch": 1.638719512195122, "grad_norm": 1.3237180709838867, "learning_rate": 7.575646469467575e-05, "loss": 0.21649584770202637, "memory(GiB)": 153.57, "step": 4300, "token_acc": 0.9124288238759081, "train_speed(iter/s)": 0.314131 }, { "epoch": 1.640625, "grad_norm": 0.8049647212028503, "learning_rate": 7.570513720966108e-05, "loss": 0.23405847549438477, "memory(GiB)": 153.57, "step": 4305, "token_acc": 0.921369910885899, "train_speed(iter/s)": 0.314132 }, { "epoch": 1.642530487804878, "grad_norm": 1.8886512517929077, "learning_rate": 7.565377287850977e-05, "loss": 0.160580575466156, "memory(GiB)": 153.57, "step": 4310, "token_acc": 0.9311251314405888, "train_speed(iter/s)": 0.314157 }, { "epoch": 1.6444359756097562, "grad_norm": 1.0390665531158447, "learning_rate": 7.56023717748482e-05, "loss": 0.17904529571533204, "memory(GiB)": 153.57, "step": 4315, "token_acc": 0.916569654637175, "train_speed(iter/s)": 0.314166 }, { "epoch": 1.6463414634146343, "grad_norm": 1.6933420896530151, "learning_rate": 7.555093397235552e-05, "loss": 0.2508329629898071, "memory(GiB)": 153.57, "step": 4320, "token_acc": 0.8828781512605042, "train_speed(iter/s)": 0.314208 }, { "epoch": 1.6482469512195121, "grad_norm": 1.1816372871398926, "learning_rate": 7.549945954476347e-05, "loss": 0.23822190761566162, "memory(GiB)": 153.57, "step": 4325, "token_acc": 0.9084134143949794, "train_speed(iter/s)": 0.314235 }, { "epoch": 1.6501524390243902, "grad_norm": 1.1568915843963623, "learning_rate": 7.544794856585626e-05, "loss": 0.17965152263641357, "memory(GiB)": 153.57, "step": 4330, "token_acc": 0.9224837662337663, "train_speed(iter/s)": 0.314276 }, { "epoch": 1.6520579268292683, "grad_norm": 1.3099696636199951, "learning_rate": 7.539640110947054e-05, "loss": 0.18226674795150757, "memory(GiB)": 153.57, "step": 4335, "token_acc": 0.9232848935451166, "train_speed(iter/s)": 0.314302 }, { "epoch": 1.6539634146341462, "grad_norm": 1.0891613960266113, "learning_rate": 7.53448172494952e-05, "loss": 0.19213918447494507, "memory(GiB)": 153.57, "step": 4340, "token_acc": 0.9225157728706624, "train_speed(iter/s)": 0.314306 }, { "epoch": 1.6558689024390243, "grad_norm": 0.8302837610244751, "learning_rate": 7.529319705987135e-05, "loss": 0.2039104461669922, "memory(GiB)": 153.57, "step": 4345, "token_acc": 0.9121495327102803, "train_speed(iter/s)": 0.314343 }, { "epoch": 1.6577743902439024, "grad_norm": 1.951332449913025, "learning_rate": 7.524154061459215e-05, "loss": 0.21931180953979493, "memory(GiB)": 153.57, "step": 4350, "token_acc": 0.9055496264674493, "train_speed(iter/s)": 0.314369 }, { "epoch": 1.6596798780487805, "grad_norm": 1.4535441398620605, "learning_rate": 7.518984798770272e-05, "loss": 0.21242597103118896, "memory(GiB)": 153.57, "step": 4355, "token_acc": 0.9200171086398631, "train_speed(iter/s)": 0.31439 }, { "epoch": 1.6615853658536586, "grad_norm": 1.0375066995620728, "learning_rate": 7.51381192533001e-05, "loss": 0.19560158252716064, "memory(GiB)": 153.57, "step": 4360, "token_acc": 0.9164963503649635, "train_speed(iter/s)": 0.314439 }, { "epoch": 1.6634908536585367, "grad_norm": 1.22868013381958, "learning_rate": 7.508635448553303e-05, "loss": 0.22150719165802002, "memory(GiB)": 153.57, "step": 4365, "token_acc": 0.9020866773675762, "train_speed(iter/s)": 0.314443 }, { "epoch": 1.6653963414634148, "grad_norm": 1.3130435943603516, "learning_rate": 7.503455375860192e-05, "loss": 0.22061316967010497, "memory(GiB)": 153.57, "step": 4370, "token_acc": 0.911345496009122, "train_speed(iter/s)": 0.3143 }, { "epoch": 1.6673018292682928, "grad_norm": 1.4872994422912598, "learning_rate": 7.498271714675873e-05, "loss": 0.142090106010437, "memory(GiB)": 153.57, "step": 4375, "token_acc": 0.9348409352242238, "train_speed(iter/s)": 0.314367 }, { "epoch": 1.6692073170731707, "grad_norm": 1.5307401418685913, "learning_rate": 7.493084472430682e-05, "loss": 0.2111595869064331, "memory(GiB)": 153.57, "step": 4380, "token_acc": 0.9146013735443416, "train_speed(iter/s)": 0.314403 }, { "epoch": 1.6711128048780488, "grad_norm": 1.2353521585464478, "learning_rate": 7.487893656560096e-05, "loss": 0.2354135751724243, "memory(GiB)": 153.57, "step": 4385, "token_acc": 0.8975661375661376, "train_speed(iter/s)": 0.314406 }, { "epoch": 1.6730182926829267, "grad_norm": 2.0445265769958496, "learning_rate": 7.482699274504708e-05, "loss": 0.17831945419311523, "memory(GiB)": 153.57, "step": 4390, "token_acc": 0.9223664503245133, "train_speed(iter/s)": 0.314427 }, { "epoch": 1.6749237804878048, "grad_norm": 0.831583559513092, "learning_rate": 7.477501333710224e-05, "loss": 0.1965598464012146, "memory(GiB)": 153.57, "step": 4395, "token_acc": 0.9206608199061799, "train_speed(iter/s)": 0.314421 }, { "epoch": 1.6768292682926829, "grad_norm": 1.546514630317688, "learning_rate": 7.472299841627451e-05, "loss": 0.18101673126220702, "memory(GiB)": 153.57, "step": 4400, "token_acc": 0.9285588597571706, "train_speed(iter/s)": 0.314432 }, { "epoch": 1.678734756097561, "grad_norm": 1.0095341205596924, "learning_rate": 7.46709480571229e-05, "loss": 0.2159656047821045, "memory(GiB)": 153.57, "step": 4405, "token_acc": 0.9123711340206185, "train_speed(iter/s)": 0.314475 }, { "epoch": 1.680640243902439, "grad_norm": 0.9883344769477844, "learning_rate": 7.461886233425717e-05, "loss": 0.19895355701446532, "memory(GiB)": 153.57, "step": 4410, "token_acc": 0.9234423023190835, "train_speed(iter/s)": 0.314503 }, { "epoch": 1.6825457317073171, "grad_norm": 1.4156954288482666, "learning_rate": 7.45667413223378e-05, "loss": 0.24624016284942626, "memory(GiB)": 153.57, "step": 4415, "token_acc": 0.8949990037856147, "train_speed(iter/s)": 0.314522 }, { "epoch": 1.6844512195121952, "grad_norm": 1.1820992231369019, "learning_rate": 7.451458509607582e-05, "loss": 0.2239694595336914, "memory(GiB)": 153.57, "step": 4420, "token_acc": 0.9076040781648258, "train_speed(iter/s)": 0.314532 }, { "epoch": 1.6863567073170733, "grad_norm": 1.1639996767044067, "learning_rate": 7.44623937302328e-05, "loss": 0.25830903053283694, "memory(GiB)": 153.57, "step": 4425, "token_acc": 0.8972959563383776, "train_speed(iter/s)": 0.314569 }, { "epoch": 1.6882621951219512, "grad_norm": 0.7842183709144592, "learning_rate": 7.441016729962064e-05, "loss": 0.22606632709503174, "memory(GiB)": 153.57, "step": 4430, "token_acc": 0.9030239833159541, "train_speed(iter/s)": 0.314621 }, { "epoch": 1.6901676829268293, "grad_norm": 0.9802505970001221, "learning_rate": 7.435790587910147e-05, "loss": 0.21836037635803224, "memory(GiB)": 153.57, "step": 4435, "token_acc": 0.9220676987667279, "train_speed(iter/s)": 0.314661 }, { "epoch": 1.6920731707317072, "grad_norm": 1.1999272108078003, "learning_rate": 7.430560954358764e-05, "loss": 0.1930884003639221, "memory(GiB)": 153.57, "step": 4440, "token_acc": 0.922713529856387, "train_speed(iter/s)": 0.314654 }, { "epoch": 1.6939786585365852, "grad_norm": 1.2265654802322388, "learning_rate": 7.425327836804149e-05, "loss": 0.23937611579895018, "memory(GiB)": 153.57, "step": 4445, "token_acc": 0.9016706443914081, "train_speed(iter/s)": 0.31469 }, { "epoch": 1.6958841463414633, "grad_norm": 0.4202624261379242, "learning_rate": 7.420091242747536e-05, "loss": 0.21639952659606934, "memory(GiB)": 153.57, "step": 4450, "token_acc": 0.910581222056632, "train_speed(iter/s)": 0.314724 }, { "epoch": 1.6977896341463414, "grad_norm": 0.7577977776527405, "learning_rate": 7.414851179695137e-05, "loss": 0.1743553638458252, "memory(GiB)": 153.57, "step": 4455, "token_acc": 0.9166378415773089, "train_speed(iter/s)": 0.314756 }, { "epoch": 1.6996951219512195, "grad_norm": 1.9626224040985107, "learning_rate": 7.409607655158139e-05, "loss": 0.2704953908920288, "memory(GiB)": 153.57, "step": 4460, "token_acc": 0.8939393939393939, "train_speed(iter/s)": 0.314788 }, { "epoch": 1.7016006097560976, "grad_norm": 0.8300598859786987, "learning_rate": 7.404360676652687e-05, "loss": 0.1845728039741516, "memory(GiB)": 153.57, "step": 4465, "token_acc": 0.9287318766591791, "train_speed(iter/s)": 0.314826 }, { "epoch": 1.7035060975609757, "grad_norm": 1.268587589263916, "learning_rate": 7.399110251699887e-05, "loss": 0.21773314476013184, "memory(GiB)": 153.57, "step": 4470, "token_acc": 0.9019021739130435, "train_speed(iter/s)": 0.314863 }, { "epoch": 1.7054115853658538, "grad_norm": 1.1206554174423218, "learning_rate": 7.393856387825772e-05, "loss": 0.1940711259841919, "memory(GiB)": 153.57, "step": 4475, "token_acc": 0.9219512195121952, "train_speed(iter/s)": 0.314854 }, { "epoch": 1.7073170731707317, "grad_norm": 1.3966267108917236, "learning_rate": 7.388599092561315e-05, "loss": 0.19646750688552855, "memory(GiB)": 153.57, "step": 4480, "token_acc": 0.9050381679389313, "train_speed(iter/s)": 0.314902 }, { "epoch": 1.7092225609756098, "grad_norm": 1.7570897340774536, "learning_rate": 7.383338373442401e-05, "loss": 0.23736414909362794, "memory(GiB)": 153.57, "step": 4485, "token_acc": 0.8963973132505597, "train_speed(iter/s)": 0.31493 }, { "epoch": 1.7111280487804879, "grad_norm": 1.4502314329147339, "learning_rate": 7.378074238009826e-05, "loss": 0.17603728771209717, "memory(GiB)": 153.57, "step": 4490, "token_acc": 0.9145261293179805, "train_speed(iter/s)": 0.314892 }, { "epoch": 1.7130335365853657, "grad_norm": 1.3116272687911987, "learning_rate": 7.372806693809283e-05, "loss": 0.22251293659210206, "memory(GiB)": 153.57, "step": 4495, "token_acc": 0.9107560405300078, "train_speed(iter/s)": 0.314919 }, { "epoch": 1.7149390243902438, "grad_norm": 1.1249237060546875, "learning_rate": 7.367535748391349e-05, "loss": 0.23563849925994873, "memory(GiB)": 153.57, "step": 4500, "token_acc": 0.9052009456264776, "train_speed(iter/s)": 0.314938 }, { "epoch": 1.716844512195122, "grad_norm": 1.14506196975708, "learning_rate": 7.362261409311479e-05, "loss": 0.19208406209945678, "memory(GiB)": 153.57, "step": 4505, "token_acc": 0.9271744724652599, "train_speed(iter/s)": 0.314747 }, { "epoch": 1.71875, "grad_norm": 1.1958534717559814, "learning_rate": 7.35698368412999e-05, "loss": 0.22129263877868652, "memory(GiB)": 153.57, "step": 4510, "token_acc": 0.9056096046367211, "train_speed(iter/s)": 0.314792 }, { "epoch": 1.720655487804878, "grad_norm": 0.9949187636375427, "learning_rate": 7.351702580412053e-05, "loss": 0.2021095037460327, "memory(GiB)": 153.57, "step": 4515, "token_acc": 0.9188734762505254, "train_speed(iter/s)": 0.31471 }, { "epoch": 1.7225609756097562, "grad_norm": 1.875449299812317, "learning_rate": 7.346418105727686e-05, "loss": 0.1794981598854065, "memory(GiB)": 153.57, "step": 4520, "token_acc": 0.9196883428228948, "train_speed(iter/s)": 0.314746 }, { "epoch": 1.7244664634146343, "grad_norm": 2.063800811767578, "learning_rate": 7.341130267651735e-05, "loss": 0.25052928924560547, "memory(GiB)": 153.57, "step": 4525, "token_acc": 0.8961809417871709, "train_speed(iter/s)": 0.314782 }, { "epoch": 1.7263719512195121, "grad_norm": 1.6098219156265259, "learning_rate": 7.335839073763865e-05, "loss": 0.20038940906524658, "memory(GiB)": 153.57, "step": 4530, "token_acc": 0.9107640825432236, "train_speed(iter/s)": 0.31483 }, { "epoch": 1.7282774390243902, "grad_norm": 0.9171839952468872, "learning_rate": 7.330544531648557e-05, "loss": 0.1463776111602783, "memory(GiB)": 153.57, "step": 4535, "token_acc": 0.9336677814938684, "train_speed(iter/s)": 0.314792 }, { "epoch": 1.7301829268292683, "grad_norm": 1.4036598205566406, "learning_rate": 7.325246648895088e-05, "loss": 0.2377678394317627, "memory(GiB)": 153.57, "step": 4540, "token_acc": 0.8996593372561165, "train_speed(iter/s)": 0.314815 }, { "epoch": 1.7320884146341462, "grad_norm": 1.2897989749908447, "learning_rate": 7.319945433097524e-05, "loss": 0.22792582511901854, "memory(GiB)": 153.57, "step": 4545, "token_acc": 0.9050812483879288, "train_speed(iter/s)": 0.314865 }, { "epoch": 1.7339939024390243, "grad_norm": 1.7956656217575073, "learning_rate": 7.31464089185471e-05, "loss": 0.21067216396331787, "memory(GiB)": 153.57, "step": 4550, "token_acc": 0.9213924761370017, "train_speed(iter/s)": 0.314884 }, { "epoch": 1.7358993902439024, "grad_norm": 0.9169415831565857, "learning_rate": 7.309333032770256e-05, "loss": 0.21225352287292482, "memory(GiB)": 153.57, "step": 4555, "token_acc": 0.909665613042755, "train_speed(iter/s)": 0.314866 }, { "epoch": 1.7378048780487805, "grad_norm": 0.9787991642951965, "learning_rate": 7.304021863452524e-05, "loss": 0.19602025747299195, "memory(GiB)": 153.57, "step": 4560, "token_acc": 0.9115634005763689, "train_speed(iter/s)": 0.314882 }, { "epoch": 1.7397103658536586, "grad_norm": 1.1940335035324097, "learning_rate": 7.298707391514633e-05, "loss": 0.24276669025421144, "memory(GiB)": 153.57, "step": 4565, "token_acc": 0.8999011369253583, "train_speed(iter/s)": 0.314923 }, { "epoch": 1.7416158536585367, "grad_norm": 1.2883527278900146, "learning_rate": 7.293389624574422e-05, "loss": 0.2028367519378662, "memory(GiB)": 153.57, "step": 4570, "token_acc": 0.9062618595825427, "train_speed(iter/s)": 0.314938 }, { "epoch": 1.7435213414634148, "grad_norm": 1.5035252571105957, "learning_rate": 7.288068570254462e-05, "loss": 0.23693270683288575, "memory(GiB)": 153.57, "step": 4575, "token_acc": 0.9012578616352201, "train_speed(iter/s)": 0.314792 }, { "epoch": 1.7454268292682928, "grad_norm": 0.779818058013916, "learning_rate": 7.282744236182034e-05, "loss": 0.15117422342300416, "memory(GiB)": 153.57, "step": 4580, "token_acc": 0.9311658623317247, "train_speed(iter/s)": 0.314834 }, { "epoch": 1.7473323170731707, "grad_norm": 1.0733174085617065, "learning_rate": 7.27741662998912e-05, "loss": 0.19074219465255737, "memory(GiB)": 153.57, "step": 4585, "token_acc": 0.9079394501081248, "train_speed(iter/s)": 0.314887 }, { "epoch": 1.7492378048780488, "grad_norm": 1.6576168537139893, "learning_rate": 7.27208575931239e-05, "loss": 0.238118314743042, "memory(GiB)": 153.57, "step": 4590, "token_acc": 0.9006745754826704, "train_speed(iter/s)": 0.314819 }, { "epoch": 1.7511432926829267, "grad_norm": 1.2514431476593018, "learning_rate": 7.266751631793196e-05, "loss": 0.2552229166030884, "memory(GiB)": 153.57, "step": 4595, "token_acc": 0.8898525585429314, "train_speed(iter/s)": 0.314861 }, { "epoch": 1.7530487804878048, "grad_norm": 1.3727641105651855, "learning_rate": 7.26141425507756e-05, "loss": 0.2013338565826416, "memory(GiB)": 153.57, "step": 4600, "token_acc": 0.9109204882876938, "train_speed(iter/s)": 0.314781 }, { "epoch": 1.7549542682926829, "grad_norm": 1.2234669923782349, "learning_rate": 7.256073636816158e-05, "loss": 0.18318672180175782, "memory(GiB)": 153.57, "step": 4605, "token_acc": 0.9077939233817701, "train_speed(iter/s)": 0.314829 }, { "epoch": 1.756859756097561, "grad_norm": 1.0273747444152832, "learning_rate": 7.250729784664316e-05, "loss": 0.1766943335533142, "memory(GiB)": 153.57, "step": 4610, "token_acc": 0.9258789990497308, "train_speed(iter/s)": 0.314862 }, { "epoch": 1.758765243902439, "grad_norm": 1.6717541217803955, "learning_rate": 7.245382706281989e-05, "loss": 0.20863327980041504, "memory(GiB)": 153.57, "step": 4615, "token_acc": 0.9219798657718121, "train_speed(iter/s)": 0.314893 }, { "epoch": 1.7606707317073171, "grad_norm": 2.521141290664673, "learning_rate": 7.240032409333764e-05, "loss": 0.18264204263687134, "memory(GiB)": 153.57, "step": 4620, "token_acc": 0.9154089154089154, "train_speed(iter/s)": 0.314938 }, { "epoch": 1.7625762195121952, "grad_norm": 1.925690770149231, "learning_rate": 7.234678901488837e-05, "loss": 0.2114468812942505, "memory(GiB)": 153.57, "step": 4625, "token_acc": 0.915064935064935, "train_speed(iter/s)": 0.315001 }, { "epoch": 1.7644817073170733, "grad_norm": 1.4706602096557617, "learning_rate": 7.22932219042101e-05, "loss": 0.17161781787872316, "memory(GiB)": 153.57, "step": 4630, "token_acc": 0.9011725293132329, "train_speed(iter/s)": 0.315042 }, { "epoch": 1.7663871951219512, "grad_norm": 1.060257077217102, "learning_rate": 7.223962283808674e-05, "loss": 0.2042846202850342, "memory(GiB)": 153.57, "step": 4635, "token_acc": 0.9095399814757641, "train_speed(iter/s)": 0.315079 }, { "epoch": 1.7682926829268293, "grad_norm": 0.7154037952423096, "learning_rate": 7.218599189334799e-05, "loss": 0.21670453548431395, "memory(GiB)": 153.57, "step": 4640, "token_acc": 0.9189321577271614, "train_speed(iter/s)": 0.315016 }, { "epoch": 1.7701981707317072, "grad_norm": 1.2388092279434204, "learning_rate": 7.213232914686929e-05, "loss": 0.2583340644836426, "memory(GiB)": 153.57, "step": 4645, "token_acc": 0.8992682926829269, "train_speed(iter/s)": 0.314957 }, { "epoch": 1.7721036585365852, "grad_norm": 1.1143810749053955, "learning_rate": 7.207863467557162e-05, "loss": 0.21180176734924316, "memory(GiB)": 153.57, "step": 4650, "token_acc": 0.9151076874705235, "train_speed(iter/s)": 0.314953 }, { "epoch": 1.7740091463414633, "grad_norm": 1.2445030212402344, "learning_rate": 7.202490855642148e-05, "loss": 0.1853073477745056, "memory(GiB)": 153.57, "step": 4655, "token_acc": 0.9203561711600297, "train_speed(iter/s)": 0.314962 }, { "epoch": 1.7759146341463414, "grad_norm": 1.197721242904663, "learning_rate": 7.19711508664307e-05, "loss": 0.17873916625976563, "memory(GiB)": 153.57, "step": 4660, "token_acc": 0.9267548321464903, "train_speed(iter/s)": 0.314923 }, { "epoch": 1.7778201219512195, "grad_norm": 1.7052221298217773, "learning_rate": 7.191736168265634e-05, "loss": 0.18792204856872557, "memory(GiB)": 153.57, "step": 4665, "token_acc": 0.9178803842578246, "train_speed(iter/s)": 0.314973 }, { "epoch": 1.7797256097560976, "grad_norm": 1.4131916761398315, "learning_rate": 7.186354108220072e-05, "loss": 0.19132752418518068, "memory(GiB)": 153.57, "step": 4670, "token_acc": 0.9205856255545697, "train_speed(iter/s)": 0.314973 }, { "epoch": 1.7816310975609757, "grad_norm": 1.3540807962417603, "learning_rate": 7.180968914221103e-05, "loss": 0.179830002784729, "memory(GiB)": 153.57, "step": 4675, "token_acc": 0.9245822339489885, "train_speed(iter/s)": 0.314994 }, { "epoch": 1.7835365853658538, "grad_norm": 1.1409012079238892, "learning_rate": 7.175580593987951e-05, "loss": 0.1771461009979248, "memory(GiB)": 153.57, "step": 4680, "token_acc": 0.9236729949631926, "train_speed(iter/s)": 0.315015 }, { "epoch": 1.7854420731707317, "grad_norm": 1.1472073793411255, "learning_rate": 7.170189155244313e-05, "loss": 0.2027907133102417, "memory(GiB)": 153.57, "step": 4685, "token_acc": 0.9125560538116592, "train_speed(iter/s)": 0.31505 }, { "epoch": 1.7873475609756098, "grad_norm": 0.6036607623100281, "learning_rate": 7.164794605718366e-05, "loss": 0.2136305570602417, "memory(GiB)": 153.57, "step": 4690, "token_acc": 0.9060897435897436, "train_speed(iter/s)": 0.315072 }, { "epoch": 1.7892530487804879, "grad_norm": 1.8088209629058838, "learning_rate": 7.159396953142733e-05, "loss": 0.22774758338928222, "memory(GiB)": 153.57, "step": 4695, "token_acc": 0.9034618632141852, "train_speed(iter/s)": 0.315119 }, { "epoch": 1.7911585365853657, "grad_norm": 2.742065191268921, "learning_rate": 7.153996205254495e-05, "loss": 0.2104412317276001, "memory(GiB)": 153.57, "step": 4700, "token_acc": 0.9171998494542717, "train_speed(iter/s)": 0.315159 }, { "epoch": 1.7930640243902438, "grad_norm": 1.0207840204238892, "learning_rate": 7.148592369795165e-05, "loss": 0.2064544677734375, "memory(GiB)": 153.57, "step": 4705, "token_acc": 0.9076508257499157, "train_speed(iter/s)": 0.315163 }, { "epoch": 1.794969512195122, "grad_norm": 1.1725687980651855, "learning_rate": 7.143185454510686e-05, "loss": 0.2295001745223999, "memory(GiB)": 153.57, "step": 4710, "token_acc": 0.9018433179723502, "train_speed(iter/s)": 0.315203 }, { "epoch": 1.796875, "grad_norm": 0.9802600741386414, "learning_rate": 7.137775467151411e-05, "loss": 0.22910988330841064, "memory(GiB)": 153.57, "step": 4715, "token_acc": 0.9074116305587229, "train_speed(iter/s)": 0.315211 }, { "epoch": 1.798780487804878, "grad_norm": 1.155203104019165, "learning_rate": 7.1323624154721e-05, "loss": 0.2113737106323242, "memory(GiB)": 153.57, "step": 4720, "token_acc": 0.9021872695994102, "train_speed(iter/s)": 0.31524 }, { "epoch": 1.8006859756097562, "grad_norm": 0.9621526002883911, "learning_rate": 7.126946307231901e-05, "loss": 0.21349632740020752, "memory(GiB)": 153.57, "step": 4725, "token_acc": 0.9125560538116592, "train_speed(iter/s)": 0.315245 }, { "epoch": 1.8025914634146343, "grad_norm": 0.9597629308700562, "learning_rate": 7.121527150194349e-05, "loss": 0.18741828203201294, "memory(GiB)": 153.57, "step": 4730, "token_acc": 0.9117851354480204, "train_speed(iter/s)": 0.315279 }, { "epoch": 1.8044969512195121, "grad_norm": 1.1130253076553345, "learning_rate": 7.116104952127347e-05, "loss": 0.18731274604797363, "memory(GiB)": 153.57, "step": 4735, "token_acc": 0.9128680919725696, "train_speed(iter/s)": 0.315263 }, { "epoch": 1.8064024390243902, "grad_norm": 1.1767582893371582, "learning_rate": 7.110679720803156e-05, "loss": 0.22751517295837403, "memory(GiB)": 153.57, "step": 4740, "token_acc": 0.9002687791766869, "train_speed(iter/s)": 0.315259 }, { "epoch": 1.8083079268292683, "grad_norm": 0.9792355895042419, "learning_rate": 7.105251463998384e-05, "loss": 0.199342679977417, "memory(GiB)": 153.57, "step": 4745, "token_acc": 0.926860025220681, "train_speed(iter/s)": 0.31522 }, { "epoch": 1.8102134146341462, "grad_norm": 0.9510716795921326, "learning_rate": 7.099820189493977e-05, "loss": 0.23098354339599608, "memory(GiB)": 153.57, "step": 4750, "token_acc": 0.8993301079270561, "train_speed(iter/s)": 0.315217 }, { "epoch": 1.8121189024390243, "grad_norm": 0.9536182284355164, "learning_rate": 7.09438590507521e-05, "loss": 0.18952903747558594, "memory(GiB)": 153.57, "step": 4755, "token_acc": 0.9194966040751099, "train_speed(iter/s)": 0.315237 }, { "epoch": 1.8140243902439024, "grad_norm": 1.470848560333252, "learning_rate": 7.088948618531667e-05, "loss": 0.19961965084075928, "memory(GiB)": 153.57, "step": 4760, "token_acc": 0.915929203539823, "train_speed(iter/s)": 0.315267 }, { "epoch": 1.8159298780487805, "grad_norm": 0.9799200892448425, "learning_rate": 7.083508337657239e-05, "loss": 0.2627227306365967, "memory(GiB)": 153.57, "step": 4765, "token_acc": 0.9101487981686379, "train_speed(iter/s)": 0.315288 }, { "epoch": 1.8178353658536586, "grad_norm": 1.2690006494522095, "learning_rate": 7.078065070250106e-05, "loss": 0.18258092403411866, "memory(GiB)": 153.57, "step": 4770, "token_acc": 0.9186637618010167, "train_speed(iter/s)": 0.315294 }, { "epoch": 1.8197408536585367, "grad_norm": 1.5024844408035278, "learning_rate": 7.072618824112733e-05, "loss": 0.2425856113433838, "memory(GiB)": 153.57, "step": 4775, "token_acc": 0.9200863930885529, "train_speed(iter/s)": 0.315288 }, { "epoch": 1.8216463414634148, "grad_norm": 1.189483404159546, "learning_rate": 7.067169607051851e-05, "loss": 0.18137468099594117, "memory(GiB)": 153.57, "step": 4780, "token_acc": 0.9200325291406886, "train_speed(iter/s)": 0.31531 }, { "epoch": 1.8235518292682928, "grad_norm": 1.4395270347595215, "learning_rate": 7.061717426878452e-05, "loss": 0.20234062671661376, "memory(GiB)": 153.57, "step": 4785, "token_acc": 0.9135959339263025, "train_speed(iter/s)": 0.315353 }, { "epoch": 1.8254573170731707, "grad_norm": 1.3285664319992065, "learning_rate": 7.056262291407772e-05, "loss": 0.23424062728881836, "memory(GiB)": 153.57, "step": 4790, "token_acc": 0.9004509191814083, "train_speed(iter/s)": 0.315391 }, { "epoch": 1.8273628048780488, "grad_norm": 1.273376703262329, "learning_rate": 7.05080420845929e-05, "loss": 0.1690147876739502, "memory(GiB)": 153.57, "step": 4795, "token_acc": 0.925142999253917, "train_speed(iter/s)": 0.31543 }, { "epoch": 1.8292682926829267, "grad_norm": 1.552180290222168, "learning_rate": 7.045343185856701e-05, "loss": 0.2383031129837036, "memory(GiB)": 153.57, "step": 4800, "token_acc": 0.9053627760252366, "train_speed(iter/s)": 0.31535 }, { "epoch": 1.8292682926829267, "eval_loss": 0.1832420974969864, "eval_runtime": 33.9477, "eval_samples_per_second": 3.122, "eval_steps_per_second": 3.122, "eval_token_acc": 0.9026401810409856, "step": 4800 }, { "epoch": 1.8311737804878048, "grad_norm": 1.0496562719345093, "learning_rate": 7.03987923142792e-05, "loss": 0.2143018960952759, "memory(GiB)": 153.57, "step": 4805, "token_acc": 0.9042714201631211, "train_speed(iter/s)": 0.314568 }, { "epoch": 1.8330792682926829, "grad_norm": 0.8746898174285889, "learning_rate": 7.034412353005063e-05, "loss": 0.20459563732147218, "memory(GiB)": 153.57, "step": 4810, "token_acc": 0.9107799852832965, "train_speed(iter/s)": 0.314545 }, { "epoch": 1.834984756097561, "grad_norm": 1.6519426107406616, "learning_rate": 7.028942558424436e-05, "loss": 0.2744938373565674, "memory(GiB)": 153.57, "step": 4815, "token_acc": 0.886847599164927, "train_speed(iter/s)": 0.314477 }, { "epoch": 1.836890243902439, "grad_norm": 1.272260308265686, "learning_rate": 7.02346985552653e-05, "loss": 0.2083333969116211, "memory(GiB)": 153.57, "step": 4820, "token_acc": 0.9137443438914027, "train_speed(iter/s)": 0.314434 }, { "epoch": 1.8387957317073171, "grad_norm": 1.1579340696334839, "learning_rate": 7.017994252155996e-05, "loss": 0.2088139295578003, "memory(GiB)": 153.57, "step": 4825, "token_acc": 0.9137893593919653, "train_speed(iter/s)": 0.314463 }, { "epoch": 1.8407012195121952, "grad_norm": 1.5320556163787842, "learning_rate": 7.01251575616165e-05, "loss": 0.20255773067474364, "memory(GiB)": 153.57, "step": 4830, "token_acc": 0.917893243482887, "train_speed(iter/s)": 0.314476 }, { "epoch": 1.8426067073170733, "grad_norm": 1.2632861137390137, "learning_rate": 7.007034375396453e-05, "loss": 0.13069710731506348, "memory(GiB)": 153.57, "step": 4835, "token_acc": 0.9281862049389724, "train_speed(iter/s)": 0.314434 }, { "epoch": 1.8445121951219512, "grad_norm": 0.8786910176277161, "learning_rate": 7.0015501177175e-05, "loss": 0.20844452381134032, "memory(GiB)": 153.57, "step": 4840, "token_acc": 0.913197869402249, "train_speed(iter/s)": 0.314448 }, { "epoch": 1.8464176829268293, "grad_norm": 0.6315520405769348, "learning_rate": 6.996062990986006e-05, "loss": 0.17583916187286378, "memory(GiB)": 153.57, "step": 4845, "token_acc": 0.92268752876208, "train_speed(iter/s)": 0.314506 }, { "epoch": 1.8483231707317072, "grad_norm": 1.2429978847503662, "learning_rate": 6.990573003067304e-05, "loss": 0.1868835210800171, "memory(GiB)": 153.57, "step": 4850, "token_acc": 0.9269102990033222, "train_speed(iter/s)": 0.314546 }, { "epoch": 1.8502286585365852, "grad_norm": 1.3580161333084106, "learning_rate": 6.985080161830829e-05, "loss": 0.19349400997161864, "memory(GiB)": 153.57, "step": 4855, "token_acc": 0.9258373205741627, "train_speed(iter/s)": 0.31459 }, { "epoch": 1.8521341463414633, "grad_norm": 1.59812331199646, "learning_rate": 6.979584475150103e-05, "loss": 0.24012088775634766, "memory(GiB)": 153.57, "step": 4860, "token_acc": 0.8991627229705133, "train_speed(iter/s)": 0.314638 }, { "epoch": 1.8540396341463414, "grad_norm": 0.9004155993461609, "learning_rate": 6.974085950902726e-05, "loss": 0.21862783432006835, "memory(GiB)": 153.57, "step": 4865, "token_acc": 0.9041811846689896, "train_speed(iter/s)": 0.314679 }, { "epoch": 1.8559451219512195, "grad_norm": 2.7025558948516846, "learning_rate": 6.968584596970364e-05, "loss": 0.1784755229949951, "memory(GiB)": 153.57, "step": 4870, "token_acc": 0.907010428736964, "train_speed(iter/s)": 0.314716 }, { "epoch": 1.8578506097560976, "grad_norm": 1.1142921447753906, "learning_rate": 6.963080421238749e-05, "loss": 0.23456552028656005, "memory(GiB)": 153.57, "step": 4875, "token_acc": 0.905952380952381, "train_speed(iter/s)": 0.314726 }, { "epoch": 1.8597560975609757, "grad_norm": 1.4472969770431519, "learning_rate": 6.957573431597646e-05, "loss": 0.18027958869934083, "memory(GiB)": 153.57, "step": 4880, "token_acc": 0.9197988353626257, "train_speed(iter/s)": 0.314755 }, { "epoch": 1.8616615853658538, "grad_norm": 1.19624924659729, "learning_rate": 6.952063635940858e-05, "loss": 0.17553585767745972, "memory(GiB)": 153.57, "step": 4885, "token_acc": 0.9266953553939754, "train_speed(iter/s)": 0.314601 }, { "epoch": 1.8635670731707317, "grad_norm": 1.0159894227981567, "learning_rate": 6.946551042166209e-05, "loss": 0.19503729343414306, "memory(GiB)": 153.57, "step": 4890, "token_acc": 0.9271054493984431, "train_speed(iter/s)": 0.314634 }, { "epoch": 1.8654725609756098, "grad_norm": 0.7548432946205139, "learning_rate": 6.94103565817554e-05, "loss": 0.18535857200622557, "memory(GiB)": 153.57, "step": 4895, "token_acc": 0.9281701444622793, "train_speed(iter/s)": 0.314658 }, { "epoch": 1.8673780487804879, "grad_norm": 1.0107831954956055, "learning_rate": 6.935517491874683e-05, "loss": 0.21640949249267577, "memory(GiB)": 153.57, "step": 4900, "token_acc": 0.9084640989063243, "train_speed(iter/s)": 0.314685 }, { "epoch": 1.8692835365853657, "grad_norm": 1.1442503929138184, "learning_rate": 6.929996551173464e-05, "loss": 0.21545689105987548, "memory(GiB)": 153.57, "step": 4905, "token_acc": 0.9107402031930334, "train_speed(iter/s)": 0.31462 }, { "epoch": 1.8711890243902438, "grad_norm": 1.9041807651519775, "learning_rate": 6.92447284398568e-05, "loss": 0.15587425231933594, "memory(GiB)": 153.57, "step": 4910, "token_acc": 0.9363557105492589, "train_speed(iter/s)": 0.31467 }, { "epoch": 1.873094512195122, "grad_norm": 1.2923179864883423, "learning_rate": 6.918946378229103e-05, "loss": 0.17438627481460572, "memory(GiB)": 153.57, "step": 4915, "token_acc": 0.9224500525762356, "train_speed(iter/s)": 0.314617 }, { "epoch": 1.875, "grad_norm": 0.9943439364433289, "learning_rate": 6.91341716182545e-05, "loss": 0.205202054977417, "memory(GiB)": 153.57, "step": 4920, "token_acc": 0.9208269188475918, "train_speed(iter/s)": 0.314636 }, { "epoch": 1.876905487804878, "grad_norm": 2.2986297607421875, "learning_rate": 6.907885202700383e-05, "loss": 0.2481520175933838, "memory(GiB)": 153.57, "step": 4925, "token_acc": 0.898989898989899, "train_speed(iter/s)": 0.314685 }, { "epoch": 1.8788109756097562, "grad_norm": 0.7821398973464966, "learning_rate": 6.902350508783502e-05, "loss": 0.15415924787521362, "memory(GiB)": 153.57, "step": 4930, "token_acc": 0.9268178629880758, "train_speed(iter/s)": 0.314651 }, { "epoch": 1.8807164634146343, "grad_norm": 0.9643509387969971, "learning_rate": 6.896813088008315e-05, "loss": 0.1911299705505371, "memory(GiB)": 153.57, "step": 4935, "token_acc": 0.9162213740458015, "train_speed(iter/s)": 0.314652 }, { "epoch": 1.8826219512195121, "grad_norm": 1.7413806915283203, "learning_rate": 6.89127294831225e-05, "loss": 0.24440443515777588, "memory(GiB)": 153.57, "step": 4940, "token_acc": 0.9088669950738916, "train_speed(iter/s)": 0.314586 }, { "epoch": 1.8845274390243902, "grad_norm": 0.9801636934280396, "learning_rate": 6.885730097636629e-05, "loss": 0.21207551956176757, "memory(GiB)": 153.57, "step": 4945, "token_acc": 0.9209100758396533, "train_speed(iter/s)": 0.314568 }, { "epoch": 1.8864329268292683, "grad_norm": 1.1391537189483643, "learning_rate": 6.880184543926655e-05, "loss": 0.22446730136871337, "memory(GiB)": 153.57, "step": 4950, "token_acc": 0.8977437383564479, "train_speed(iter/s)": 0.314584 }, { "epoch": 1.8883384146341462, "grad_norm": 1.0503562688827515, "learning_rate": 6.874636295131411e-05, "loss": 0.20353665351867675, "memory(GiB)": 153.57, "step": 4955, "token_acc": 0.9126693518857562, "train_speed(iter/s)": 0.314609 }, { "epoch": 1.8902439024390243, "grad_norm": 0.9949665069580078, "learning_rate": 6.869085359203844e-05, "loss": 0.18287686109542847, "memory(GiB)": 153.57, "step": 4960, "token_acc": 0.9252291365171249, "train_speed(iter/s)": 0.314624 }, { "epoch": 1.8921493902439024, "grad_norm": 1.9515154361724854, "learning_rate": 6.863531744100749e-05, "loss": 0.22859957218170165, "memory(GiB)": 153.57, "step": 4965, "token_acc": 0.881538822425522, "train_speed(iter/s)": 0.314651 }, { "epoch": 1.8940548780487805, "grad_norm": 1.0027538537979126, "learning_rate": 6.85797545778276e-05, "loss": 0.1797175884246826, "memory(GiB)": 153.57, "step": 4970, "token_acc": 0.9086966995904602, "train_speed(iter/s)": 0.314683 }, { "epoch": 1.8959603658536586, "grad_norm": 1.6341501474380493, "learning_rate": 6.852416508214345e-05, "loss": 0.1794668197631836, "memory(GiB)": 153.57, "step": 4975, "token_acc": 0.9248275862068965, "train_speed(iter/s)": 0.314725 }, { "epoch": 1.8978658536585367, "grad_norm": 1.6102640628814697, "learning_rate": 6.84685490336379e-05, "loss": 0.202622127532959, "memory(GiB)": 153.57, "step": 4980, "token_acc": 0.9111524163568773, "train_speed(iter/s)": 0.31474 }, { "epoch": 1.8997713414634148, "grad_norm": 1.1518466472625732, "learning_rate": 6.84129065120318e-05, "loss": 0.18839445114135742, "memory(GiB)": 153.57, "step": 4985, "token_acc": 0.9104210212003583, "train_speed(iter/s)": 0.314789 }, { "epoch": 1.9016768292682928, "grad_norm": 0.38556942343711853, "learning_rate": 6.835723759708401e-05, "loss": 0.17948442697525024, "memory(GiB)": 153.57, "step": 4990, "token_acc": 0.9322235434007135, "train_speed(iter/s)": 0.314818 }, { "epoch": 1.9035823170731707, "grad_norm": 1.5773354768753052, "learning_rate": 6.83015423685912e-05, "loss": 0.19944019317626954, "memory(GiB)": 153.57, "step": 4995, "token_acc": 0.9018036072144289, "train_speed(iter/s)": 0.314839 }, { "epoch": 1.9054878048780488, "grad_norm": 1.1488503217697144, "learning_rate": 6.824582090638777e-05, "loss": 0.18115735054016113, "memory(GiB)": 153.57, "step": 5000, "token_acc": 0.9289923394225104, "train_speed(iter/s)": 0.314716 }, { "epoch": 1.9073932926829267, "grad_norm": 0.7579629421234131, "learning_rate": 6.819007329034571e-05, "loss": 0.2073660135269165, "memory(GiB)": 153.57, "step": 5005, "token_acc": 0.9236829148202855, "train_speed(iter/s)": 0.314575 }, { "epoch": 1.9092987804878048, "grad_norm": 1.418177843093872, "learning_rate": 6.81342996003745e-05, "loss": 0.21747195720672607, "memory(GiB)": 153.57, "step": 5010, "token_acc": 0.9097151576805697, "train_speed(iter/s)": 0.31459 }, { "epoch": 1.9112042682926829, "grad_norm": 1.225791096687317, "learning_rate": 6.807849991642101e-05, "loss": 0.21432268619537354, "memory(GiB)": 153.57, "step": 5015, "token_acc": 0.9156913740788903, "train_speed(iter/s)": 0.314601 }, { "epoch": 1.913109756097561, "grad_norm": 0.8856857419013977, "learning_rate": 6.802267431846934e-05, "loss": 0.17412679195404052, "memory(GiB)": 153.57, "step": 5020, "token_acc": 0.9205240174672489, "train_speed(iter/s)": 0.314619 }, { "epoch": 1.915015243902439, "grad_norm": 1.1511757373809814, "learning_rate": 6.79668228865408e-05, "loss": 0.21878385543823242, "memory(GiB)": 153.57, "step": 5025, "token_acc": 0.9154239019407558, "train_speed(iter/s)": 0.31464 }, { "epoch": 1.9169207317073171, "grad_norm": 1.610443115234375, "learning_rate": 6.791094570069365e-05, "loss": 0.24131205081939697, "memory(GiB)": 153.57, "step": 5030, "token_acc": 0.900523560209424, "train_speed(iter/s)": 0.314671 }, { "epoch": 1.9188262195121952, "grad_norm": 1.0308468341827393, "learning_rate": 6.785504284102308e-05, "loss": 0.20265281200408936, "memory(GiB)": 153.57, "step": 5035, "token_acc": 0.910659114315139, "train_speed(iter/s)": 0.314681 }, { "epoch": 1.9207317073170733, "grad_norm": 0.9227601289749146, "learning_rate": 6.779911438766116e-05, "loss": 0.1873704433441162, "memory(GiB)": 153.57, "step": 5040, "token_acc": 0.9238997019112748, "train_speed(iter/s)": 0.314677 }, { "epoch": 1.9226371951219512, "grad_norm": 1.4967421293258667, "learning_rate": 6.774316042077658e-05, "loss": 0.1721649646759033, "memory(GiB)": 153.57, "step": 5045, "token_acc": 0.9160142348754449, "train_speed(iter/s)": 0.314725 }, { "epoch": 1.9245426829268293, "grad_norm": 0.97368985414505, "learning_rate": 6.768718102057457e-05, "loss": 0.1816102981567383, "memory(GiB)": 153.57, "step": 5050, "token_acc": 0.9194508009153318, "train_speed(iter/s)": 0.314749 }, { "epoch": 1.9264481707317072, "grad_norm": 1.2913367748260498, "learning_rate": 6.763117626729686e-05, "loss": 0.21731100082397461, "memory(GiB)": 153.57, "step": 5055, "token_acc": 0.912031649988364, "train_speed(iter/s)": 0.31462 }, { "epoch": 1.9283536585365852, "grad_norm": 0.7688092589378357, "learning_rate": 6.757514624122158e-05, "loss": 0.15437638759613037, "memory(GiB)": 153.57, "step": 5060, "token_acc": 0.9329968673860076, "train_speed(iter/s)": 0.314619 }, { "epoch": 1.9302591463414633, "grad_norm": 1.2990503311157227, "learning_rate": 6.751909102266299e-05, "loss": 0.2615509033203125, "memory(GiB)": 153.57, "step": 5065, "token_acc": 0.8844610091743119, "train_speed(iter/s)": 0.314655 }, { "epoch": 1.9321646341463414, "grad_norm": 1.0534157752990723, "learning_rate": 6.746301069197148e-05, "loss": 0.1891866683959961, "memory(GiB)": 153.57, "step": 5070, "token_acc": 0.9191499191499192, "train_speed(iter/s)": 0.314684 }, { "epoch": 1.9340701219512195, "grad_norm": 0.9463886022567749, "learning_rate": 6.740690532953348e-05, "loss": 0.19855825901031493, "memory(GiB)": 153.57, "step": 5075, "token_acc": 0.9052827073875361, "train_speed(iter/s)": 0.314714 }, { "epoch": 1.9359756097560976, "grad_norm": 1.0111576318740845, "learning_rate": 6.735077501577126e-05, "loss": 0.20453715324401855, "memory(GiB)": 153.57, "step": 5080, "token_acc": 0.9118797611694461, "train_speed(iter/s)": 0.314721 }, { "epoch": 1.9378810975609757, "grad_norm": 0.8723740577697754, "learning_rate": 6.729461983114287e-05, "loss": 0.21995718479156495, "memory(GiB)": 153.57, "step": 5085, "token_acc": 0.9114939114939115, "train_speed(iter/s)": 0.314777 }, { "epoch": 1.9397865853658538, "grad_norm": 1.054841160774231, "learning_rate": 6.723843985614201e-05, "loss": 0.271032977104187, "memory(GiB)": 153.57, "step": 5090, "token_acc": 0.891279468986038, "train_speed(iter/s)": 0.314801 }, { "epoch": 1.9416920731707317, "grad_norm": 1.3072396516799927, "learning_rate": 6.718223517129792e-05, "loss": 0.23691129684448242, "memory(GiB)": 153.57, "step": 5095, "token_acc": 0.9040675364543361, "train_speed(iter/s)": 0.314825 }, { "epoch": 1.9435975609756098, "grad_norm": 1.581091046333313, "learning_rate": 6.712600585717525e-05, "loss": 0.19795191287994385, "memory(GiB)": 153.57, "step": 5100, "token_acc": 0.919931856899489, "train_speed(iter/s)": 0.314764 }, { "epoch": 1.9455030487804879, "grad_norm": 1.0106184482574463, "learning_rate": 6.706975199437397e-05, "loss": 0.21159684658050537, "memory(GiB)": 153.57, "step": 5105, "token_acc": 0.9022589052997394, "train_speed(iter/s)": 0.314803 }, { "epoch": 1.9474085365853657, "grad_norm": 1.2942981719970703, "learning_rate": 6.701347366352922e-05, "loss": 0.19464614391326904, "memory(GiB)": 153.57, "step": 5110, "token_acc": 0.9138861138861138, "train_speed(iter/s)": 0.314821 }, { "epoch": 1.9493140243902438, "grad_norm": 1.2144663333892822, "learning_rate": 6.695717094531123e-05, "loss": 0.23281564712524414, "memory(GiB)": 153.57, "step": 5115, "token_acc": 0.892229069592665, "train_speed(iter/s)": 0.314815 }, { "epoch": 1.951219512195122, "grad_norm": 1.6719708442687988, "learning_rate": 6.690084392042513e-05, "loss": 0.1922488570213318, "memory(GiB)": 153.57, "step": 5120, "token_acc": 0.9258683255572836, "train_speed(iter/s)": 0.314846 }, { "epoch": 1.953125, "grad_norm": 1.3851492404937744, "learning_rate": 6.6844492669611e-05, "loss": 0.2037435531616211, "memory(GiB)": 153.57, "step": 5125, "token_acc": 0.9095171120487576, "train_speed(iter/s)": 0.314902 }, { "epoch": 1.955030487804878, "grad_norm": 1.2233893871307373, "learning_rate": 6.678811727364355e-05, "loss": 0.2086869716644287, "memory(GiB)": 153.57, "step": 5130, "token_acc": 0.9147766323024055, "train_speed(iter/s)": 0.314883 }, { "epoch": 1.9569359756097562, "grad_norm": 0.9366785883903503, "learning_rate": 6.673171781333217e-05, "loss": 0.2300556182861328, "memory(GiB)": 153.57, "step": 5135, "token_acc": 0.9178302900107411, "train_speed(iter/s)": 0.314833 }, { "epoch": 1.9588414634146343, "grad_norm": 1.0439931154251099, "learning_rate": 6.667529436952063e-05, "loss": 0.16076194047927855, "memory(GiB)": 153.57, "step": 5140, "token_acc": 0.9329044117647058, "train_speed(iter/s)": 0.314777 }, { "epoch": 1.9607469512195121, "grad_norm": 1.2146488428115845, "learning_rate": 6.661884702308725e-05, "loss": 0.20047991275787352, "memory(GiB)": 153.57, "step": 5145, "token_acc": 0.9141414141414141, "train_speed(iter/s)": 0.314817 }, { "epoch": 1.9626524390243902, "grad_norm": 1.04020357131958, "learning_rate": 6.656237585494448e-05, "loss": 0.16403684616088868, "memory(GiB)": 153.57, "step": 5150, "token_acc": 0.9081493062341216, "train_speed(iter/s)": 0.314843 }, { "epoch": 1.9645579268292683, "grad_norm": 1.3909624814987183, "learning_rate": 6.650588094603892e-05, "loss": 0.1868462085723877, "memory(GiB)": 153.57, "step": 5155, "token_acc": 0.9226050023596036, "train_speed(iter/s)": 0.314865 }, { "epoch": 1.9664634146341462, "grad_norm": 0.9200443625450134, "learning_rate": 6.644936237735128e-05, "loss": 0.20707204341888427, "memory(GiB)": 153.57, "step": 5160, "token_acc": 0.9218106995884774, "train_speed(iter/s)": 0.314683 }, { "epoch": 1.9683689024390243, "grad_norm": 1.0813096761703491, "learning_rate": 6.639282022989614e-05, "loss": 0.22788312435150146, "memory(GiB)": 153.57, "step": 5165, "token_acc": 0.9052754982415006, "train_speed(iter/s)": 0.31469 }, { "epoch": 1.9702743902439024, "grad_norm": 1.8623137474060059, "learning_rate": 6.633625458472187e-05, "loss": 0.21102485656738282, "memory(GiB)": 153.57, "step": 5170, "token_acc": 0.9287612971823498, "train_speed(iter/s)": 0.314706 }, { "epoch": 1.9721798780487805, "grad_norm": 1.4873621463775635, "learning_rate": 6.627966552291052e-05, "loss": 0.16630241870880128, "memory(GiB)": 153.57, "step": 5175, "token_acc": 0.9267015706806283, "train_speed(iter/s)": 0.314756 }, { "epoch": 1.9740853658536586, "grad_norm": 1.4487882852554321, "learning_rate": 6.622305312557773e-05, "loss": 0.19219690561294556, "memory(GiB)": 153.57, "step": 5180, "token_acc": 0.9260765550239235, "train_speed(iter/s)": 0.314779 }, { "epoch": 1.9759908536585367, "grad_norm": 0.7223525047302246, "learning_rate": 6.616641747387257e-05, "loss": 0.1737621545791626, "memory(GiB)": 153.57, "step": 5185, "token_acc": 0.9189443920829407, "train_speed(iter/s)": 0.31473 }, { "epoch": 1.9778963414634148, "grad_norm": 0.7507862448692322, "learning_rate": 6.610975864897746e-05, "loss": 0.173081111907959, "memory(GiB)": 153.57, "step": 5190, "token_acc": 0.9252152521525215, "train_speed(iter/s)": 0.314743 }, { "epoch": 1.9798018292682928, "grad_norm": 1.1470071077346802, "learning_rate": 6.605307673210801e-05, "loss": 0.21953170299530028, "memory(GiB)": 153.57, "step": 5195, "token_acc": 0.9025938189845475, "train_speed(iter/s)": 0.314788 }, { "epoch": 1.9817073170731707, "grad_norm": 1.2378381490707397, "learning_rate": 6.599637180451294e-05, "loss": 0.23548038005828859, "memory(GiB)": 153.57, "step": 5200, "token_acc": 0.8931027297353615, "train_speed(iter/s)": 0.314826 }, { "epoch": 1.9836128048780488, "grad_norm": 0.9113010168075562, "learning_rate": 6.593964394747399e-05, "loss": 0.2329866886138916, "memory(GiB)": 153.57, "step": 5205, "token_acc": 0.908581179912238, "train_speed(iter/s)": 0.314845 }, { "epoch": 1.9855182926829267, "grad_norm": 0.9979197382926941, "learning_rate": 6.588289324230573e-05, "loss": 0.18097052574157715, "memory(GiB)": 153.57, "step": 5210, "token_acc": 0.9212719298245614, "train_speed(iter/s)": 0.314848 }, { "epoch": 1.9874237804878048, "grad_norm": 1.5010836124420166, "learning_rate": 6.582611977035546e-05, "loss": 0.22901277542114257, "memory(GiB)": 153.57, "step": 5215, "token_acc": 0.9054964539007092, "train_speed(iter/s)": 0.314853 }, { "epoch": 1.9893292682926829, "grad_norm": 1.4459878206253052, "learning_rate": 6.576932361300315e-05, "loss": 0.17088087797164916, "memory(GiB)": 153.57, "step": 5220, "token_acc": 0.9184617608789938, "train_speed(iter/s)": 0.314839 }, { "epoch": 1.991234756097561, "grad_norm": 1.0889779329299927, "learning_rate": 6.571250485166131e-05, "loss": 0.2154294490814209, "memory(GiB)": 153.57, "step": 5225, "token_acc": 0.9110986860497624, "train_speed(iter/s)": 0.314763 }, { "epoch": 1.993140243902439, "grad_norm": 0.8559303879737854, "learning_rate": 6.56556635677748e-05, "loss": 0.21972329616546632, "memory(GiB)": 153.57, "step": 5230, "token_acc": 0.9240581429842777, "train_speed(iter/s)": 0.314756 }, { "epoch": 1.9950457317073171, "grad_norm": 1.3480780124664307, "learning_rate": 6.559879984282079e-05, "loss": 0.17652004957199097, "memory(GiB)": 153.57, "step": 5235, "token_acc": 0.9239450441609421, "train_speed(iter/s)": 0.314689 }, { "epoch": 1.9969512195121952, "grad_norm": 1.0217713117599487, "learning_rate": 6.55419137583086e-05, "loss": 0.24434142112731932, "memory(GiB)": 153.57, "step": 5240, "token_acc": 0.9059056230573609, "train_speed(iter/s)": 0.314703 }, { "epoch": 1.9988567073170733, "grad_norm": 1.7811146974563599, "learning_rate": 6.548500539577964e-05, "loss": 0.2368468999862671, "memory(GiB)": 153.57, "step": 5245, "token_acc": 0.9066175428652402, "train_speed(iter/s)": 0.314659 }, { "epoch": 2.0007621951219514, "grad_norm": 1.1155062913894653, "learning_rate": 6.54280748368072e-05, "loss": 0.14532467126846313, "memory(GiB)": 153.57, "step": 5250, "token_acc": 0.931497649429147, "train_speed(iter/s)": 0.314625 }, { "epoch": 2.002667682926829, "grad_norm": 0.6737852692604065, "learning_rate": 6.537112216299643e-05, "loss": 0.1435774326324463, "memory(GiB)": 153.57, "step": 5255, "token_acc": 0.9376181474480151, "train_speed(iter/s)": 0.314617 }, { "epoch": 2.004573170731707, "grad_norm": 1.2940014600753784, "learning_rate": 6.531414745598416e-05, "loss": 0.17416806221008302, "memory(GiB)": 153.57, "step": 5260, "token_acc": 0.92407277816655, "train_speed(iter/s)": 0.314545 }, { "epoch": 2.0064786585365852, "grad_norm": 0.7342056035995483, "learning_rate": 6.525715079743879e-05, "loss": 0.15006937980651855, "memory(GiB)": 153.57, "step": 5265, "token_acc": 0.9387964674471144, "train_speed(iter/s)": 0.31448 }, { "epoch": 2.0083841463414633, "grad_norm": 1.1393882036209106, "learning_rate": 6.52001322690602e-05, "loss": 0.16660547256469727, "memory(GiB)": 153.57, "step": 5270, "token_acc": 0.9388087009554787, "train_speed(iter/s)": 0.3145 }, { "epoch": 2.0102896341463414, "grad_norm": 0.9243036508560181, "learning_rate": 6.514309195257967e-05, "loss": 0.1631234645843506, "memory(GiB)": 153.57, "step": 5275, "token_acc": 0.9295089936801166, "train_speed(iter/s)": 0.314519 }, { "epoch": 2.0121951219512195, "grad_norm": 1.6543718576431274, "learning_rate": 6.508602992975963e-05, "loss": 0.17114875316619874, "memory(GiB)": 153.57, "step": 5280, "token_acc": 0.9291845493562232, "train_speed(iter/s)": 0.314545 }, { "epoch": 2.0141006097560976, "grad_norm": 1.5296275615692139, "learning_rate": 6.502894628239362e-05, "loss": 0.18954846858978272, "memory(GiB)": 153.57, "step": 5285, "token_acc": 0.9272065514103731, "train_speed(iter/s)": 0.314566 }, { "epoch": 2.0160060975609757, "grad_norm": 1.2968790531158447, "learning_rate": 6.497184109230628e-05, "loss": 0.2158041477203369, "memory(GiB)": 153.57, "step": 5290, "token_acc": 0.9048231511254019, "train_speed(iter/s)": 0.314591 }, { "epoch": 2.017911585365854, "grad_norm": 1.2421761751174927, "learning_rate": 6.4914714441353e-05, "loss": 0.16348893642425538, "memory(GiB)": 153.57, "step": 5295, "token_acc": 0.9327272727272727, "train_speed(iter/s)": 0.314568 }, { "epoch": 2.019817073170732, "grad_norm": 1.974718451499939, "learning_rate": 6.485756641142005e-05, "loss": 0.17258079051971437, "memory(GiB)": 153.57, "step": 5300, "token_acc": 0.9282920469361148, "train_speed(iter/s)": 0.314397 }, { "epoch": 2.0217225609756095, "grad_norm": 1.48311185836792, "learning_rate": 6.480039708442425e-05, "loss": 0.16814295053482056, "memory(GiB)": 153.57, "step": 5305, "token_acc": 0.9344770566349732, "train_speed(iter/s)": 0.314411 }, { "epoch": 2.0236280487804876, "grad_norm": 0.9516241550445557, "learning_rate": 6.474320654231298e-05, "loss": 0.17844457626342775, "memory(GiB)": 153.57, "step": 5310, "token_acc": 0.9261752893928655, "train_speed(iter/s)": 0.314379 }, { "epoch": 2.0255335365853657, "grad_norm": 1.234339714050293, "learning_rate": 6.468599486706408e-05, "loss": 0.10793913602828979, "memory(GiB)": 153.57, "step": 5315, "token_acc": 0.9474056603773585, "train_speed(iter/s)": 0.314374 }, { "epoch": 2.027439024390244, "grad_norm": 1.008665680885315, "learning_rate": 6.462876214068562e-05, "loss": 0.18806517124176025, "memory(GiB)": 153.57, "step": 5320, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.314226 }, { "epoch": 2.029344512195122, "grad_norm": 1.239043951034546, "learning_rate": 6.457150844521586e-05, "loss": 0.19084135293960572, "memory(GiB)": 153.57, "step": 5325, "token_acc": 0.911776367961935, "train_speed(iter/s)": 0.31424 }, { "epoch": 2.03125, "grad_norm": 1.7995973825454712, "learning_rate": 6.451423386272312e-05, "loss": 0.20435080528259278, "memory(GiB)": 153.57, "step": 5330, "token_acc": 0.91094956837801, "train_speed(iter/s)": 0.314293 }, { "epoch": 2.033155487804878, "grad_norm": 1.289467453956604, "learning_rate": 6.445693847530567e-05, "loss": 0.1891680598258972, "memory(GiB)": 153.57, "step": 5335, "token_acc": 0.9122670807453416, "train_speed(iter/s)": 0.314354 }, { "epoch": 2.035060975609756, "grad_norm": 1.1881511211395264, "learning_rate": 6.43996223650916e-05, "loss": 0.1612159252166748, "memory(GiB)": 153.57, "step": 5340, "token_acc": 0.9280724450194049, "train_speed(iter/s)": 0.314301 }, { "epoch": 2.0369664634146343, "grad_norm": 1.3428404331207275, "learning_rate": 6.434228561423868e-05, "loss": 0.15997674465179443, "memory(GiB)": 153.57, "step": 5345, "token_acc": 0.9408502772643254, "train_speed(iter/s)": 0.31433 }, { "epoch": 2.0388719512195124, "grad_norm": 1.2836415767669678, "learning_rate": 6.42849283049343e-05, "loss": 0.15191922187805176, "memory(GiB)": 153.57, "step": 5350, "token_acc": 0.9383070301291249, "train_speed(iter/s)": 0.31435 }, { "epoch": 2.0407774390243905, "grad_norm": 1.3063247203826904, "learning_rate": 6.422755051939528e-05, "loss": 0.16564474105834961, "memory(GiB)": 153.57, "step": 5355, "token_acc": 0.9269882659713168, "train_speed(iter/s)": 0.314363 }, { "epoch": 2.042682926829268, "grad_norm": 2.148331642150879, "learning_rate": 6.417015233986786e-05, "loss": 0.18067260980606079, "memory(GiB)": 153.57, "step": 5360, "token_acc": 0.920527441197434, "train_speed(iter/s)": 0.314393 }, { "epoch": 2.044588414634146, "grad_norm": 1.4761160612106323, "learning_rate": 6.411273384862744e-05, "loss": 0.15276081562042237, "memory(GiB)": 153.57, "step": 5365, "token_acc": 0.9368095448519664, "train_speed(iter/s)": 0.3144 }, { "epoch": 2.0464939024390243, "grad_norm": 0.9646481871604919, "learning_rate": 6.405529512797857e-05, "loss": 0.19801536798477173, "memory(GiB)": 153.57, "step": 5370, "token_acc": 0.9177526169621876, "train_speed(iter/s)": 0.314431 }, { "epoch": 2.0483993902439024, "grad_norm": 1.8968311548233032, "learning_rate": 6.399783626025477e-05, "loss": 0.17452664375305177, "memory(GiB)": 153.57, "step": 5375, "token_acc": 0.9181426519865965, "train_speed(iter/s)": 0.314471 }, { "epoch": 2.0503048780487805, "grad_norm": 1.3539304733276367, "learning_rate": 6.394035732781847e-05, "loss": 0.1799065351486206, "memory(GiB)": 153.57, "step": 5380, "token_acc": 0.918492784607162, "train_speed(iter/s)": 0.314516 }, { "epoch": 2.0522103658536586, "grad_norm": 1.291258692741394, "learning_rate": 6.388285841306087e-05, "loss": 0.17875136137008668, "memory(GiB)": 153.57, "step": 5385, "token_acc": 0.9041331802525833, "train_speed(iter/s)": 0.314569 }, { "epoch": 2.0541158536585367, "grad_norm": 1.146483063697815, "learning_rate": 6.382533959840177e-05, "loss": 0.18099279403686525, "memory(GiB)": 153.57, "step": 5390, "token_acc": 0.9238985313751669, "train_speed(iter/s)": 0.314575 }, { "epoch": 2.0560213414634148, "grad_norm": 1.5646774768829346, "learning_rate": 6.376780096628955e-05, "loss": 0.18399883508682252, "memory(GiB)": 153.57, "step": 5395, "token_acc": 0.9206072672971628, "train_speed(iter/s)": 0.314595 }, { "epoch": 2.057926829268293, "grad_norm": 1.6051470041275024, "learning_rate": 6.371024259920091e-05, "loss": 0.15968307256698608, "memory(GiB)": 153.57, "step": 5400, "token_acc": 0.9313186813186813, "train_speed(iter/s)": 0.314623 }, { "epoch": 2.059832317073171, "grad_norm": 1.6307401657104492, "learning_rate": 6.365266457964096e-05, "loss": 0.21457891464233397, "memory(GiB)": 153.57, "step": 5405, "token_acc": 0.900792283844299, "train_speed(iter/s)": 0.314655 }, { "epoch": 2.0617378048780486, "grad_norm": 1.346611738204956, "learning_rate": 6.359506699014286e-05, "loss": 0.1663034200668335, "memory(GiB)": 153.57, "step": 5410, "token_acc": 0.9315323707498836, "train_speed(iter/s)": 0.314576 }, { "epoch": 2.0636432926829267, "grad_norm": 1.266196846961975, "learning_rate": 6.35374499132679e-05, "loss": 0.16658827066421508, "memory(GiB)": 153.57, "step": 5415, "token_acc": 0.9343373493975904, "train_speed(iter/s)": 0.314628 }, { "epoch": 2.0655487804878048, "grad_norm": 1.6666911840438843, "learning_rate": 6.347981343160526e-05, "loss": 0.1709218740463257, "memory(GiB)": 153.57, "step": 5420, "token_acc": 0.9361636194607995, "train_speed(iter/s)": 0.314669 }, { "epoch": 2.067454268292683, "grad_norm": 1.304839849472046, "learning_rate": 6.342215762777198e-05, "loss": 0.15231664180755616, "memory(GiB)": 153.57, "step": 5425, "token_acc": 0.9366295264623955, "train_speed(iter/s)": 0.314732 }, { "epoch": 2.069359756097561, "grad_norm": 2.9622371196746826, "learning_rate": 6.336448258441275e-05, "loss": 0.15797747373580934, "memory(GiB)": 153.57, "step": 5430, "token_acc": 0.932599202582115, "train_speed(iter/s)": 0.314653 }, { "epoch": 2.071265243902439, "grad_norm": 1.8688775300979614, "learning_rate": 6.330678838419988e-05, "loss": 0.17306137084960938, "memory(GiB)": 153.57, "step": 5435, "token_acc": 0.9239956568946797, "train_speed(iter/s)": 0.314686 }, { "epoch": 2.073170731707317, "grad_norm": 1.317941665649414, "learning_rate": 6.32490751098331e-05, "loss": 0.21602516174316405, "memory(GiB)": 153.57, "step": 5440, "token_acc": 0.9119213437115936, "train_speed(iter/s)": 0.314691 }, { "epoch": 2.0750762195121952, "grad_norm": 1.033875823020935, "learning_rate": 6.319134284403949e-05, "loss": 0.1780030608177185, "memory(GiB)": 153.57, "step": 5445, "token_acc": 0.9202881152460984, "train_speed(iter/s)": 0.314717 }, { "epoch": 2.0769817073170733, "grad_norm": 0.8095862865447998, "learning_rate": 6.31335916695734e-05, "loss": 0.16998127698898316, "memory(GiB)": 153.57, "step": 5450, "token_acc": 0.9230455194937742, "train_speed(iter/s)": 0.314742 }, { "epoch": 2.0788871951219514, "grad_norm": 1.5567771196365356, "learning_rate": 6.307582166921622e-05, "loss": 0.1708164095878601, "memory(GiB)": 153.57, "step": 5455, "token_acc": 0.9216514954486346, "train_speed(iter/s)": 0.314773 }, { "epoch": 2.080792682926829, "grad_norm": 1.3382314443588257, "learning_rate": 6.301803292577635e-05, "loss": 0.20106110572814942, "memory(GiB)": 153.57, "step": 5460, "token_acc": 0.8970504821327283, "train_speed(iter/s)": 0.314728 }, { "epoch": 2.082698170731707, "grad_norm": 0.7418006062507629, "learning_rate": 6.296022552208906e-05, "loss": 0.16259474754333497, "memory(GiB)": 153.57, "step": 5465, "token_acc": 0.9386252045826514, "train_speed(iter/s)": 0.314754 }, { "epoch": 2.0846036585365852, "grad_norm": 1.6958506107330322, "learning_rate": 6.290239954101638e-05, "loss": 0.21281774044036866, "memory(GiB)": 153.57, "step": 5470, "token_acc": 0.9249779346866726, "train_speed(iter/s)": 0.314773 }, { "epoch": 2.0865091463414633, "grad_norm": 1.815185785293579, "learning_rate": 6.284455506544694e-05, "loss": 0.15813546180725097, "memory(GiB)": 153.57, "step": 5475, "token_acc": 0.9308801591248135, "train_speed(iter/s)": 0.314833 }, { "epoch": 2.0884146341463414, "grad_norm": 2.1088273525238037, "learning_rate": 6.27866921782959e-05, "loss": 0.1700990915298462, "memory(GiB)": 153.57, "step": 5480, "token_acc": 0.9294225481209899, "train_speed(iter/s)": 0.314719 }, { "epoch": 2.0903201219512195, "grad_norm": 1.0138131380081177, "learning_rate": 6.272881096250482e-05, "loss": 0.25276823043823243, "memory(GiB)": 153.57, "step": 5485, "token_acc": 0.9135626448048476, "train_speed(iter/s)": 0.314732 }, { "epoch": 2.0922256097560976, "grad_norm": 1.4412651062011719, "learning_rate": 6.26709115010415e-05, "loss": 0.18063662052154542, "memory(GiB)": 153.57, "step": 5490, "token_acc": 0.9169950738916256, "train_speed(iter/s)": 0.314681 }, { "epoch": 2.0941310975609757, "grad_norm": 1.3137131929397583, "learning_rate": 6.261299387689992e-05, "loss": 0.14013843536376952, "memory(GiB)": 153.57, "step": 5495, "token_acc": 0.9445193171608266, "train_speed(iter/s)": 0.314693 }, { "epoch": 2.096036585365854, "grad_norm": 0.792199969291687, "learning_rate": 6.255505817310009e-05, "loss": 0.16628425121307372, "memory(GiB)": 153.57, "step": 5500, "token_acc": 0.9312027707808564, "train_speed(iter/s)": 0.314683 }, { "epoch": 2.097942073170732, "grad_norm": 1.2689191102981567, "learning_rate": 6.249710447268793e-05, "loss": 0.16478928327560424, "memory(GiB)": 153.57, "step": 5505, "token_acc": 0.9337029337029337, "train_speed(iter/s)": 0.314699 }, { "epoch": 2.0998475609756095, "grad_norm": 0.9555128812789917, "learning_rate": 6.243913285873517e-05, "loss": 0.17856279611587525, "memory(GiB)": 153.57, "step": 5510, "token_acc": 0.9311356334683615, "train_speed(iter/s)": 0.314698 }, { "epoch": 2.1017530487804876, "grad_norm": 1.394572138786316, "learning_rate": 6.23811434143392e-05, "loss": 0.1358415126800537, "memory(GiB)": 153.57, "step": 5515, "token_acc": 0.9333938843475629, "train_speed(iter/s)": 0.314731 }, { "epoch": 2.1036585365853657, "grad_norm": 1.1626840829849243, "learning_rate": 6.232313622262296e-05, "loss": 0.1514343023300171, "memory(GiB)": 153.57, "step": 5520, "token_acc": 0.9350736278447122, "train_speed(iter/s)": 0.314786 }, { "epoch": 2.105564024390244, "grad_norm": 1.1866151094436646, "learning_rate": 6.226511136673487e-05, "loss": 0.16530239582061768, "memory(GiB)": 153.57, "step": 5525, "token_acc": 0.922105828632823, "train_speed(iter/s)": 0.314819 }, { "epoch": 2.107469512195122, "grad_norm": 0.6071298718452454, "learning_rate": 6.220706892984865e-05, "loss": 0.15574573278427123, "memory(GiB)": 153.57, "step": 5530, "token_acc": 0.9379107183322003, "train_speed(iter/s)": 0.314838 }, { "epoch": 2.109375, "grad_norm": 1.0221107006072998, "learning_rate": 6.21490089951632e-05, "loss": 0.19233291149139403, "memory(GiB)": 153.57, "step": 5535, "token_acc": 0.9156129741616272, "train_speed(iter/s)": 0.314851 }, { "epoch": 2.111280487804878, "grad_norm": 1.705976128578186, "learning_rate": 6.209093164590252e-05, "loss": 0.18901797533035278, "memory(GiB)": 153.57, "step": 5540, "token_acc": 0.9262899262899262, "train_speed(iter/s)": 0.314788 }, { "epoch": 2.113185975609756, "grad_norm": 1.0073857307434082, "learning_rate": 6.203283696531558e-05, "loss": 0.15541337728500365, "memory(GiB)": 153.57, "step": 5545, "token_acc": 0.9357126515671471, "train_speed(iter/s)": 0.314802 }, { "epoch": 2.1150914634146343, "grad_norm": 1.2715319395065308, "learning_rate": 6.197472503667616e-05, "loss": 0.20190751552581787, "memory(GiB)": 153.57, "step": 5550, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 0.314818 }, { "epoch": 2.1169969512195124, "grad_norm": 1.0465279817581177, "learning_rate": 6.191659594328281e-05, "loss": 0.18852912187576293, "memory(GiB)": 153.57, "step": 5555, "token_acc": 0.9226833428084139, "train_speed(iter/s)": 0.314859 }, { "epoch": 2.1189024390243905, "grad_norm": 0.8566867113113403, "learning_rate": 6.185844976845866e-05, "loss": 0.15741174221038817, "memory(GiB)": 153.57, "step": 5560, "token_acc": 0.9383995394358089, "train_speed(iter/s)": 0.314858 }, { "epoch": 2.120807926829268, "grad_norm": 2.2001729011535645, "learning_rate": 6.180028659555131e-05, "loss": 0.1974867820739746, "memory(GiB)": 153.57, "step": 5565, "token_acc": 0.9188175601147143, "train_speed(iter/s)": 0.314874 }, { "epoch": 2.122713414634146, "grad_norm": 1.0676491260528564, "learning_rate": 6.174210650793276e-05, "loss": 0.16295723915100097, "memory(GiB)": 153.57, "step": 5570, "token_acc": 0.92835873182694, "train_speed(iter/s)": 0.314877 }, { "epoch": 2.1246189024390243, "grad_norm": 1.270781397819519, "learning_rate": 6.168390958899921e-05, "loss": 0.22009127140045165, "memory(GiB)": 153.57, "step": 5575, "token_acc": 0.9111969111969112, "train_speed(iter/s)": 0.314895 }, { "epoch": 2.1265243902439024, "grad_norm": 0.45429515838623047, "learning_rate": 6.162569592217105e-05, "loss": 0.20597367286682128, "memory(GiB)": 153.57, "step": 5580, "token_acc": 0.9055155875299761, "train_speed(iter/s)": 0.314924 }, { "epoch": 2.1284298780487805, "grad_norm": 0.6279653906822205, "learning_rate": 6.156746559089261e-05, "loss": 0.1385067105293274, "memory(GiB)": 153.57, "step": 5585, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.31497 }, { "epoch": 2.1303353658536586, "grad_norm": 1.0849207639694214, "learning_rate": 6.150921867863215e-05, "loss": 0.2303936719894409, "memory(GiB)": 153.57, "step": 5590, "token_acc": 0.9079266946345772, "train_speed(iter/s)": 0.314896 }, { "epoch": 2.1322408536585367, "grad_norm": 0.788948118686676, "learning_rate": 6.14509552688817e-05, "loss": 0.15240522623062133, "memory(GiB)": 153.57, "step": 5595, "token_acc": 0.9367676321025867, "train_speed(iter/s)": 0.31491 }, { "epoch": 2.1341463414634148, "grad_norm": 1.2646316289901733, "learning_rate": 6.139267544515689e-05, "loss": 0.19808260202407837, "memory(GiB)": 153.57, "step": 5600, "token_acc": 0.9283400809716599, "train_speed(iter/s)": 0.314947 }, { "epoch": 2.1341463414634148, "eval_loss": 0.17648139595985413, "eval_runtime": 33.034, "eval_samples_per_second": 3.209, "eval_steps_per_second": 3.209, "eval_token_acc": 0.9107870253960272, "step": 5600 }, { "epoch": 2.136051829268293, "grad_norm": 0.9761647582054138, "learning_rate": 6.133437929099692e-05, "loss": 0.1874315023422241, "memory(GiB)": 153.57, "step": 5605, "token_acc": 0.911203351695244, "train_speed(iter/s)": 0.314378 }, { "epoch": 2.137957317073171, "grad_norm": 1.5713189840316772, "learning_rate": 6.127606688996441e-05, "loss": 0.17547584772109986, "memory(GiB)": 153.57, "step": 5610, "token_acc": 0.9285481239804242, "train_speed(iter/s)": 0.314409 }, { "epoch": 2.1398628048780486, "grad_norm": 1.6881431341171265, "learning_rate": 6.121773832564522e-05, "loss": 0.21903984546661376, "memory(GiB)": 153.57, "step": 5615, "token_acc": 0.9157678479712378, "train_speed(iter/s)": 0.314443 }, { "epoch": 2.1417682926829267, "grad_norm": 1.088974118232727, "learning_rate": 6.115939368164841e-05, "loss": 0.17022570371627807, "memory(GiB)": 153.57, "step": 5620, "token_acc": 0.9287443791075752, "train_speed(iter/s)": 0.31444 }, { "epoch": 2.1436737804878048, "grad_norm": 1.3210487365722656, "learning_rate": 6.110103304160611e-05, "loss": 0.19628796577453614, "memory(GiB)": 153.57, "step": 5625, "token_acc": 0.9100719424460432, "train_speed(iter/s)": 0.314409 }, { "epoch": 2.145579268292683, "grad_norm": 1.095274806022644, "learning_rate": 6.104265648917333e-05, "loss": 0.13705213069915773, "memory(GiB)": 153.57, "step": 5630, "token_acc": 0.939381693271368, "train_speed(iter/s)": 0.31442 }, { "epoch": 2.147484756097561, "grad_norm": 1.6271371841430664, "learning_rate": 6.098426410802791e-05, "loss": 0.2084033727645874, "memory(GiB)": 153.57, "step": 5635, "token_acc": 0.9095946340040828, "train_speed(iter/s)": 0.31445 }, { "epoch": 2.149390243902439, "grad_norm": 1.3318315744400024, "learning_rate": 6.09258559818704e-05, "loss": 0.1933051586151123, "memory(GiB)": 153.57, "step": 5640, "token_acc": 0.9192653529749378, "train_speed(iter/s)": 0.314464 }, { "epoch": 2.151295731707317, "grad_norm": 0.9334048628807068, "learning_rate": 6.086743219442388e-05, "loss": 0.1699568510055542, "memory(GiB)": 153.57, "step": 5645, "token_acc": 0.9303724265556327, "train_speed(iter/s)": 0.314409 }, { "epoch": 2.1532012195121952, "grad_norm": 1.813536524772644, "learning_rate": 6.080899282943391e-05, "loss": 0.19650890827178955, "memory(GiB)": 153.57, "step": 5650, "token_acc": 0.9231722428748451, "train_speed(iter/s)": 0.314447 }, { "epoch": 2.1551067073170733, "grad_norm": 0.8360101580619812, "learning_rate": 6.075053797066838e-05, "loss": 0.1626096248626709, "memory(GiB)": 153.57, "step": 5655, "token_acc": 0.9296338672768879, "train_speed(iter/s)": 0.314463 }, { "epoch": 2.1570121951219514, "grad_norm": 1.6042349338531494, "learning_rate": 6.069206770191736e-05, "loss": 0.21857218742370604, "memory(GiB)": 153.57, "step": 5660, "token_acc": 0.9097412160740584, "train_speed(iter/s)": 0.314473 }, { "epoch": 2.158917682926829, "grad_norm": 1.4355485439300537, "learning_rate": 6.063358210699305e-05, "loss": 0.19125471115112305, "memory(GiB)": 153.57, "step": 5665, "token_acc": 0.9294271400566325, "train_speed(iter/s)": 0.314491 }, { "epoch": 2.160823170731707, "grad_norm": 1.6613630056381226, "learning_rate": 6.057508126972956e-05, "loss": 0.15807559490203857, "memory(GiB)": 153.57, "step": 5670, "token_acc": 0.9327641408751334, "train_speed(iter/s)": 0.314528 }, { "epoch": 2.1627286585365852, "grad_norm": 2.4135687351226807, "learning_rate": 6.051656527398293e-05, "loss": 0.2572104215621948, "memory(GiB)": 153.57, "step": 5675, "token_acc": 0.8959753683747526, "train_speed(iter/s)": 0.314562 }, { "epoch": 2.1646341463414633, "grad_norm": 1.4240607023239136, "learning_rate": 6.045803420363084e-05, "loss": 0.17834664583206178, "memory(GiB)": 153.57, "step": 5680, "token_acc": 0.9271334792122539, "train_speed(iter/s)": 0.314591 }, { "epoch": 2.1665396341463414, "grad_norm": 0.8282479047775269, "learning_rate": 6.039948814257266e-05, "loss": 0.1818472623825073, "memory(GiB)": 153.57, "step": 5685, "token_acc": 0.9220454545454545, "train_speed(iter/s)": 0.31462 }, { "epoch": 2.1684451219512195, "grad_norm": 1.1489776372909546, "learning_rate": 6.0340927174729166e-05, "loss": 0.13027355670928956, "memory(GiB)": 153.57, "step": 5690, "token_acc": 0.9451843273499324, "train_speed(iter/s)": 0.314631 }, { "epoch": 2.1703506097560976, "grad_norm": 1.7379004955291748, "learning_rate": 6.028235138404258e-05, "loss": 0.1472163200378418, "memory(GiB)": 153.57, "step": 5695, "token_acc": 0.9358752166377816, "train_speed(iter/s)": 0.314607 }, { "epoch": 2.1722560975609757, "grad_norm": 1.2138713598251343, "learning_rate": 6.022376085447632e-05, "loss": 0.17384756803512574, "memory(GiB)": 153.57, "step": 5700, "token_acc": 0.9320102432778489, "train_speed(iter/s)": 0.314533 }, { "epoch": 2.174161585365854, "grad_norm": 0.6127244234085083, "learning_rate": 6.016515567001495e-05, "loss": 0.1801366925239563, "memory(GiB)": 153.57, "step": 5705, "token_acc": 0.9223050022806751, "train_speed(iter/s)": 0.314543 }, { "epoch": 2.176067073170732, "grad_norm": 1.0727810859680176, "learning_rate": 6.010653591466403e-05, "loss": 0.17679212093353272, "memory(GiB)": 153.57, "step": 5710, "token_acc": 0.9274028629856851, "train_speed(iter/s)": 0.314464 }, { "epoch": 2.1779725609756095, "grad_norm": 1.3103644847869873, "learning_rate": 6.004790167245004e-05, "loss": 0.1827747106552124, "memory(GiB)": 153.57, "step": 5715, "token_acc": 0.9263873159682899, "train_speed(iter/s)": 0.314399 }, { "epoch": 2.1798780487804876, "grad_norm": 1.145224690437317, "learning_rate": 5.998925302742017e-05, "loss": 0.19745360612869262, "memory(GiB)": 153.57, "step": 5720, "token_acc": 0.913539609392144, "train_speed(iter/s)": 0.31435 }, { "epoch": 2.1817835365853657, "grad_norm": 1.3625850677490234, "learning_rate": 5.9930590063642286e-05, "loss": 0.1880973219871521, "memory(GiB)": 153.57, "step": 5725, "token_acc": 0.9043412417970722, "train_speed(iter/s)": 0.314388 }, { "epoch": 2.183689024390244, "grad_norm": 2.301525592803955, "learning_rate": 5.987191286520479e-05, "loss": 0.1962212324142456, "memory(GiB)": 153.57, "step": 5730, "token_acc": 0.9263252470799641, "train_speed(iter/s)": 0.314332 }, { "epoch": 2.185594512195122, "grad_norm": 1.2715367078781128, "learning_rate": 5.981322151621648e-05, "loss": 0.1392587661743164, "memory(GiB)": 153.57, "step": 5735, "token_acc": 0.9357499457347515, "train_speed(iter/s)": 0.314271 }, { "epoch": 2.1875, "grad_norm": 1.0287173986434937, "learning_rate": 5.9754516100806423e-05, "loss": 0.17946956157684327, "memory(GiB)": 153.57, "step": 5740, "token_acc": 0.917885703140238, "train_speed(iter/s)": 0.314281 }, { "epoch": 2.189405487804878, "grad_norm": 1.07825767993927, "learning_rate": 5.969579670312385e-05, "loss": 0.18585443496704102, "memory(GiB)": 153.57, "step": 5745, "token_acc": 0.9215851602023609, "train_speed(iter/s)": 0.314226 }, { "epoch": 2.191310975609756, "grad_norm": 1.5267311334609985, "learning_rate": 5.963706340733807e-05, "loss": 0.17777930498123168, "memory(GiB)": 153.57, "step": 5750, "token_acc": 0.9226519337016574, "train_speed(iter/s)": 0.314264 }, { "epoch": 2.1932164634146343, "grad_norm": 1.4951726198196411, "learning_rate": 5.9578316297638246e-05, "loss": 0.15798747539520264, "memory(GiB)": 153.57, "step": 5755, "token_acc": 0.9177688326624497, "train_speed(iter/s)": 0.314146 }, { "epoch": 2.1951219512195124, "grad_norm": 1.159936785697937, "learning_rate": 5.951955545823342e-05, "loss": 0.20601356029510498, "memory(GiB)": 153.57, "step": 5760, "token_acc": 0.9158080659405358, "train_speed(iter/s)": 0.31414 }, { "epoch": 2.1970274390243905, "grad_norm": 1.9048447608947754, "learning_rate": 5.946078097335227e-05, "loss": 0.18821322917938232, "memory(GiB)": 153.57, "step": 5765, "token_acc": 0.9234726688102894, "train_speed(iter/s)": 0.314153 }, { "epoch": 2.198932926829268, "grad_norm": 0.426616907119751, "learning_rate": 5.940199292724303e-05, "loss": 0.19881619215011598, "memory(GiB)": 153.57, "step": 5770, "token_acc": 0.9188585607940447, "train_speed(iter/s)": 0.314177 }, { "epoch": 2.200838414634146, "grad_norm": 1.1009455919265747, "learning_rate": 5.934319140417339e-05, "loss": 0.18399029970169067, "memory(GiB)": 153.57, "step": 5775, "token_acc": 0.9234318493577753, "train_speed(iter/s)": 0.314156 }, { "epoch": 2.2027439024390243, "grad_norm": 1.2577377557754517, "learning_rate": 5.928437648843036e-05, "loss": 0.1545989394187927, "memory(GiB)": 153.57, "step": 5780, "token_acc": 0.9336139896373057, "train_speed(iter/s)": 0.314199 }, { "epoch": 2.2046493902439024, "grad_norm": 1.2724330425262451, "learning_rate": 5.922554826432013e-05, "loss": 0.1570546507835388, "memory(GiB)": 153.57, "step": 5785, "token_acc": 0.9336265493802479, "train_speed(iter/s)": 0.314214 }, { "epoch": 2.2065548780487805, "grad_norm": 1.7591174840927124, "learning_rate": 5.9166706816167975e-05, "loss": 0.15478999614715577, "memory(GiB)": 153.57, "step": 5790, "token_acc": 0.9283741484072915, "train_speed(iter/s)": 0.314224 }, { "epoch": 2.2084603658536586, "grad_norm": 0.8813377022743225, "learning_rate": 5.9107852228318116e-05, "loss": 0.1557974100112915, "memory(GiB)": 153.57, "step": 5795, "token_acc": 0.9369447453954496, "train_speed(iter/s)": 0.314181 }, { "epoch": 2.2103658536585367, "grad_norm": 1.1828527450561523, "learning_rate": 5.9048984585133646e-05, "loss": 0.22268567085266114, "memory(GiB)": 153.57, "step": 5800, "token_acc": 0.9118449781659389, "train_speed(iter/s)": 0.314206 }, { "epoch": 2.2122713414634148, "grad_norm": 1.8522746562957764, "learning_rate": 5.899010397099634e-05, "loss": 0.13291914463043214, "memory(GiB)": 153.57, "step": 5805, "token_acc": 0.9496442255062945, "train_speed(iter/s)": 0.314245 }, { "epoch": 2.214176829268293, "grad_norm": 1.05617094039917, "learning_rate": 5.893121047030654e-05, "loss": 0.20280637741088867, "memory(GiB)": 153.57, "step": 5810, "token_acc": 0.9112483745123537, "train_speed(iter/s)": 0.314279 }, { "epoch": 2.216082317073171, "grad_norm": 2.5636260509490967, "learning_rate": 5.887230416748312e-05, "loss": 0.15457890033721924, "memory(GiB)": 153.57, "step": 5815, "token_acc": 0.9328739719937764, "train_speed(iter/s)": 0.314315 }, { "epoch": 2.2179878048780486, "grad_norm": 1.5051605701446533, "learning_rate": 5.881338514696326e-05, "loss": 0.18503841161727905, "memory(GiB)": 153.57, "step": 5820, "token_acc": 0.9229678638941399, "train_speed(iter/s)": 0.314296 }, { "epoch": 2.2198932926829267, "grad_norm": 0.8652966618537903, "learning_rate": 5.875445349320241e-05, "loss": 0.1281900644302368, "memory(GiB)": 153.57, "step": 5825, "token_acc": 0.9381114903299204, "train_speed(iter/s)": 0.314314 }, { "epoch": 2.2217987804878048, "grad_norm": 1.0562940835952759, "learning_rate": 5.8695509290674066e-05, "loss": 0.19743187427520753, "memory(GiB)": 153.57, "step": 5830, "token_acc": 0.8979172023656466, "train_speed(iter/s)": 0.314358 }, { "epoch": 2.223704268292683, "grad_norm": 1.4574158191680908, "learning_rate": 5.863655262386978e-05, "loss": 0.23212857246398927, "memory(GiB)": 153.57, "step": 5835, "token_acc": 0.9116077384923282, "train_speed(iter/s)": 0.31436 }, { "epoch": 2.225609756097561, "grad_norm": 1.415425419807434, "learning_rate": 5.8577583577298924e-05, "loss": 0.1837940216064453, "memory(GiB)": 153.57, "step": 5840, "token_acc": 0.9189034837235865, "train_speed(iter/s)": 0.314332 }, { "epoch": 2.227515243902439, "grad_norm": 1.4995334148406982, "learning_rate": 5.851860223548863e-05, "loss": 0.16874070167541505, "memory(GiB)": 153.57, "step": 5845, "token_acc": 0.9334730225248822, "train_speed(iter/s)": 0.314285 }, { "epoch": 2.229420731707317, "grad_norm": 1.5051698684692383, "learning_rate": 5.845960868298366e-05, "loss": 0.16483454704284667, "memory(GiB)": 153.57, "step": 5850, "token_acc": 0.9291311754684838, "train_speed(iter/s)": 0.31432 }, { "epoch": 2.2313262195121952, "grad_norm": 1.5405550003051758, "learning_rate": 5.840060300434627e-05, "loss": 0.19346989393234254, "memory(GiB)": 153.57, "step": 5855, "token_acc": 0.9137594194808819, "train_speed(iter/s)": 0.314351 }, { "epoch": 2.2332317073170733, "grad_norm": 1.466912865638733, "learning_rate": 5.834158528415611e-05, "loss": 0.1726980686187744, "memory(GiB)": 153.57, "step": 5860, "token_acc": 0.9239592183517417, "train_speed(iter/s)": 0.314274 }, { "epoch": 2.2351371951219514, "grad_norm": 1.4280897378921509, "learning_rate": 5.828255560701008e-05, "loss": 0.1825254440307617, "memory(GiB)": 153.57, "step": 5865, "token_acc": 0.9239594450373533, "train_speed(iter/s)": 0.314297 }, { "epoch": 2.237042682926829, "grad_norm": 1.438656210899353, "learning_rate": 5.822351405752221e-05, "loss": 0.19941933155059816, "memory(GiB)": 153.57, "step": 5870, "token_acc": 0.9220721389342438, "train_speed(iter/s)": 0.314294 }, { "epoch": 2.238948170731707, "grad_norm": 1.7109620571136475, "learning_rate": 5.8164460720323567e-05, "loss": 0.18719930648803712, "memory(GiB)": 153.57, "step": 5875, "token_acc": 0.9196283391405342, "train_speed(iter/s)": 0.314324 }, { "epoch": 2.2408536585365852, "grad_norm": 1.6517454385757446, "learning_rate": 5.810539568006213e-05, "loss": 0.15343265533447265, "memory(GiB)": 153.57, "step": 5880, "token_acc": 0.942252948352989, "train_speed(iter/s)": 0.314365 }, { "epoch": 2.2427591463414633, "grad_norm": 1.2404829263687134, "learning_rate": 5.80463190214026e-05, "loss": 0.18904366493225097, "memory(GiB)": 153.57, "step": 5885, "token_acc": 0.9327463927610663, "train_speed(iter/s)": 0.314317 }, { "epoch": 2.2446646341463414, "grad_norm": 1.3504488468170166, "learning_rate": 5.798723082902636e-05, "loss": 0.13870553970336913, "memory(GiB)": 153.57, "step": 5890, "token_acc": 0.9362599206349206, "train_speed(iter/s)": 0.314345 }, { "epoch": 2.2465701219512195, "grad_norm": 0.754664957523346, "learning_rate": 5.792813118763134e-05, "loss": 0.1486583471298218, "memory(GiB)": 153.57, "step": 5895, "token_acc": 0.926865356161488, "train_speed(iter/s)": 0.314362 }, { "epoch": 2.2484756097560976, "grad_norm": 1.1436866521835327, "learning_rate": 5.786902018193189e-05, "loss": 0.14257489442825316, "memory(GiB)": 153.57, "step": 5900, "token_acc": 0.946414499605989, "train_speed(iter/s)": 0.314387 }, { "epoch": 2.2503810975609757, "grad_norm": 1.169019103050232, "learning_rate": 5.780989789665859e-05, "loss": 0.17516165971755981, "memory(GiB)": 153.57, "step": 5905, "token_acc": 0.9276126558005753, "train_speed(iter/s)": 0.314224 }, { "epoch": 2.252286585365854, "grad_norm": 1.168870449066162, "learning_rate": 5.7750764416558265e-05, "loss": 0.16664706468582152, "memory(GiB)": 153.57, "step": 5910, "token_acc": 0.9277336360196746, "train_speed(iter/s)": 0.314127 }, { "epoch": 2.254192073170732, "grad_norm": 1.5235249996185303, "learning_rate": 5.7691619826393716e-05, "loss": 0.17444767951965331, "memory(GiB)": 153.57, "step": 5915, "token_acc": 0.9124899274778404, "train_speed(iter/s)": 0.314135 }, { "epoch": 2.2560975609756095, "grad_norm": 1.4414076805114746, "learning_rate": 5.7632464210943726e-05, "loss": 0.23115382194519044, "memory(GiB)": 153.57, "step": 5920, "token_acc": 0.9054774600096946, "train_speed(iter/s)": 0.314161 }, { "epoch": 2.2580030487804876, "grad_norm": 1.46425461769104, "learning_rate": 5.757329765500286e-05, "loss": 0.1687697410583496, "memory(GiB)": 153.57, "step": 5925, "token_acc": 0.9319781078967944, "train_speed(iter/s)": 0.314135 }, { "epoch": 2.2599085365853657, "grad_norm": 0.8292970061302185, "learning_rate": 5.7514120243381345e-05, "loss": 0.16992748975753785, "memory(GiB)": 153.57, "step": 5930, "token_acc": 0.9372738238841978, "train_speed(iter/s)": 0.314075 }, { "epoch": 2.261814024390244, "grad_norm": 1.2588436603546143, "learning_rate": 5.7454932060904974e-05, "loss": 0.1716240406036377, "memory(GiB)": 153.57, "step": 5935, "token_acc": 0.925011327594019, "train_speed(iter/s)": 0.314102 }, { "epoch": 2.263719512195122, "grad_norm": 1.3888020515441895, "learning_rate": 5.739573319241505e-05, "loss": 0.18770105838775636, "memory(GiB)": 153.57, "step": 5940, "token_acc": 0.9217185028993147, "train_speed(iter/s)": 0.314121 }, { "epoch": 2.265625, "grad_norm": 1.1928375959396362, "learning_rate": 5.733652372276809e-05, "loss": 0.17934473752975463, "memory(GiB)": 153.57, "step": 5945, "token_acc": 0.9191691126665161, "train_speed(iter/s)": 0.314137 }, { "epoch": 2.267530487804878, "grad_norm": 0.8815988898277283, "learning_rate": 5.727730373683586e-05, "loss": 0.19314682483673096, "memory(GiB)": 153.57, "step": 5950, "token_acc": 0.9241367579520327, "train_speed(iter/s)": 0.314132 }, { "epoch": 2.269435975609756, "grad_norm": 1.6233898401260376, "learning_rate": 5.721807331950522e-05, "loss": 0.15273300409317017, "memory(GiB)": 153.57, "step": 5955, "token_acc": 0.9351528384279476, "train_speed(iter/s)": 0.314139 }, { "epoch": 2.2713414634146343, "grad_norm": 1.4367362260818481, "learning_rate": 5.71588325556779e-05, "loss": 0.1859726905822754, "memory(GiB)": 153.57, "step": 5960, "token_acc": 0.916005291005291, "train_speed(iter/s)": 0.314034 }, { "epoch": 2.2732469512195124, "grad_norm": 0.9408221244812012, "learning_rate": 5.7099581530270576e-05, "loss": 0.15213967561721803, "memory(GiB)": 153.57, "step": 5965, "token_acc": 0.9347046161205453, "train_speed(iter/s)": 0.314051 }, { "epoch": 2.2751524390243905, "grad_norm": 1.3369593620300293, "learning_rate": 5.704032032821454e-05, "loss": 0.18680355548858643, "memory(GiB)": 153.57, "step": 5970, "token_acc": 0.923744068073965, "train_speed(iter/s)": 0.314037 }, { "epoch": 2.277057926829268, "grad_norm": 1.1216751337051392, "learning_rate": 5.698104903445571e-05, "loss": 0.18230631351470947, "memory(GiB)": 153.57, "step": 5975, "token_acc": 0.9183673469387755, "train_speed(iter/s)": 0.314001 }, { "epoch": 2.278963414634146, "grad_norm": 2.385309934616089, "learning_rate": 5.692176773395446e-05, "loss": 0.16140669584274292, "memory(GiB)": 153.57, "step": 5980, "token_acc": 0.933636955107352, "train_speed(iter/s)": 0.313941 }, { "epoch": 2.2808689024390243, "grad_norm": 2.125633478164673, "learning_rate": 5.686247651168554e-05, "loss": 0.14897223711013793, "memory(GiB)": 153.57, "step": 5985, "token_acc": 0.9324184360435008, "train_speed(iter/s)": 0.31388 }, { "epoch": 2.2827743902439024, "grad_norm": 1.0325595140457153, "learning_rate": 5.6803175452637856e-05, "loss": 0.18259016275405884, "memory(GiB)": 153.57, "step": 5990, "token_acc": 0.9248915401301518, "train_speed(iter/s)": 0.313838 }, { "epoch": 2.2846798780487805, "grad_norm": 1.7365566492080688, "learning_rate": 5.6743864641814495e-05, "loss": 0.1669189453125, "memory(GiB)": 153.57, "step": 5995, "token_acc": 0.9254249046132501, "train_speed(iter/s)": 0.313863 }, { "epoch": 2.2865853658536586, "grad_norm": 1.2795321941375732, "learning_rate": 5.668454416423242e-05, "loss": 0.2014054536819458, "memory(GiB)": 153.57, "step": 6000, "token_acc": 0.919831223628692, "train_speed(iter/s)": 0.313901 }, { "epoch": 2.2884908536585367, "grad_norm": 1.5478732585906982, "learning_rate": 5.6625214104922574e-05, "loss": 0.16953421831130983, "memory(GiB)": 153.57, "step": 6005, "token_acc": 0.9273084479371316, "train_speed(iter/s)": 0.313937 }, { "epoch": 2.2903963414634148, "grad_norm": 1.5084577798843384, "learning_rate": 5.656587454892954e-05, "loss": 0.1759919285774231, "memory(GiB)": 153.57, "step": 6010, "token_acc": 0.9274340309372157, "train_speed(iter/s)": 0.313949 }, { "epoch": 2.292301829268293, "grad_norm": 1.1933704614639282, "learning_rate": 5.650652558131154e-05, "loss": 0.2113962173461914, "memory(GiB)": 153.57, "step": 6015, "token_acc": 0.9268166537367468, "train_speed(iter/s)": 0.313909 }, { "epoch": 2.2942073170731705, "grad_norm": 1.342742681503296, "learning_rate": 5.64471672871403e-05, "loss": 0.13992347717285156, "memory(GiB)": 153.57, "step": 6020, "token_acc": 0.9456521739130435, "train_speed(iter/s)": 0.313942 }, { "epoch": 2.2961128048780486, "grad_norm": 0.8925923705101013, "learning_rate": 5.638779975150091e-05, "loss": 0.2190507173538208, "memory(GiB)": 153.57, "step": 6025, "token_acc": 0.9217391304347826, "train_speed(iter/s)": 0.313856 }, { "epoch": 2.2980182926829267, "grad_norm": 1.097330927848816, "learning_rate": 5.632842305949171e-05, "loss": 0.1524062156677246, "memory(GiB)": 153.57, "step": 6030, "token_acc": 0.9329446064139941, "train_speed(iter/s)": 0.313885 }, { "epoch": 2.2999237804878048, "grad_norm": 1.2596807479858398, "learning_rate": 5.626903729622414e-05, "loss": 0.1386427879333496, "memory(GiB)": 153.57, "step": 6035, "token_acc": 0.9422382671480144, "train_speed(iter/s)": 0.313887 }, { "epoch": 2.301829268292683, "grad_norm": 0.910976767539978, "learning_rate": 5.620964254682266e-05, "loss": 0.20214571952819824, "memory(GiB)": 153.57, "step": 6040, "token_acc": 0.925845147219193, "train_speed(iter/s)": 0.313905 }, { "epoch": 2.303734756097561, "grad_norm": 1.2000677585601807, "learning_rate": 5.6150238896424625e-05, "loss": 0.16337941884994506, "memory(GiB)": 153.57, "step": 6045, "token_acc": 0.9238204833141542, "train_speed(iter/s)": 0.313937 }, { "epoch": 2.305640243902439, "grad_norm": 1.156322956085205, "learning_rate": 5.6090826430180136e-05, "loss": 0.15049439668655396, "memory(GiB)": 153.57, "step": 6050, "token_acc": 0.9380210221094599, "train_speed(iter/s)": 0.313984 }, { "epoch": 2.307545731707317, "grad_norm": 1.2156561613082886, "learning_rate": 5.603140523325192e-05, "loss": 0.16315839290618897, "memory(GiB)": 153.57, "step": 6055, "token_acc": 0.9229850746268656, "train_speed(iter/s)": 0.314026 }, { "epoch": 2.3094512195121952, "grad_norm": 1.2301154136657715, "learning_rate": 5.597197539081523e-05, "loss": 0.20517218112945557, "memory(GiB)": 153.57, "step": 6060, "token_acc": 0.9173179118474524, "train_speed(iter/s)": 0.314019 }, { "epoch": 2.3113567073170733, "grad_norm": 0.7136746048927307, "learning_rate": 5.591253698805773e-05, "loss": 0.1355486035346985, "memory(GiB)": 153.57, "step": 6065, "token_acc": 0.9288728149487643, "train_speed(iter/s)": 0.31405 }, { "epoch": 2.3132621951219514, "grad_norm": 1.370450496673584, "learning_rate": 5.585309011017931e-05, "loss": 0.17020517587661743, "memory(GiB)": 153.57, "step": 6070, "token_acc": 0.9305555555555556, "train_speed(iter/s)": 0.314071 }, { "epoch": 2.3151676829268295, "grad_norm": 1.6310523748397827, "learning_rate": 5.5793634842392054e-05, "loss": 0.18251502513885498, "memory(GiB)": 153.57, "step": 6075, "token_acc": 0.9299941417691857, "train_speed(iter/s)": 0.314107 }, { "epoch": 2.317073170731707, "grad_norm": 0.2736798822879791, "learning_rate": 5.573417126992003e-05, "loss": 0.13768789768218995, "memory(GiB)": 153.57, "step": 6080, "token_acc": 0.9386706948640483, "train_speed(iter/s)": 0.314138 }, { "epoch": 2.3189786585365852, "grad_norm": 0.4979095458984375, "learning_rate": 5.567469947799924e-05, "loss": 0.19576566219329833, "memory(GiB)": 153.57, "step": 6085, "token_acc": 0.9091122042632935, "train_speed(iter/s)": 0.314148 }, { "epoch": 2.3208841463414633, "grad_norm": 1.242283821105957, "learning_rate": 5.5615219551877474e-05, "loss": 0.2572754383087158, "memory(GiB)": 153.57, "step": 6090, "token_acc": 0.8941659337437702, "train_speed(iter/s)": 0.314162 }, { "epoch": 2.3227896341463414, "grad_norm": 1.376664400100708, "learning_rate": 5.555573157681415e-05, "loss": 0.1721290111541748, "memory(GiB)": 153.57, "step": 6095, "token_acc": 0.930957184487783, "train_speed(iter/s)": 0.314184 }, { "epoch": 2.3246951219512195, "grad_norm": 1.3975622653961182, "learning_rate": 5.5496235638080254e-05, "loss": 0.18094792366027831, "memory(GiB)": 153.57, "step": 6100, "token_acc": 0.9199553239017125, "train_speed(iter/s)": 0.314215 }, { "epoch": 2.3266006097560976, "grad_norm": 2.118154764175415, "learning_rate": 5.543673182095815e-05, "loss": 0.17381689548492432, "memory(GiB)": 153.57, "step": 6105, "token_acc": 0.9253424657534246, "train_speed(iter/s)": 0.314228 }, { "epoch": 2.3285060975609757, "grad_norm": 0.92275071144104, "learning_rate": 5.5377220210741564e-05, "loss": 0.17762489318847657, "memory(GiB)": 153.57, "step": 6110, "token_acc": 0.9194063498027428, "train_speed(iter/s)": 0.3142 }, { "epoch": 2.330411585365854, "grad_norm": 1.448676586151123, "learning_rate": 5.5317700892735315e-05, "loss": 0.12139714956283569, "memory(GiB)": 153.57, "step": 6115, "token_acc": 0.9508716323296355, "train_speed(iter/s)": 0.314233 }, { "epoch": 2.332317073170732, "grad_norm": 2.171936511993408, "learning_rate": 5.52581739522553e-05, "loss": 0.15114911794662475, "memory(GiB)": 153.57, "step": 6120, "token_acc": 0.9299191374663073, "train_speed(iter/s)": 0.314282 }, { "epoch": 2.3342225609756095, "grad_norm": 0.9836276769638062, "learning_rate": 5.5198639474628356e-05, "loss": 0.19193636178970336, "memory(GiB)": 153.57, "step": 6125, "token_acc": 0.91959629941127, "train_speed(iter/s)": 0.314285 }, { "epoch": 2.3361280487804876, "grad_norm": 0.8592477440834045, "learning_rate": 5.5139097545192106e-05, "loss": 0.1858493447303772, "memory(GiB)": 153.57, "step": 6130, "token_acc": 0.9194958847736625, "train_speed(iter/s)": 0.314309 }, { "epoch": 2.3380335365853657, "grad_norm": 1.0779606103897095, "learning_rate": 5.507954824929488e-05, "loss": 0.1642375946044922, "memory(GiB)": 153.57, "step": 6135, "token_acc": 0.9243134628265238, "train_speed(iter/s)": 0.314346 }, { "epoch": 2.339939024390244, "grad_norm": 1.4008221626281738, "learning_rate": 5.501999167229553e-05, "loss": 0.15777859687805176, "memory(GiB)": 153.57, "step": 6140, "token_acc": 0.9366020524515394, "train_speed(iter/s)": 0.314344 }, { "epoch": 2.341844512195122, "grad_norm": 0.9784363508224487, "learning_rate": 5.496042789956338e-05, "loss": 0.20633399486541748, "memory(GiB)": 153.57, "step": 6145, "token_acc": 0.9170640680074642, "train_speed(iter/s)": 0.314298 }, { "epoch": 2.34375, "grad_norm": 1.628700613975525, "learning_rate": 5.490085701647805e-05, "loss": 0.12210159301757813, "memory(GiB)": 153.57, "step": 6150, "token_acc": 0.934769478558486, "train_speed(iter/s)": 0.31431 }, { "epoch": 2.345655487804878, "grad_norm": 1.120477318763733, "learning_rate": 5.484127910842932e-05, "loss": 0.20123779773712158, "memory(GiB)": 153.57, "step": 6155, "token_acc": 0.9247583018074821, "train_speed(iter/s)": 0.314309 }, { "epoch": 2.347560975609756, "grad_norm": 1.8749126195907593, "learning_rate": 5.478169426081712e-05, "loss": 0.15351524353027343, "memory(GiB)": 153.57, "step": 6160, "token_acc": 0.9372772630078403, "train_speed(iter/s)": 0.314352 }, { "epoch": 2.3494664634146343, "grad_norm": 0.7595517635345459, "learning_rate": 5.472210255905127e-05, "loss": 0.10642286539077758, "memory(GiB)": 153.57, "step": 6165, "token_acc": 0.9432753888380604, "train_speed(iter/s)": 0.314385 }, { "epoch": 2.3513719512195124, "grad_norm": 0.2908843755722046, "learning_rate": 5.466250408855141e-05, "loss": 0.08924359679222107, "memory(GiB)": 153.57, "step": 6170, "token_acc": 0.9434782608695652, "train_speed(iter/s)": 0.314367 }, { "epoch": 2.3532774390243905, "grad_norm": 0.923550009727478, "learning_rate": 5.46028989347469e-05, "loss": 0.2062150001525879, "memory(GiB)": 153.57, "step": 6175, "token_acc": 0.9189156626506024, "train_speed(iter/s)": 0.314336 }, { "epoch": 2.355182926829268, "grad_norm": 1.6634023189544678, "learning_rate": 5.4543287183076706e-05, "loss": 0.1774724006652832, "memory(GiB)": 153.57, "step": 6180, "token_acc": 0.9295454545454546, "train_speed(iter/s)": 0.314321 }, { "epoch": 2.357088414634146, "grad_norm": 1.224130630493164, "learning_rate": 5.4483668918989196e-05, "loss": 0.16486620903015137, "memory(GiB)": 153.57, "step": 6185, "token_acc": 0.9273527793065492, "train_speed(iter/s)": 0.314349 }, { "epoch": 2.3589939024390243, "grad_norm": 1.6442604064941406, "learning_rate": 5.4424044227942116e-05, "loss": 0.20081744194030762, "memory(GiB)": 153.57, "step": 6190, "token_acc": 0.9226441631504922, "train_speed(iter/s)": 0.314377 }, { "epoch": 2.3608993902439024, "grad_norm": 0.9977948665618896, "learning_rate": 5.436441319540241e-05, "loss": 0.17783124446868898, "memory(GiB)": 153.57, "step": 6195, "token_acc": 0.9252771974931705, "train_speed(iter/s)": 0.314357 }, { "epoch": 2.3628048780487805, "grad_norm": 2.077744245529175, "learning_rate": 5.43047759068461e-05, "loss": 0.17287496328353882, "memory(GiB)": 153.57, "step": 6200, "token_acc": 0.9309608540925267, "train_speed(iter/s)": 0.314355 }, { "epoch": 2.3647103658536586, "grad_norm": 1.0591288805007935, "learning_rate": 5.4245132447758185e-05, "loss": 0.1734602451324463, "memory(GiB)": 153.57, "step": 6205, "token_acc": 0.9151905528717122, "train_speed(iter/s)": 0.314316 }, { "epoch": 2.3666158536585367, "grad_norm": 0.17385165393352509, "learning_rate": 5.418548290363253e-05, "loss": 0.1466023325920105, "memory(GiB)": 153.57, "step": 6210, "token_acc": 0.9352771547580904, "train_speed(iter/s)": 0.314261 }, { "epoch": 2.3685213414634148, "grad_norm": 2.1194119453430176, "learning_rate": 5.412582735997169e-05, "loss": 0.1738971948623657, "memory(GiB)": 153.57, "step": 6215, "token_acc": 0.9291062236780534, "train_speed(iter/s)": 0.31427 }, { "epoch": 2.370426829268293, "grad_norm": 1.9914796352386475, "learning_rate": 5.4066165902286836e-05, "loss": 0.17239091396331788, "memory(GiB)": 153.57, "step": 6220, "token_acc": 0.9325577167787675, "train_speed(iter/s)": 0.314265 }, { "epoch": 2.3723323170731705, "grad_norm": 1.1021537780761719, "learning_rate": 5.400649861609762e-05, "loss": 0.17725691795349122, "memory(GiB)": 153.57, "step": 6225, "token_acc": 0.9310185185185185, "train_speed(iter/s)": 0.314265 }, { "epoch": 2.3742378048780486, "grad_norm": 1.0776363611221313, "learning_rate": 5.394682558693204e-05, "loss": 0.1279654622077942, "memory(GiB)": 153.57, "step": 6230, "token_acc": 0.9366404715127702, "train_speed(iter/s)": 0.314295 }, { "epoch": 2.3761432926829267, "grad_norm": 1.3200819492340088, "learning_rate": 5.3887146900326316e-05, "loss": 0.18480745553970337, "memory(GiB)": 153.57, "step": 6235, "token_acc": 0.9266206896551724, "train_speed(iter/s)": 0.314322 }, { "epoch": 2.3780487804878048, "grad_norm": 1.4872231483459473, "learning_rate": 5.38274626418248e-05, "loss": 0.14116193056106568, "memory(GiB)": 153.57, "step": 6240, "token_acc": 0.9378834355828221, "train_speed(iter/s)": 0.314339 }, { "epoch": 2.379954268292683, "grad_norm": 1.66390860080719, "learning_rate": 5.3767772896979804e-05, "loss": 0.15734695196151732, "memory(GiB)": 153.57, "step": 6245, "token_acc": 0.9416445623342176, "train_speed(iter/s)": 0.314387 }, { "epoch": 2.381859756097561, "grad_norm": 1.2814404964447021, "learning_rate": 5.370807775135155e-05, "loss": 0.20214710235595704, "memory(GiB)": 153.57, "step": 6250, "token_acc": 0.9139593002499108, "train_speed(iter/s)": 0.314438 }, { "epoch": 2.383765243902439, "grad_norm": 0.8532494902610779, "learning_rate": 5.364837729050797e-05, "loss": 0.15296640396118164, "memory(GiB)": 153.57, "step": 6255, "token_acc": 0.9330318316228909, "train_speed(iter/s)": 0.314443 }, { "epoch": 2.385670731707317, "grad_norm": 1.2483536005020142, "learning_rate": 5.3588671600024585e-05, "loss": 0.1635507583618164, "memory(GiB)": 153.57, "step": 6260, "token_acc": 0.9310806289729007, "train_speed(iter/s)": 0.314448 }, { "epoch": 2.3875762195121952, "grad_norm": 1.1608022451400757, "learning_rate": 5.352896076548447e-05, "loss": 0.1702750563621521, "memory(GiB)": 153.57, "step": 6265, "token_acc": 0.9316326530612244, "train_speed(iter/s)": 0.314487 }, { "epoch": 2.3894817073170733, "grad_norm": 1.5140377283096313, "learning_rate": 5.346924487247804e-05, "loss": 0.17604689598083495, "memory(GiB)": 153.57, "step": 6270, "token_acc": 0.9343990348363747, "train_speed(iter/s)": 0.314483 }, { "epoch": 2.3913871951219514, "grad_norm": 1.322965145111084, "learning_rate": 5.3409524006602984e-05, "loss": 0.12671231031417846, "memory(GiB)": 153.57, "step": 6275, "token_acc": 0.9335624284077892, "train_speed(iter/s)": 0.314519 }, { "epoch": 2.3932926829268295, "grad_norm": 1.1214691400527954, "learning_rate": 5.334979825346409e-05, "loss": 0.12806609869003296, "memory(GiB)": 153.57, "step": 6280, "token_acc": 0.945773233522001, "train_speed(iter/s)": 0.314527 }, { "epoch": 2.395198170731707, "grad_norm": 1.1113719940185547, "learning_rate": 5.329006769867315e-05, "loss": 0.19698708057403563, "memory(GiB)": 153.57, "step": 6285, "token_acc": 0.9175442527447905, "train_speed(iter/s)": 0.314561 }, { "epoch": 2.3971036585365852, "grad_norm": 1.1628334522247314, "learning_rate": 5.3230332427848896e-05, "loss": 0.1588343381881714, "memory(GiB)": 153.57, "step": 6290, "token_acc": 0.9342651536119568, "train_speed(iter/s)": 0.314545 }, { "epoch": 2.3990091463414633, "grad_norm": 0.8092828392982483, "learning_rate": 5.317059252661675e-05, "loss": 0.16678247451782227, "memory(GiB)": 153.57, "step": 6295, "token_acc": 0.9178175232057958, "train_speed(iter/s)": 0.314512 }, { "epoch": 2.4009146341463414, "grad_norm": 1.2530735731124878, "learning_rate": 5.3110848080608796e-05, "loss": 0.19258521795272826, "memory(GiB)": 153.57, "step": 6300, "token_acc": 0.9263865546218487, "train_speed(iter/s)": 0.314518 }, { "epoch": 2.4028201219512195, "grad_norm": 0.934618353843689, "learning_rate": 5.3051099175463636e-05, "loss": 0.15105446577072143, "memory(GiB)": 153.57, "step": 6305, "token_acc": 0.9381667325167918, "train_speed(iter/s)": 0.314468 }, { "epoch": 2.4047256097560976, "grad_norm": 1.405389666557312, "learning_rate": 5.2991345896826286e-05, "loss": 0.16951391696929932, "memory(GiB)": 153.57, "step": 6310, "token_acc": 0.9190272581507215, "train_speed(iter/s)": 0.314503 }, { "epoch": 2.4066310975609757, "grad_norm": 1.7480323314666748, "learning_rate": 5.293158833034799e-05, "loss": 0.18014365434646606, "memory(GiB)": 153.57, "step": 6315, "token_acc": 0.922089825847846, "train_speed(iter/s)": 0.314533 }, { "epoch": 2.408536585365854, "grad_norm": 1.1300103664398193, "learning_rate": 5.287182656168618e-05, "loss": 0.17137905359268188, "memory(GiB)": 153.57, "step": 6320, "token_acc": 0.9313668787352998, "train_speed(iter/s)": 0.314553 }, { "epoch": 2.410442073170732, "grad_norm": 0.8372225761413574, "learning_rate": 5.2812060676504235e-05, "loss": 0.1818007469177246, "memory(GiB)": 153.57, "step": 6325, "token_acc": 0.928886217948718, "train_speed(iter/s)": 0.314572 }, { "epoch": 2.4123475609756095, "grad_norm": 1.0470532178878784, "learning_rate": 5.275229076047156e-05, "loss": 0.20168066024780273, "memory(GiB)": 153.57, "step": 6330, "token_acc": 0.9212113435134095, "train_speed(iter/s)": 0.314561 }, { "epoch": 2.4142530487804876, "grad_norm": 0.9655967354774475, "learning_rate": 5.269251689926322e-05, "loss": 0.16285194158554078, "memory(GiB)": 153.57, "step": 6335, "token_acc": 0.931211317418214, "train_speed(iter/s)": 0.314577 }, { "epoch": 2.4161585365853657, "grad_norm": 2.204951286315918, "learning_rate": 5.2632739178559995e-05, "loss": 0.1754870057106018, "memory(GiB)": 153.57, "step": 6340, "token_acc": 0.9343137254901961, "train_speed(iter/s)": 0.314616 }, { "epoch": 2.418064024390244, "grad_norm": 0.8978320956230164, "learning_rate": 5.257295768404818e-05, "loss": 0.13774576187133789, "memory(GiB)": 153.57, "step": 6345, "token_acc": 0.9412892403546609, "train_speed(iter/s)": 0.314628 }, { "epoch": 2.419969512195122, "grad_norm": 2.3456528186798096, "learning_rate": 5.2513172501419484e-05, "loss": 0.16455404758453368, "memory(GiB)": 153.57, "step": 6350, "token_acc": 0.9271168274383709, "train_speed(iter/s)": 0.314645 }, { "epoch": 2.421875, "grad_norm": 1.1990785598754883, "learning_rate": 5.245338371637091e-05, "loss": 0.15462743043899535, "memory(GiB)": 153.57, "step": 6355, "token_acc": 0.9319983312473926, "train_speed(iter/s)": 0.314669 }, { "epoch": 2.423780487804878, "grad_norm": 1.038454532623291, "learning_rate": 5.2393591414604604e-05, "loss": 0.19244033098220825, "memory(GiB)": 153.57, "step": 6360, "token_acc": 0.9185700099304865, "train_speed(iter/s)": 0.314657 }, { "epoch": 2.425685975609756, "grad_norm": 1.7607699632644653, "learning_rate": 5.233379568182778e-05, "loss": 0.13412306308746338, "memory(GiB)": 153.57, "step": 6365, "token_acc": 0.9387915249803819, "train_speed(iter/s)": 0.314681 }, { "epoch": 2.4275914634146343, "grad_norm": 1.010088562965393, "learning_rate": 5.2273996603752525e-05, "loss": 0.12348319292068481, "memory(GiB)": 153.57, "step": 6370, "token_acc": 0.9496420047732697, "train_speed(iter/s)": 0.314691 }, { "epoch": 2.4294969512195124, "grad_norm": 1.0274627208709717, "learning_rate": 5.2214194266095796e-05, "loss": 0.14929742813110353, "memory(GiB)": 153.57, "step": 6375, "token_acc": 0.9369670003707824, "train_speed(iter/s)": 0.314733 }, { "epoch": 2.4314024390243905, "grad_norm": 1.3717832565307617, "learning_rate": 5.215438875457914e-05, "loss": 0.15023504495620726, "memory(GiB)": 153.57, "step": 6380, "token_acc": 0.937888198757764, "train_speed(iter/s)": 0.314733 }, { "epoch": 2.433307926829268, "grad_norm": 1.0776559114456177, "learning_rate": 5.2094580154928716e-05, "loss": 0.23598806858062743, "memory(GiB)": 153.57, "step": 6385, "token_acc": 0.9013144168579178, "train_speed(iter/s)": 0.314753 }, { "epoch": 2.435213414634146, "grad_norm": 1.6720821857452393, "learning_rate": 5.2034768552875065e-05, "loss": 0.23355813026428224, "memory(GiB)": 153.57, "step": 6390, "token_acc": 0.9135545023696683, "train_speed(iter/s)": 0.314764 }, { "epoch": 2.4371189024390243, "grad_norm": 1.2007592916488647, "learning_rate": 5.197495403415308e-05, "loss": 0.1541445732116699, "memory(GiB)": 153.57, "step": 6395, "token_acc": 0.929407883885865, "train_speed(iter/s)": 0.314758 }, { "epoch": 2.4390243902439024, "grad_norm": 1.3104238510131836, "learning_rate": 5.191513668450178e-05, "loss": 0.16566379070281984, "memory(GiB)": 153.57, "step": 6400, "token_acc": 0.9346534653465347, "train_speed(iter/s)": 0.314787 }, { "epoch": 2.4390243902439024, "eval_loss": 0.16659080982208252, "eval_runtime": 32.4084, "eval_samples_per_second": 3.271, "eval_steps_per_second": 3.271, "eval_token_acc": 0.9118430978124215, "step": 6400 }, { "epoch": 2.4409298780487805, "grad_norm": 1.0337032079696655, "learning_rate": 5.1855316589664285e-05, "loss": 0.15250234603881835, "memory(GiB)": 153.57, "step": 6405, "token_acc": 0.9151868654136963, "train_speed(iter/s)": 0.314326 }, { "epoch": 2.4428353658536586, "grad_norm": 1.1108150482177734, "learning_rate": 5.1795493835387596e-05, "loss": 0.19173382520675658, "memory(GiB)": 153.57, "step": 6410, "token_acc": 0.9207172175415856, "train_speed(iter/s)": 0.31434 }, { "epoch": 2.4447408536585367, "grad_norm": 1.2530125379562378, "learning_rate": 5.173566850742263e-05, "loss": 0.17565642595291137, "memory(GiB)": 153.57, "step": 6415, "token_acc": 0.9147331786542924, "train_speed(iter/s)": 0.314362 }, { "epoch": 2.4466463414634148, "grad_norm": 1.6191238164901733, "learning_rate": 5.167584069152388e-05, "loss": 0.17903728485107423, "memory(GiB)": 153.57, "step": 6420, "token_acc": 0.9281616363182839, "train_speed(iter/s)": 0.314299 }, { "epoch": 2.448551829268293, "grad_norm": 1.739485740661621, "learning_rate": 5.161601047344946e-05, "loss": 0.1639564037322998, "memory(GiB)": 153.57, "step": 6425, "token_acc": 0.9302231237322515, "train_speed(iter/s)": 0.314304 }, { "epoch": 2.4504573170731705, "grad_norm": 1.320376992225647, "learning_rate": 5.1556177938960915e-05, "loss": 0.16518037319183348, "memory(GiB)": 153.57, "step": 6430, "token_acc": 0.9252811997857525, "train_speed(iter/s)": 0.314274 }, { "epoch": 2.4523628048780486, "grad_norm": 1.3424999713897705, "learning_rate": 5.149634317382312e-05, "loss": 0.1413271903991699, "memory(GiB)": 153.57, "step": 6435, "token_acc": 0.9351684172401303, "train_speed(iter/s)": 0.314247 }, { "epoch": 2.4542682926829267, "grad_norm": 1.7341127395629883, "learning_rate": 5.143650626380416e-05, "loss": 0.19674510955810548, "memory(GiB)": 153.57, "step": 6440, "token_acc": 0.9153599093141885, "train_speed(iter/s)": 0.314254 }, { "epoch": 2.4561737804878048, "grad_norm": 1.6260411739349365, "learning_rate": 5.1376667294675155e-05, "loss": 0.1560159683227539, "memory(GiB)": 153.57, "step": 6445, "token_acc": 0.9337855297157622, "train_speed(iter/s)": 0.314291 }, { "epoch": 2.458079268292683, "grad_norm": 1.7354668378829956, "learning_rate": 5.131682635221019e-05, "loss": 0.12393393516540527, "memory(GiB)": 153.57, "step": 6450, "token_acc": 0.9359286293592863, "train_speed(iter/s)": 0.314343 }, { "epoch": 2.459984756097561, "grad_norm": 1.0188342332839966, "learning_rate": 5.125698352218621e-05, "loss": 0.162791109085083, "memory(GiB)": 153.57, "step": 6455, "token_acc": 0.927422956677088, "train_speed(iter/s)": 0.314362 }, { "epoch": 2.461890243902439, "grad_norm": 0.8178674578666687, "learning_rate": 5.1197138890382835e-05, "loss": 0.15361305475234985, "memory(GiB)": 153.57, "step": 6460, "token_acc": 0.943099710982659, "train_speed(iter/s)": 0.314259 }, { "epoch": 2.463795731707317, "grad_norm": 1.1139410734176636, "learning_rate": 5.113729254258227e-05, "loss": 0.1474296808242798, "memory(GiB)": 153.57, "step": 6465, "token_acc": 0.934305027460921, "train_speed(iter/s)": 0.314272 }, { "epoch": 2.4657012195121952, "grad_norm": 1.7316724061965942, "learning_rate": 5.107744456456919e-05, "loss": 0.13671519756317138, "memory(GiB)": 153.57, "step": 6470, "token_acc": 0.9410265125764786, "train_speed(iter/s)": 0.314281 }, { "epoch": 2.4676067073170733, "grad_norm": 1.1584030389785767, "learning_rate": 5.1017595042130596e-05, "loss": 0.16846590042114257, "memory(GiB)": 153.57, "step": 6475, "token_acc": 0.933649289099526, "train_speed(iter/s)": 0.314312 }, { "epoch": 2.4695121951219514, "grad_norm": 1.5491071939468384, "learning_rate": 5.095774406105571e-05, "loss": 0.14999698400497435, "memory(GiB)": 153.57, "step": 6480, "token_acc": 0.9378238341968912, "train_speed(iter/s)": 0.314338 }, { "epoch": 2.4714176829268295, "grad_norm": 1.2891926765441895, "learning_rate": 5.089789170713585e-05, "loss": 0.21982617378234864, "memory(GiB)": 153.57, "step": 6485, "token_acc": 0.9082883766552231, "train_speed(iter/s)": 0.314289 }, { "epoch": 2.473323170731707, "grad_norm": 1.0814622640609741, "learning_rate": 5.0838038066164285e-05, "loss": 0.15485773086547852, "memory(GiB)": 153.57, "step": 6490, "token_acc": 0.9368255395683454, "train_speed(iter/s)": 0.314304 }, { "epoch": 2.4752286585365852, "grad_norm": 0.24097014963626862, "learning_rate": 5.0778183223936146e-05, "loss": 0.09613268375396729, "memory(GiB)": 153.57, "step": 6495, "token_acc": 0.9522821576763485, "train_speed(iter/s)": 0.314335 }, { "epoch": 2.4771341463414633, "grad_norm": 1.3589218854904175, "learning_rate": 5.071832726624828e-05, "loss": 0.1649237871170044, "memory(GiB)": 153.57, "step": 6500, "token_acc": 0.9307958477508651, "train_speed(iter/s)": 0.314299 }, { "epoch": 2.4790396341463414, "grad_norm": 1.2895249128341675, "learning_rate": 5.065847027889913e-05, "loss": 0.18279740810394288, "memory(GiB)": 153.57, "step": 6505, "token_acc": 0.9260585633031388, "train_speed(iter/s)": 0.314299 }, { "epoch": 2.4809451219512195, "grad_norm": 0.9553548097610474, "learning_rate": 5.05986123476886e-05, "loss": 0.16541427373886108, "memory(GiB)": 153.57, "step": 6510, "token_acc": 0.935432443204464, "train_speed(iter/s)": 0.314308 }, { "epoch": 2.4828506097560976, "grad_norm": 0.7765644788742065, "learning_rate": 5.053875355841799e-05, "loss": 0.14994572401046752, "memory(GiB)": 153.57, "step": 6515, "token_acc": 0.9389517569982132, "train_speed(iter/s)": 0.314336 }, { "epoch": 2.4847560975609757, "grad_norm": 1.488777756690979, "learning_rate": 5.0478893996889796e-05, "loss": 0.15056474208831788, "memory(GiB)": 153.57, "step": 6520, "token_acc": 0.928117224218966, "train_speed(iter/s)": 0.314365 }, { "epoch": 2.486661585365854, "grad_norm": 1.2588107585906982, "learning_rate": 5.0419033748907626e-05, "loss": 0.18655786514282227, "memory(GiB)": 153.57, "step": 6525, "token_acc": 0.9137483787289234, "train_speed(iter/s)": 0.314346 }, { "epoch": 2.488567073170732, "grad_norm": 1.555403232574463, "learning_rate": 5.0359172900276063e-05, "loss": 0.20142817497253418, "memory(GiB)": 153.57, "step": 6530, "token_acc": 0.9159199237368922, "train_speed(iter/s)": 0.314347 }, { "epoch": 2.4904725609756095, "grad_norm": 1.1079380512237549, "learning_rate": 5.029931153680059e-05, "loss": 0.14459507465362548, "memory(GiB)": 153.57, "step": 6535, "token_acc": 0.9361256544502617, "train_speed(iter/s)": 0.314349 }, { "epoch": 2.4923780487804876, "grad_norm": 1.7830029726028442, "learning_rate": 5.023944974428738e-05, "loss": 0.18489620685577393, "memory(GiB)": 153.57, "step": 6540, "token_acc": 0.9206578709560816, "train_speed(iter/s)": 0.314347 }, { "epoch": 2.4942835365853657, "grad_norm": 0.9545581340789795, "learning_rate": 5.0179587608543246e-05, "loss": 0.14646226167678833, "memory(GiB)": 153.57, "step": 6545, "token_acc": 0.9454392247735411, "train_speed(iter/s)": 0.314358 }, { "epoch": 2.496189024390244, "grad_norm": 1.0005924701690674, "learning_rate": 5.011972521537547e-05, "loss": 0.14096106290817262, "memory(GiB)": 153.57, "step": 6550, "token_acc": 0.9425437451031601, "train_speed(iter/s)": 0.314382 }, { "epoch": 2.498094512195122, "grad_norm": 1.9091421365737915, "learning_rate": 5.005986265059176e-05, "loss": 0.18927040100097656, "memory(GiB)": 153.57, "step": 6555, "token_acc": 0.9171904892047007, "train_speed(iter/s)": 0.314421 }, { "epoch": 2.5, "grad_norm": 1.2497636079788208, "learning_rate": 5e-05, "loss": 0.1960073232650757, "memory(GiB)": 153.57, "step": 6560, "token_acc": 0.9303342705404561, "train_speed(iter/s)": 0.314415 }, { "epoch": 2.501905487804878, "grad_norm": 1.5191583633422852, "learning_rate": 4.9940137349408254e-05, "loss": 0.17366509437561034, "memory(GiB)": 153.57, "step": 6565, "token_acc": 0.9267947993216507, "train_speed(iter/s)": 0.314383 }, { "epoch": 2.503810975609756, "grad_norm": 1.8458067178726196, "learning_rate": 4.988027478462454e-05, "loss": 0.1852563738822937, "memory(GiB)": 153.57, "step": 6570, "token_acc": 0.9211510791366907, "train_speed(iter/s)": 0.314407 }, { "epoch": 2.5057164634146343, "grad_norm": 1.2106804847717285, "learning_rate": 4.9820412391456786e-05, "loss": 0.14192707538604737, "memory(GiB)": 153.57, "step": 6575, "token_acc": 0.9404264673554628, "train_speed(iter/s)": 0.314365 }, { "epoch": 2.5076219512195124, "grad_norm": 1.276026964187622, "learning_rate": 4.976055025571264e-05, "loss": 0.12350438833236695, "memory(GiB)": 153.57, "step": 6580, "token_acc": 0.9463414634146341, "train_speed(iter/s)": 0.314385 }, { "epoch": 2.5095274390243905, "grad_norm": 2.0160419940948486, "learning_rate": 4.9700688463199415e-05, "loss": 0.16057975292205812, "memory(GiB)": 153.57, "step": 6585, "token_acc": 0.9321780059905862, "train_speed(iter/s)": 0.314409 }, { "epoch": 2.5114329268292686, "grad_norm": 1.222845196723938, "learning_rate": 4.9640827099723935e-05, "loss": 0.1847793936729431, "memory(GiB)": 153.57, "step": 6590, "token_acc": 0.9197116996312437, "train_speed(iter/s)": 0.314405 }, { "epoch": 2.513338414634146, "grad_norm": 1.8089131116867065, "learning_rate": 4.958096625109238e-05, "loss": 0.15947959423065186, "memory(GiB)": 153.57, "step": 6595, "token_acc": 0.936806411837238, "train_speed(iter/s)": 0.314428 }, { "epoch": 2.5152439024390243, "grad_norm": 0.8941625356674194, "learning_rate": 4.9521106003110216e-05, "loss": 0.14229111671447753, "memory(GiB)": 153.57, "step": 6600, "token_acc": 0.939938171647284, "train_speed(iter/s)": 0.314417 }, { "epoch": 2.5171493902439024, "grad_norm": 1.1838666200637817, "learning_rate": 4.946124644158202e-05, "loss": 0.15057830810546874, "memory(GiB)": 153.57, "step": 6605, "token_acc": 0.9335526315789474, "train_speed(iter/s)": 0.31444 }, { "epoch": 2.5190548780487805, "grad_norm": 0.9073184132575989, "learning_rate": 4.940138765231141e-05, "loss": 0.16195249557495117, "memory(GiB)": 153.57, "step": 6610, "token_acc": 0.9240506329113924, "train_speed(iter/s)": 0.314455 }, { "epoch": 2.5209603658536586, "grad_norm": 0.730213463306427, "learning_rate": 4.9341529721100894e-05, "loss": 0.15835602283477784, "memory(GiB)": 153.57, "step": 6615, "token_acc": 0.936838124054463, "train_speed(iter/s)": 0.314438 }, { "epoch": 2.5228658536585367, "grad_norm": 1.5524035692214966, "learning_rate": 4.9281672733751746e-05, "loss": 0.18178175687789916, "memory(GiB)": 153.57, "step": 6620, "token_acc": 0.9336734693877551, "train_speed(iter/s)": 0.314455 }, { "epoch": 2.5247713414634148, "grad_norm": 1.1083933115005493, "learning_rate": 4.922181677606386e-05, "loss": 0.17921347618103028, "memory(GiB)": 153.57, "step": 6625, "token_acc": 0.9265081548437807, "train_speed(iter/s)": 0.314458 }, { "epoch": 2.526676829268293, "grad_norm": 1.1773450374603271, "learning_rate": 4.916196193383572e-05, "loss": 0.14589173793792726, "memory(GiB)": 153.57, "step": 6630, "token_acc": 0.9304054054054054, "train_speed(iter/s)": 0.314511 }, { "epoch": 2.5285823170731705, "grad_norm": 1.0450482368469238, "learning_rate": 4.9102108292864156e-05, "loss": 0.11044459342956543, "memory(GiB)": 153.57, "step": 6635, "token_acc": 0.9541498620250478, "train_speed(iter/s)": 0.314517 }, { "epoch": 2.5304878048780486, "grad_norm": 1.015348196029663, "learning_rate": 4.9042255938944296e-05, "loss": 0.18297741413116456, "memory(GiB)": 153.57, "step": 6640, "token_acc": 0.9275245394648379, "train_speed(iter/s)": 0.3145 }, { "epoch": 2.5323932926829267, "grad_norm": 1.3681613206863403, "learning_rate": 4.8982404957869416e-05, "loss": 0.18281142711639403, "memory(GiB)": 153.57, "step": 6645, "token_acc": 0.9255143059825017, "train_speed(iter/s)": 0.314516 }, { "epoch": 2.5342987804878048, "grad_norm": 1.969434380531311, "learning_rate": 4.892255543543083e-05, "loss": 0.15774544477462768, "memory(GiB)": 153.57, "step": 6650, "token_acc": 0.9324200913242009, "train_speed(iter/s)": 0.314395 }, { "epoch": 2.536204268292683, "grad_norm": 1.1668658256530762, "learning_rate": 4.886270745741774e-05, "loss": 0.15459775924682617, "memory(GiB)": 153.57, "step": 6655, "token_acc": 0.9261403756531563, "train_speed(iter/s)": 0.314391 }, { "epoch": 2.538109756097561, "grad_norm": 0.9044528603553772, "learning_rate": 4.880286110961718e-05, "loss": 0.11513693332672119, "memory(GiB)": 153.57, "step": 6660, "token_acc": 0.9427936684254373, "train_speed(iter/s)": 0.314412 }, { "epoch": 2.540015243902439, "grad_norm": 2.1839723587036133, "learning_rate": 4.874301647781379e-05, "loss": 0.15619791746139527, "memory(GiB)": 153.57, "step": 6665, "token_acc": 0.9287515762925599, "train_speed(iter/s)": 0.31445 }, { "epoch": 2.541920731707317, "grad_norm": 1.3937790393829346, "learning_rate": 4.8683173647789806e-05, "loss": 0.17389333248138428, "memory(GiB)": 153.57, "step": 6670, "token_acc": 0.9248186414596614, "train_speed(iter/s)": 0.314464 }, { "epoch": 2.5438262195121952, "grad_norm": 1.2022535800933838, "learning_rate": 4.862333270532486e-05, "loss": 0.21394853591918944, "memory(GiB)": 153.57, "step": 6675, "token_acc": 0.9170524691358025, "train_speed(iter/s)": 0.314418 }, { "epoch": 2.5457317073170733, "grad_norm": 1.1655757427215576, "learning_rate": 4.856349373619585e-05, "loss": 0.17035974264144899, "memory(GiB)": 153.57, "step": 6680, "token_acc": 0.9270426495860021, "train_speed(iter/s)": 0.314405 }, { "epoch": 2.5476371951219514, "grad_norm": 0.9396166801452637, "learning_rate": 4.850365682617688e-05, "loss": 0.1777457118034363, "memory(GiB)": 153.57, "step": 6685, "token_acc": 0.9321509009009009, "train_speed(iter/s)": 0.314361 }, { "epoch": 2.5495426829268295, "grad_norm": 1.0270785093307495, "learning_rate": 4.8443822061039104e-05, "loss": 0.14213575124740602, "memory(GiB)": 153.57, "step": 6690, "token_acc": 0.941783345615328, "train_speed(iter/s)": 0.314387 }, { "epoch": 2.551448170731707, "grad_norm": 1.4533977508544922, "learning_rate": 4.8383989526550564e-05, "loss": 0.2056816339492798, "memory(GiB)": 153.57, "step": 6695, "token_acc": 0.9177561207511291, "train_speed(iter/s)": 0.314339 }, { "epoch": 2.5533536585365852, "grad_norm": 1.165143609046936, "learning_rate": 4.832415930847615e-05, "loss": 0.1948249340057373, "memory(GiB)": 153.57, "step": 6700, "token_acc": 0.9293487359785703, "train_speed(iter/s)": 0.314296 }, { "epoch": 2.5552591463414633, "grad_norm": 2.0690195560455322, "learning_rate": 4.826433149257738e-05, "loss": 0.19879088401794434, "memory(GiB)": 153.57, "step": 6705, "token_acc": 0.9146868250539957, "train_speed(iter/s)": 0.314296 }, { "epoch": 2.5571646341463414, "grad_norm": 1.1592767238616943, "learning_rate": 4.8204506164612395e-05, "loss": 0.159381103515625, "memory(GiB)": 153.57, "step": 6710, "token_acc": 0.9408983451536643, "train_speed(iter/s)": 0.314319 }, { "epoch": 2.5590701219512195, "grad_norm": 0.8644779324531555, "learning_rate": 4.814468341033573e-05, "loss": 0.16455976963043212, "memory(GiB)": 153.57, "step": 6715, "token_acc": 0.9314945285564383, "train_speed(iter/s)": 0.314324 }, { "epoch": 2.5609756097560976, "grad_norm": 1.6967086791992188, "learning_rate": 4.8084863315498234e-05, "loss": 0.1872046947479248, "memory(GiB)": 153.57, "step": 6720, "token_acc": 0.926674593737614, "train_speed(iter/s)": 0.314268 }, { "epoch": 2.5628810975609757, "grad_norm": 0.9208652377128601, "learning_rate": 4.8025045965846935e-05, "loss": 0.13363845348358155, "memory(GiB)": 153.57, "step": 6725, "token_acc": 0.933933933933934, "train_speed(iter/s)": 0.314289 }, { "epoch": 2.564786585365854, "grad_norm": 1.1950331926345825, "learning_rate": 4.796523144712494e-05, "loss": 0.14665900468826293, "memory(GiB)": 153.57, "step": 6730, "token_acc": 0.9365125804564073, "train_speed(iter/s)": 0.314306 }, { "epoch": 2.5666920731707314, "grad_norm": 1.2610533237457275, "learning_rate": 4.79054198450713e-05, "loss": 0.14411709308624268, "memory(GiB)": 153.57, "step": 6735, "token_acc": 0.9297096703776468, "train_speed(iter/s)": 0.314322 }, { "epoch": 2.5685975609756095, "grad_norm": 1.8237422704696655, "learning_rate": 4.7845611245420876e-05, "loss": 0.1943683385848999, "memory(GiB)": 153.57, "step": 6740, "token_acc": 0.912568306010929, "train_speed(iter/s)": 0.314358 }, { "epoch": 2.5705030487804876, "grad_norm": 1.2172704935073853, "learning_rate": 4.778580573390421e-05, "loss": 0.1532779812812805, "memory(GiB)": 153.57, "step": 6745, "token_acc": 0.9356796116504854, "train_speed(iter/s)": 0.314362 }, { "epoch": 2.5724085365853657, "grad_norm": 1.125352144241333, "learning_rate": 4.772600339624748e-05, "loss": 0.150829541683197, "memory(GiB)": 153.57, "step": 6750, "token_acc": 0.9385230993088396, "train_speed(iter/s)": 0.314248 }, { "epoch": 2.574314024390244, "grad_norm": 1.2057609558105469, "learning_rate": 4.766620431817224e-05, "loss": 0.18405193090438843, "memory(GiB)": 153.57, "step": 6755, "token_acc": 0.9073064340239912, "train_speed(iter/s)": 0.314283 }, { "epoch": 2.576219512195122, "grad_norm": 1.289496898651123, "learning_rate": 4.760640858539541e-05, "loss": 0.14850639104843139, "memory(GiB)": 153.57, "step": 6760, "token_acc": 0.92595321421993, "train_speed(iter/s)": 0.314307 }, { "epoch": 2.578125, "grad_norm": 1.2412363290786743, "learning_rate": 4.7546616283629105e-05, "loss": 0.20188722610473633, "memory(GiB)": 153.57, "step": 6765, "token_acc": 0.9249967502924736, "train_speed(iter/s)": 0.314294 }, { "epoch": 2.580030487804878, "grad_norm": 0.9661383628845215, "learning_rate": 4.748682749858053e-05, "loss": 0.17474853992462158, "memory(GiB)": 153.57, "step": 6770, "token_acc": 0.9286589850453543, "train_speed(iter/s)": 0.314274 }, { "epoch": 2.581935975609756, "grad_norm": 1.5076556205749512, "learning_rate": 4.7427042315951834e-05, "loss": 0.19878797531127929, "memory(GiB)": 153.57, "step": 6775, "token_acc": 0.9141645462256149, "train_speed(iter/s)": 0.314282 }, { "epoch": 2.5838414634146343, "grad_norm": 1.8269941806793213, "learning_rate": 4.736726082144002e-05, "loss": 0.15111639499664306, "memory(GiB)": 153.57, "step": 6780, "token_acc": 0.9331997327989312, "train_speed(iter/s)": 0.314311 }, { "epoch": 2.5857469512195124, "grad_norm": 1.2316193580627441, "learning_rate": 4.7307483100736796e-05, "loss": 0.13759979009628295, "memory(GiB)": 153.57, "step": 6785, "token_acc": 0.9338124054462935, "train_speed(iter/s)": 0.314349 }, { "epoch": 2.5876524390243905, "grad_norm": 1.185383677482605, "learning_rate": 4.724770923952844e-05, "loss": 0.1930402398109436, "memory(GiB)": 153.57, "step": 6790, "token_acc": 0.9165942658557776, "train_speed(iter/s)": 0.31436 }, { "epoch": 2.5895579268292686, "grad_norm": 1.062475562095642, "learning_rate": 4.7187939323495756e-05, "loss": 0.15542428493499755, "memory(GiB)": 153.57, "step": 6795, "token_acc": 0.9373083475298126, "train_speed(iter/s)": 0.314325 }, { "epoch": 2.591463414634146, "grad_norm": 1.661979079246521, "learning_rate": 4.712817343831384e-05, "loss": 0.17357275485992432, "memory(GiB)": 153.57, "step": 6800, "token_acc": 0.9305654974946314, "train_speed(iter/s)": 0.314356 }, { "epoch": 2.5933689024390243, "grad_norm": 1.0826654434204102, "learning_rate": 4.706841166965201e-05, "loss": 0.19424721002578735, "memory(GiB)": 153.57, "step": 6805, "token_acc": 0.9146301726946888, "train_speed(iter/s)": 0.31436 }, { "epoch": 2.5952743902439024, "grad_norm": 1.3281865119934082, "learning_rate": 4.7008654103173726e-05, "loss": 0.1434034824371338, "memory(GiB)": 153.57, "step": 6810, "token_acc": 0.9239130434782609, "train_speed(iter/s)": 0.314407 }, { "epoch": 2.5971798780487805, "grad_norm": 1.1273936033248901, "learning_rate": 4.6948900824536376e-05, "loss": 0.17004773616790772, "memory(GiB)": 153.57, "step": 6815, "token_acc": 0.931248064416228, "train_speed(iter/s)": 0.314431 }, { "epoch": 2.5990853658536586, "grad_norm": 1.453766107559204, "learning_rate": 4.688915191939123e-05, "loss": 0.16118279695510865, "memory(GiB)": 153.57, "step": 6820, "token_acc": 0.9341608202914193, "train_speed(iter/s)": 0.314461 }, { "epoch": 2.6009908536585367, "grad_norm": 1.5786118507385254, "learning_rate": 4.682940747338328e-05, "loss": 0.19999067783355712, "memory(GiB)": 153.57, "step": 6825, "token_acc": 0.9238457877201333, "train_speed(iter/s)": 0.314485 }, { "epoch": 2.6028963414634148, "grad_norm": 0.85325026512146, "learning_rate": 4.676966757215112e-05, "loss": 0.1147286057472229, "memory(GiB)": 153.57, "step": 6830, "token_acc": 0.9476971116315379, "train_speed(iter/s)": 0.314515 }, { "epoch": 2.604801829268293, "grad_norm": 1.393913984298706, "learning_rate": 4.670993230132685e-05, "loss": 0.1647796392440796, "memory(GiB)": 153.57, "step": 6835, "token_acc": 0.9222070579060245, "train_speed(iter/s)": 0.314529 }, { "epoch": 2.6067073170731705, "grad_norm": 1.597601056098938, "learning_rate": 4.665020174653592e-05, "loss": 0.1784730911254883, "memory(GiB)": 153.57, "step": 6840, "token_acc": 0.9261744966442953, "train_speed(iter/s)": 0.314528 }, { "epoch": 2.6086128048780486, "grad_norm": 1.2646219730377197, "learning_rate": 4.659047599339703e-05, "loss": 0.13622626066207885, "memory(GiB)": 153.57, "step": 6845, "token_acc": 0.9350353316617278, "train_speed(iter/s)": 0.314545 }, { "epoch": 2.6105182926829267, "grad_norm": 0.7483518123626709, "learning_rate": 4.6530755127521964e-05, "loss": 0.13168367147445678, "memory(GiB)": 153.57, "step": 6850, "token_acc": 0.9388686131386861, "train_speed(iter/s)": 0.314519 }, { "epoch": 2.6124237804878048, "grad_norm": 1.1599470376968384, "learning_rate": 4.647103923451555e-05, "loss": 0.16102910041809082, "memory(GiB)": 153.57, "step": 6855, "token_acc": 0.9370937790157846, "train_speed(iter/s)": 0.314528 }, { "epoch": 2.614329268292683, "grad_norm": 1.6386523246765137, "learning_rate": 4.641132839997543e-05, "loss": 0.20019955635070802, "memory(GiB)": 153.57, "step": 6860, "token_acc": 0.9195966907962771, "train_speed(iter/s)": 0.314495 }, { "epoch": 2.616234756097561, "grad_norm": 1.5672067403793335, "learning_rate": 4.635162270949206e-05, "loss": 0.17585288286209105, "memory(GiB)": 153.57, "step": 6865, "token_acc": 0.919647355163728, "train_speed(iter/s)": 0.31446 }, { "epoch": 2.618140243902439, "grad_norm": 1.1685763597488403, "learning_rate": 4.6291922248648456e-05, "loss": 0.1746552586555481, "memory(GiB)": 153.57, "step": 6870, "token_acc": 0.9337326843878697, "train_speed(iter/s)": 0.314469 }, { "epoch": 2.620045731707317, "grad_norm": 1.079697847366333, "learning_rate": 4.62322271030202e-05, "loss": 0.13822720050811768, "memory(GiB)": 153.57, "step": 6875, "token_acc": 0.9454258675078865, "train_speed(iter/s)": 0.314439 }, { "epoch": 2.6219512195121952, "grad_norm": 1.7287964820861816, "learning_rate": 4.6172537358175214e-05, "loss": 0.18126639127731323, "memory(GiB)": 153.57, "step": 6880, "token_acc": 0.9265734265734266, "train_speed(iter/s)": 0.31444 }, { "epoch": 2.6238567073170733, "grad_norm": 0.9033949375152588, "learning_rate": 4.6112853099673696e-05, "loss": 0.15116531848907472, "memory(GiB)": 153.57, "step": 6885, "token_acc": 0.9219219219219219, "train_speed(iter/s)": 0.314409 }, { "epoch": 2.6257621951219514, "grad_norm": 0.9144802689552307, "learning_rate": 4.605317441306798e-05, "loss": 0.14724562168121338, "memory(GiB)": 153.57, "step": 6890, "token_acc": 0.9271418934112885, "train_speed(iter/s)": 0.314409 }, { "epoch": 2.6276676829268295, "grad_norm": 1.3541444540023804, "learning_rate": 4.599350138390239e-05, "loss": 0.24471018314361573, "memory(GiB)": 153.57, "step": 6895, "token_acc": 0.8812301166489925, "train_speed(iter/s)": 0.314442 }, { "epoch": 2.629573170731707, "grad_norm": 2.005352258682251, "learning_rate": 4.5933834097713176e-05, "loss": 0.13841886520385743, "memory(GiB)": 153.57, "step": 6900, "token_acc": 0.9321533923303835, "train_speed(iter/s)": 0.314472 }, { "epoch": 2.6314786585365852, "grad_norm": 1.4863495826721191, "learning_rate": 4.5874172640028326e-05, "loss": 0.20854401588439941, "memory(GiB)": 153.57, "step": 6905, "token_acc": 0.9167567567567567, "train_speed(iter/s)": 0.314491 }, { "epoch": 2.6333841463414633, "grad_norm": 1.2680598497390747, "learning_rate": 4.5814517096367473e-05, "loss": 0.16286520957946776, "memory(GiB)": 153.57, "step": 6910, "token_acc": 0.9256678281068524, "train_speed(iter/s)": 0.314444 }, { "epoch": 2.6352896341463414, "grad_norm": 1.183451771736145, "learning_rate": 4.575486755224183e-05, "loss": 0.17511956691741942, "memory(GiB)": 153.57, "step": 6915, "token_acc": 0.9338761411300271, "train_speed(iter/s)": 0.314461 }, { "epoch": 2.6371951219512195, "grad_norm": 1.2318793535232544, "learning_rate": 4.569522409315392e-05, "loss": 0.13446773290634156, "memory(GiB)": 153.57, "step": 6920, "token_acc": 0.9407779171894605, "train_speed(iter/s)": 0.31448 }, { "epoch": 2.6391006097560976, "grad_norm": 1.1523199081420898, "learning_rate": 4.56355868045976e-05, "loss": 0.1895682692527771, "memory(GiB)": 153.57, "step": 6925, "token_acc": 0.9250481695568401, "train_speed(iter/s)": 0.314474 }, { "epoch": 2.6410060975609757, "grad_norm": 1.2355948686599731, "learning_rate": 4.557595577205789e-05, "loss": 0.18560464382171632, "memory(GiB)": 153.57, "step": 6930, "token_acc": 0.9166326113608501, "train_speed(iter/s)": 0.314506 }, { "epoch": 2.642911585365854, "grad_norm": 0.7987366914749146, "learning_rate": 4.551633108101081e-05, "loss": 0.1293783664703369, "memory(GiB)": 153.57, "step": 6935, "token_acc": 0.935430881981299, "train_speed(iter/s)": 0.314441 }, { "epoch": 2.6448170731707314, "grad_norm": 1.4873278141021729, "learning_rate": 4.5456712816923305e-05, "loss": 0.18167794942855836, "memory(GiB)": 153.57, "step": 6940, "token_acc": 0.9283679283679284, "train_speed(iter/s)": 0.31447 }, { "epoch": 2.6467225609756095, "grad_norm": 1.858670711517334, "learning_rate": 4.539710106525311e-05, "loss": 0.18079544305801393, "memory(GiB)": 153.57, "step": 6945, "token_acc": 0.9327223495105187, "train_speed(iter/s)": 0.314489 }, { "epoch": 2.6486280487804876, "grad_norm": 1.1694444417953491, "learning_rate": 4.53374959114486e-05, "loss": 0.1818556785583496, "memory(GiB)": 153.57, "step": 6950, "token_acc": 0.923337091319053, "train_speed(iter/s)": 0.314518 }, { "epoch": 2.6505335365853657, "grad_norm": 1.1472736597061157, "learning_rate": 4.5277897440948744e-05, "loss": 0.15771857500076295, "memory(GiB)": 153.57, "step": 6955, "token_acc": 0.9367068527918782, "train_speed(iter/s)": 0.314452 }, { "epoch": 2.652439024390244, "grad_norm": 0.7647499442100525, "learning_rate": 4.521830573918289e-05, "loss": 0.17210379838943482, "memory(GiB)": 153.57, "step": 6960, "token_acc": 0.9337385117106434, "train_speed(iter/s)": 0.314451 }, { "epoch": 2.654344512195122, "grad_norm": 1.5355744361877441, "learning_rate": 4.515872089157069e-05, "loss": 0.18563047647476197, "memory(GiB)": 153.57, "step": 6965, "token_acc": 0.9239204934886909, "train_speed(iter/s)": 0.314475 }, { "epoch": 2.65625, "grad_norm": 0.9570334553718567, "learning_rate": 4.509914298352197e-05, "loss": 0.13044718503952027, "memory(GiB)": 153.57, "step": 6970, "token_acc": 0.9349858664927158, "train_speed(iter/s)": 0.314477 }, { "epoch": 2.658155487804878, "grad_norm": 1.646928310394287, "learning_rate": 4.5039572100436625e-05, "loss": 0.19158463478088378, "memory(GiB)": 153.57, "step": 6975, "token_acc": 0.9205479452054794, "train_speed(iter/s)": 0.314508 }, { "epoch": 2.660060975609756, "grad_norm": 1.2216600179672241, "learning_rate": 4.498000832770448e-05, "loss": 0.11161381006240845, "memory(GiB)": 153.57, "step": 6980, "token_acc": 0.935153045124645, "train_speed(iter/s)": 0.314522 }, { "epoch": 2.6619664634146343, "grad_norm": 0.9410066604614258, "learning_rate": 4.492045175070513e-05, "loss": 0.13566031455993652, "memory(GiB)": 153.57, "step": 6985, "token_acc": 0.9417204676321759, "train_speed(iter/s)": 0.314512 }, { "epoch": 2.6638719512195124, "grad_norm": 1.2411916255950928, "learning_rate": 4.4860902454807905e-05, "loss": 0.1408017635345459, "memory(GiB)": 153.57, "step": 6990, "token_acc": 0.9282945736434108, "train_speed(iter/s)": 0.314551 }, { "epoch": 2.6657774390243905, "grad_norm": 2.3111960887908936, "learning_rate": 4.4801360525371655e-05, "loss": 0.14887540340423583, "memory(GiB)": 153.57, "step": 6995, "token_acc": 0.9353115727002967, "train_speed(iter/s)": 0.314565 }, { "epoch": 2.6676829268292686, "grad_norm": 0.9950553774833679, "learning_rate": 4.474182604774471e-05, "loss": 0.16023145914077758, "memory(GiB)": 153.57, "step": 7000, "token_acc": 0.9282296650717703, "train_speed(iter/s)": 0.314569 }, { "epoch": 2.669588414634146, "grad_norm": 1.0388141870498657, "learning_rate": 4.46822991072647e-05, "loss": 0.12380025386810303, "memory(GiB)": 153.57, "step": 7005, "token_acc": 0.926232333678042, "train_speed(iter/s)": 0.314592 }, { "epoch": 2.6714939024390243, "grad_norm": 1.3137551546096802, "learning_rate": 4.462277978925845e-05, "loss": 0.14984117746353148, "memory(GiB)": 153.57, "step": 7010, "token_acc": 0.9385039008719596, "train_speed(iter/s)": 0.314604 }, { "epoch": 2.6733993902439024, "grad_norm": 1.0683070421218872, "learning_rate": 4.456326817904185e-05, "loss": 0.15345040559768677, "memory(GiB)": 153.57, "step": 7015, "token_acc": 0.9413965087281796, "train_speed(iter/s)": 0.31458 }, { "epoch": 2.6753048780487805, "grad_norm": 1.4604713916778564, "learning_rate": 4.450376436191975e-05, "loss": 0.16747281551361085, "memory(GiB)": 153.57, "step": 7020, "token_acc": 0.9239476476830563, "train_speed(iter/s)": 0.314614 }, { "epoch": 2.6772103658536586, "grad_norm": 1.0981918573379517, "learning_rate": 4.4444268423185856e-05, "loss": 0.18234869241714477, "memory(GiB)": 153.57, "step": 7025, "token_acc": 0.9254531529231554, "train_speed(iter/s)": 0.314629 }, { "epoch": 2.6791158536585367, "grad_norm": 1.6828523874282837, "learning_rate": 4.4384780448122545e-05, "loss": 0.1379164457321167, "memory(GiB)": 153.57, "step": 7030, "token_acc": 0.9385902031063321, "train_speed(iter/s)": 0.314651 }, { "epoch": 2.6810213414634148, "grad_norm": 1.7768827676773071, "learning_rate": 4.4325300522000765e-05, "loss": 0.20560660362243652, "memory(GiB)": 153.57, "step": 7035, "token_acc": 0.9137805764107532, "train_speed(iter/s)": 0.314681 }, { "epoch": 2.682926829268293, "grad_norm": 1.6554949283599854, "learning_rate": 4.4265828730079987e-05, "loss": 0.15197210311889647, "memory(GiB)": 153.57, "step": 7040, "token_acc": 0.932955832389581, "train_speed(iter/s)": 0.314597 }, { "epoch": 2.6848323170731705, "grad_norm": 1.219303011894226, "learning_rate": 4.4206365157607964e-05, "loss": 0.14802231788635253, "memory(GiB)": 153.57, "step": 7045, "token_acc": 0.9307598039215687, "train_speed(iter/s)": 0.314554 }, { "epoch": 2.6867378048780486, "grad_norm": 1.0217152833938599, "learning_rate": 4.41469098898207e-05, "loss": 0.13380454778671264, "memory(GiB)": 153.57, "step": 7050, "token_acc": 0.9367567567567567, "train_speed(iter/s)": 0.314576 }, { "epoch": 2.6886432926829267, "grad_norm": 1.2225536108016968, "learning_rate": 4.408746301194229e-05, "loss": 0.1970059633255005, "memory(GiB)": 153.57, "step": 7055, "token_acc": 0.9199379019738301, "train_speed(iter/s)": 0.314574 }, { "epoch": 2.6905487804878048, "grad_norm": 2.032140016555786, "learning_rate": 4.402802460918478e-05, "loss": 0.17165122032165528, "memory(GiB)": 153.57, "step": 7060, "token_acc": 0.936515871032242, "train_speed(iter/s)": 0.314577 }, { "epoch": 2.692454268292683, "grad_norm": 1.3220142126083374, "learning_rate": 4.396859476674809e-05, "loss": 0.10411732196807862, "memory(GiB)": 153.57, "step": 7065, "token_acc": 0.9559228650137741, "train_speed(iter/s)": 0.3146 }, { "epoch": 2.694359756097561, "grad_norm": 1.2679332494735718, "learning_rate": 4.3909173569819876e-05, "loss": 0.17677252292633056, "memory(GiB)": 153.57, "step": 7070, "token_acc": 0.931899641577061, "train_speed(iter/s)": 0.314619 }, { "epoch": 2.696265243902439, "grad_norm": 1.249998927116394, "learning_rate": 4.384976110357537e-05, "loss": 0.1580281138420105, "memory(GiB)": 153.57, "step": 7075, "token_acc": 0.9338087809226655, "train_speed(iter/s)": 0.314635 }, { "epoch": 2.698170731707317, "grad_norm": 0.8649988174438477, "learning_rate": 4.379035745317735e-05, "loss": 0.169502854347229, "memory(GiB)": 153.57, "step": 7080, "token_acc": 0.9321266968325792, "train_speed(iter/s)": 0.31464 }, { "epoch": 2.7000762195121952, "grad_norm": 1.4559855461120605, "learning_rate": 4.373096270377587e-05, "loss": 0.15721224546432494, "memory(GiB)": 153.57, "step": 7085, "token_acc": 0.9306794783802333, "train_speed(iter/s)": 0.314653 }, { "epoch": 2.7019817073170733, "grad_norm": 1.194366693496704, "learning_rate": 4.36715769405083e-05, "loss": 0.11152516603469849, "memory(GiB)": 153.57, "step": 7090, "token_acc": 0.9558761137038608, "train_speed(iter/s)": 0.314658 }, { "epoch": 2.7038871951219514, "grad_norm": 0.7755956053733826, "learning_rate": 4.36122002484991e-05, "loss": 0.14999276399612427, "memory(GiB)": 153.57, "step": 7095, "token_acc": 0.9469803921568627, "train_speed(iter/s)": 0.314662 }, { "epoch": 2.7057926829268295, "grad_norm": 0.8938814997673035, "learning_rate": 4.355283271285971e-05, "loss": 0.15678343772888184, "memory(GiB)": 153.57, "step": 7100, "token_acc": 0.9420289855072463, "train_speed(iter/s)": 0.314673 }, { "epoch": 2.707698170731707, "grad_norm": 1.3226003646850586, "learning_rate": 4.3493474418688476e-05, "loss": 0.13260698318481445, "memory(GiB)": 153.57, "step": 7105, "token_acc": 0.9416101445001475, "train_speed(iter/s)": 0.3147 }, { "epoch": 2.7096036585365852, "grad_norm": 1.1618448495864868, "learning_rate": 4.3434125451070476e-05, "loss": 0.18488819599151612, "memory(GiB)": 153.57, "step": 7110, "token_acc": 0.9180145008365868, "train_speed(iter/s)": 0.314726 }, { "epoch": 2.7115091463414633, "grad_norm": 1.380149006843567, "learning_rate": 4.337478589507743e-05, "loss": 0.18362523317337037, "memory(GiB)": 153.57, "step": 7115, "token_acc": 0.9288718929254303, "train_speed(iter/s)": 0.314751 }, { "epoch": 2.7134146341463414, "grad_norm": 1.2204469442367554, "learning_rate": 4.331545583576758e-05, "loss": 0.19410176277160646, "memory(GiB)": 153.57, "step": 7120, "token_acc": 0.9261061946902654, "train_speed(iter/s)": 0.314755 }, { "epoch": 2.7153201219512195, "grad_norm": 1.2112582921981812, "learning_rate": 4.3256135358185524e-05, "loss": 0.18951821327209473, "memory(GiB)": 153.57, "step": 7125, "token_acc": 0.9263927401058735, "train_speed(iter/s)": 0.314773 }, { "epoch": 2.7172256097560976, "grad_norm": 0.8810627460479736, "learning_rate": 4.319682454736215e-05, "loss": 0.15154634714126586, "memory(GiB)": 153.57, "step": 7130, "token_acc": 0.9439404512677367, "train_speed(iter/s)": 0.314778 }, { "epoch": 2.7191310975609757, "grad_norm": 1.4711806774139404, "learning_rate": 4.313752348831448e-05, "loss": 0.16733913421630858, "memory(GiB)": 153.57, "step": 7135, "token_acc": 0.9225599064874342, "train_speed(iter/s)": 0.314817 }, { "epoch": 2.721036585365854, "grad_norm": 1.4376423358917236, "learning_rate": 4.3078232266045545e-05, "loss": 0.16681867837905884, "memory(GiB)": 153.57, "step": 7140, "token_acc": 0.9295743258301761, "train_speed(iter/s)": 0.314829 }, { "epoch": 2.7229420731707314, "grad_norm": 1.9863330125808716, "learning_rate": 4.3018950965544314e-05, "loss": 0.1730729818344116, "memory(GiB)": 153.57, "step": 7145, "token_acc": 0.93228582117471, "train_speed(iter/s)": 0.314866 }, { "epoch": 2.7248475609756095, "grad_norm": 1.8889150619506836, "learning_rate": 4.295967967178549e-05, "loss": 0.12622672319412231, "memory(GiB)": 153.57, "step": 7150, "token_acc": 0.9401273885350319, "train_speed(iter/s)": 0.314903 }, { "epoch": 2.7267530487804876, "grad_norm": 1.0058319568634033, "learning_rate": 4.290041846972943e-05, "loss": 0.164303195476532, "memory(GiB)": 153.57, "step": 7155, "token_acc": 0.9250448833034112, "train_speed(iter/s)": 0.31491 }, { "epoch": 2.7286585365853657, "grad_norm": 0.9056001305580139, "learning_rate": 4.28411674443221e-05, "loss": 0.1545015335083008, "memory(GiB)": 153.57, "step": 7160, "token_acc": 0.9345700676861369, "train_speed(iter/s)": 0.314715 }, { "epoch": 2.730564024390244, "grad_norm": 1.6292096376419067, "learning_rate": 4.27819266804948e-05, "loss": 0.15597670078277587, "memory(GiB)": 153.57, "step": 7165, "token_acc": 0.9387707948243993, "train_speed(iter/s)": 0.314729 }, { "epoch": 2.732469512195122, "grad_norm": 1.5703963041305542, "learning_rate": 4.2722696263164144e-05, "loss": 0.14474085569381714, "memory(GiB)": 153.57, "step": 7170, "token_acc": 0.9372419488026424, "train_speed(iter/s)": 0.31471 }, { "epoch": 2.734375, "grad_norm": 2.2594075202941895, "learning_rate": 4.2663476277231915e-05, "loss": 0.1805231213569641, "memory(GiB)": 153.57, "step": 7175, "token_acc": 0.9272681829542614, "train_speed(iter/s)": 0.314678 }, { "epoch": 2.736280487804878, "grad_norm": 1.6737638711929321, "learning_rate": 4.2604266807584964e-05, "loss": 0.14992369413375856, "memory(GiB)": 153.57, "step": 7180, "token_acc": 0.935969312481558, "train_speed(iter/s)": 0.3147 }, { "epoch": 2.738185975609756, "grad_norm": 2.0870018005371094, "learning_rate": 4.254506793909503e-05, "loss": 0.21683409214019775, "memory(GiB)": 153.57, "step": 7185, "token_acc": 0.9183531053733427, "train_speed(iter/s)": 0.314721 }, { "epoch": 2.7400914634146343, "grad_norm": 0.9134271740913391, "learning_rate": 4.248587975661869e-05, "loss": 0.1480140805244446, "memory(GiB)": 153.57, "step": 7190, "token_acc": 0.9364982755122743, "train_speed(iter/s)": 0.314643 }, { "epoch": 2.7419969512195124, "grad_norm": 0.9415486454963684, "learning_rate": 4.2426702344997174e-05, "loss": 0.14555835723876953, "memory(GiB)": 153.57, "step": 7195, "token_acc": 0.9246658566221142, "train_speed(iter/s)": 0.314669 }, { "epoch": 2.7439024390243905, "grad_norm": 1.3031607866287231, "learning_rate": 4.236753578905627e-05, "loss": 0.14663712978363036, "memory(GiB)": 153.57, "step": 7200, "token_acc": 0.9365899483325505, "train_speed(iter/s)": 0.314719 }, { "epoch": 2.7439024390243905, "eval_loss": 0.1665453463792801, "eval_runtime": 33.0991, "eval_samples_per_second": 3.203, "eval_steps_per_second": 3.203, "eval_token_acc": 0.9133517726929846, "step": 7200 }, { "epoch": 2.7458079268292686, "grad_norm": 0.8690086603164673, "learning_rate": 4.230838017360629e-05, "loss": 0.13035722970962524, "memory(GiB)": 153.57, "step": 7205, "token_acc": 0.9188051485469116, "train_speed(iter/s)": 0.314291 }, { "epoch": 2.747713414634146, "grad_norm": 2.1077301502227783, "learning_rate": 4.224923558344175e-05, "loss": 0.17907872200012206, "memory(GiB)": 153.57, "step": 7210, "token_acc": 0.9291615035109458, "train_speed(iter/s)": 0.314287 }, { "epoch": 2.7496189024390243, "grad_norm": 2.0094563961029053, "learning_rate": 4.2190102103341414e-05, "loss": 0.1625037431716919, "memory(GiB)": 153.57, "step": 7215, "token_acc": 0.9326353220071119, "train_speed(iter/s)": 0.314292 }, { "epoch": 2.7515243902439024, "grad_norm": 0.7605774998664856, "learning_rate": 4.213097981806813e-05, "loss": 0.19386054277420045, "memory(GiB)": 153.57, "step": 7220, "token_acc": 0.9130854659584742, "train_speed(iter/s)": 0.314315 }, { "epoch": 2.7534298780487805, "grad_norm": 0.8533841371536255, "learning_rate": 4.2071868812368665e-05, "loss": 0.1674828052520752, "memory(GiB)": 153.57, "step": 7225, "token_acc": 0.9279527559055119, "train_speed(iter/s)": 0.31435 }, { "epoch": 2.7553353658536586, "grad_norm": 1.084843397140503, "learning_rate": 4.201276917097366e-05, "loss": 0.22305819988250733, "memory(GiB)": 153.57, "step": 7230, "token_acc": 0.911504424778761, "train_speed(iter/s)": 0.31433 }, { "epoch": 2.7572408536585367, "grad_norm": 1.2463616132736206, "learning_rate": 4.1953680978597435e-05, "loss": 0.16389210224151612, "memory(GiB)": 153.57, "step": 7235, "token_acc": 0.9293942772203642, "train_speed(iter/s)": 0.314344 }, { "epoch": 2.7591463414634148, "grad_norm": 0.8475595712661743, "learning_rate": 4.189460431993788e-05, "loss": 0.13962726593017577, "memory(GiB)": 153.57, "step": 7240, "token_acc": 0.9379611915823995, "train_speed(iter/s)": 0.314361 }, { "epoch": 2.761051829268293, "grad_norm": 1.1124588251113892, "learning_rate": 4.1835539279676425e-05, "loss": 0.19056724309921264, "memory(GiB)": 153.57, "step": 7245, "token_acc": 0.915682569674067, "train_speed(iter/s)": 0.314393 }, { "epoch": 2.7629573170731705, "grad_norm": 2.856384038925171, "learning_rate": 4.177648594247779e-05, "loss": 0.14484221935272218, "memory(GiB)": 153.57, "step": 7250, "token_acc": 0.9296722378988496, "train_speed(iter/s)": 0.314415 }, { "epoch": 2.7648628048780486, "grad_norm": 2.0789289474487305, "learning_rate": 4.1717444392989937e-05, "loss": 0.1572974443435669, "memory(GiB)": 153.57, "step": 7255, "token_acc": 0.9348058425365159, "train_speed(iter/s)": 0.314445 }, { "epoch": 2.7667682926829267, "grad_norm": 0.9140135049819946, "learning_rate": 4.16584147158439e-05, "loss": 0.15907434225082398, "memory(GiB)": 153.57, "step": 7260, "token_acc": 0.935379241516966, "train_speed(iter/s)": 0.314458 }, { "epoch": 2.7686737804878048, "grad_norm": 1.2592531442642212, "learning_rate": 4.1599396995653744e-05, "loss": 0.15318598747253417, "memory(GiB)": 153.57, "step": 7265, "token_acc": 0.9338989739542226, "train_speed(iter/s)": 0.314471 }, { "epoch": 2.770579268292683, "grad_norm": 1.8911926746368408, "learning_rate": 4.154039131701636e-05, "loss": 0.18040133714675904, "memory(GiB)": 153.57, "step": 7270, "token_acc": 0.9261992619926199, "train_speed(iter/s)": 0.314518 }, { "epoch": 2.772484756097561, "grad_norm": 1.5189930200576782, "learning_rate": 4.148139776451139e-05, "loss": 0.16149307489395143, "memory(GiB)": 153.57, "step": 7275, "token_acc": 0.9358226371061844, "train_speed(iter/s)": 0.314547 }, { "epoch": 2.774390243902439, "grad_norm": 1.1768279075622559, "learning_rate": 4.142241642270108e-05, "loss": 0.1690164566040039, "memory(GiB)": 153.57, "step": 7280, "token_acc": 0.9319497106365995, "train_speed(iter/s)": 0.314561 }, { "epoch": 2.776295731707317, "grad_norm": 1.514961838722229, "learning_rate": 4.1363447376130225e-05, "loss": 0.12042449712753296, "memory(GiB)": 153.57, "step": 7285, "token_acc": 0.9484193011647255, "train_speed(iter/s)": 0.31458 }, { "epoch": 2.7782012195121952, "grad_norm": 1.5738859176635742, "learning_rate": 4.130449070932594e-05, "loss": 0.12346988916397095, "memory(GiB)": 153.57, "step": 7290, "token_acc": 0.9405320813771518, "train_speed(iter/s)": 0.314558 }, { "epoch": 2.7801067073170733, "grad_norm": 1.342018961906433, "learning_rate": 4.12455465067976e-05, "loss": 0.14132895469665527, "memory(GiB)": 153.57, "step": 7295, "token_acc": 0.9396411092985318, "train_speed(iter/s)": 0.314543 }, { "epoch": 2.7820121951219514, "grad_norm": 1.9450713396072388, "learning_rate": 4.1186614853036745e-05, "loss": 0.19587392807006837, "memory(GiB)": 153.57, "step": 7300, "token_acc": 0.9163139042657115, "train_speed(iter/s)": 0.314549 }, { "epoch": 2.7839176829268295, "grad_norm": 0.6451703906059265, "learning_rate": 4.112769583251689e-05, "loss": 0.1489277482032776, "memory(GiB)": 153.57, "step": 7305, "token_acc": 0.9343702579666161, "train_speed(iter/s)": 0.314519 }, { "epoch": 2.785823170731707, "grad_norm": 1.0092781782150269, "learning_rate": 4.106878952969348e-05, "loss": 0.2323472261428833, "memory(GiB)": 153.57, "step": 7310, "token_acc": 0.9014641867827463, "train_speed(iter/s)": 0.31451 }, { "epoch": 2.7877286585365852, "grad_norm": 1.0317084789276123, "learning_rate": 4.100989602900369e-05, "loss": 0.12234753370285034, "memory(GiB)": 153.57, "step": 7315, "token_acc": 0.9365787432117921, "train_speed(iter/s)": 0.314527 }, { "epoch": 2.7896341463414633, "grad_norm": 1.2221993207931519, "learning_rate": 4.095101541486636e-05, "loss": 0.16872717142105104, "memory(GiB)": 153.57, "step": 7320, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 0.314528 }, { "epoch": 2.7915396341463414, "grad_norm": 1.6893329620361328, "learning_rate": 4.089214777168188e-05, "loss": 0.17382255792617798, "memory(GiB)": 153.57, "step": 7325, "token_acc": 0.9280794165316045, "train_speed(iter/s)": 0.314551 }, { "epoch": 2.7934451219512195, "grad_norm": 1.1489046812057495, "learning_rate": 4.083329318383204e-05, "loss": 0.21926159858703614, "memory(GiB)": 153.57, "step": 7330, "token_acc": 0.9218048665249232, "train_speed(iter/s)": 0.314573 }, { "epoch": 2.7953506097560976, "grad_norm": 2.229832172393799, "learning_rate": 4.077445173567989e-05, "loss": 0.16113659143447875, "memory(GiB)": 153.57, "step": 7335, "token_acc": 0.9312977099236641, "train_speed(iter/s)": 0.31454 }, { "epoch": 2.7972560975609757, "grad_norm": 1.3202130794525146, "learning_rate": 4.071562351156966e-05, "loss": 0.17560791969299316, "memory(GiB)": 153.57, "step": 7340, "token_acc": 0.9371889073240104, "train_speed(iter/s)": 0.314567 }, { "epoch": 2.799161585365854, "grad_norm": 1.249165654182434, "learning_rate": 4.0656808595826626e-05, "loss": 0.17677336931228638, "memory(GiB)": 153.57, "step": 7345, "token_acc": 0.9309280797531451, "train_speed(iter/s)": 0.314591 }, { "epoch": 2.8010670731707314, "grad_norm": 1.3151508569717407, "learning_rate": 4.0598007072756985e-05, "loss": 0.1688065528869629, "memory(GiB)": 153.57, "step": 7350, "token_acc": 0.9303912647861693, "train_speed(iter/s)": 0.314595 }, { "epoch": 2.8029725609756095, "grad_norm": 1.1549469232559204, "learning_rate": 4.053921902664775e-05, "loss": 0.17139875888824463, "memory(GiB)": 153.57, "step": 7355, "token_acc": 0.9269375971574506, "train_speed(iter/s)": 0.314559 }, { "epoch": 2.8048780487804876, "grad_norm": 0.9003484845161438, "learning_rate": 4.0480444541766576e-05, "loss": 0.14883521795272828, "memory(GiB)": 153.57, "step": 7360, "token_acc": 0.9305895802711089, "train_speed(iter/s)": 0.314571 }, { "epoch": 2.8067835365853657, "grad_norm": 0.8212793469429016, "learning_rate": 4.042168370236176e-05, "loss": 0.1733539342880249, "memory(GiB)": 153.57, "step": 7365, "token_acc": 0.9304568527918782, "train_speed(iter/s)": 0.314595 }, { "epoch": 2.808689024390244, "grad_norm": 1.4888135194778442, "learning_rate": 4.036293659266195e-05, "loss": 0.18187780380249025, "memory(GiB)": 153.57, "step": 7370, "token_acc": 0.9272363150867824, "train_speed(iter/s)": 0.314621 }, { "epoch": 2.810594512195122, "grad_norm": 1.3787230253219604, "learning_rate": 4.030420329687616e-05, "loss": 0.15127842426300048, "memory(GiB)": 153.57, "step": 7375, "token_acc": 0.9296418592837186, "train_speed(iter/s)": 0.314591 }, { "epoch": 2.8125, "grad_norm": 1.018067479133606, "learning_rate": 4.0245483899193595e-05, "loss": 0.19577391147613527, "memory(GiB)": 153.57, "step": 7380, "token_acc": 0.9299044151115157, "train_speed(iter/s)": 0.314593 }, { "epoch": 2.814405487804878, "grad_norm": 1.1737974882125854, "learning_rate": 4.018677848378353e-05, "loss": 0.15100626945495604, "memory(GiB)": 153.57, "step": 7385, "token_acc": 0.9254617414248021, "train_speed(iter/s)": 0.314619 }, { "epoch": 2.816310975609756, "grad_norm": 1.238153100013733, "learning_rate": 4.012808713479522e-05, "loss": 0.17380933761596679, "memory(GiB)": 153.57, "step": 7390, "token_acc": 0.9288604180714768, "train_speed(iter/s)": 0.314615 }, { "epoch": 2.8182164634146343, "grad_norm": 1.4776312112808228, "learning_rate": 4.006940993635773e-05, "loss": 0.20915813446044923, "memory(GiB)": 153.57, "step": 7395, "token_acc": 0.9175930401159981, "train_speed(iter/s)": 0.314576 }, { "epoch": 2.8201219512195124, "grad_norm": 1.3014750480651855, "learning_rate": 4.001074697257986e-05, "loss": 0.11660041809082031, "memory(GiB)": 153.57, "step": 7400, "token_acc": 0.9598420013166556, "train_speed(iter/s)": 0.314606 }, { "epoch": 2.8220274390243905, "grad_norm": 1.3238792419433594, "learning_rate": 3.9952098327549966e-05, "loss": 0.15173965692520142, "memory(GiB)": 153.57, "step": 7405, "token_acc": 0.9439119630812921, "train_speed(iter/s)": 0.314654 }, { "epoch": 2.8239329268292686, "grad_norm": 1.1178054809570312, "learning_rate": 3.989346408533597e-05, "loss": 0.15121079683303834, "memory(GiB)": 153.57, "step": 7410, "token_acc": 0.9345845983991168, "train_speed(iter/s)": 0.314673 }, { "epoch": 2.825838414634146, "grad_norm": 0.8714100122451782, "learning_rate": 3.983484432998506e-05, "loss": 0.13340766429901124, "memory(GiB)": 153.57, "step": 7415, "token_acc": 0.9431016363909323, "train_speed(iter/s)": 0.31462 }, { "epoch": 2.8277439024390243, "grad_norm": 1.3918266296386719, "learning_rate": 3.977623914552369e-05, "loss": 0.13516165018081666, "memory(GiB)": 153.57, "step": 7420, "token_acc": 0.9389612427428212, "train_speed(iter/s)": 0.314619 }, { "epoch": 2.8296493902439024, "grad_norm": 1.266531229019165, "learning_rate": 3.971764861595744e-05, "loss": 0.16950336694717408, "memory(GiB)": 153.57, "step": 7425, "token_acc": 0.9293929712460064, "train_speed(iter/s)": 0.314544 }, { "epoch": 2.8315548780487805, "grad_norm": 1.3258169889450073, "learning_rate": 3.9659072825270846e-05, "loss": 0.11140908002853393, "memory(GiB)": 153.57, "step": 7430, "token_acc": 0.9434961742201294, "train_speed(iter/s)": 0.314581 }, { "epoch": 2.8334603658536586, "grad_norm": 1.2797867059707642, "learning_rate": 3.960051185742737e-05, "loss": 0.1800255537033081, "memory(GiB)": 153.57, "step": 7435, "token_acc": 0.9119947848761408, "train_speed(iter/s)": 0.314616 }, { "epoch": 2.8353658536585367, "grad_norm": 1.4733078479766846, "learning_rate": 3.954196579636918e-05, "loss": 0.19529283046722412, "memory(GiB)": 153.57, "step": 7440, "token_acc": 0.9154954634406689, "train_speed(iter/s)": 0.314628 }, { "epoch": 2.8372713414634148, "grad_norm": 1.3407217264175415, "learning_rate": 3.9483434726017074e-05, "loss": 0.13885093927383424, "memory(GiB)": 153.57, "step": 7445, "token_acc": 0.9474264705882353, "train_speed(iter/s)": 0.314674 }, { "epoch": 2.839176829268293, "grad_norm": 1.1785411834716797, "learning_rate": 3.942491873027043e-05, "loss": 0.21287643909454346, "memory(GiB)": 153.57, "step": 7450, "token_acc": 0.9140845070422535, "train_speed(iter/s)": 0.314696 }, { "epoch": 2.8410823170731705, "grad_norm": 1.1173062324523926, "learning_rate": 3.936641789300696e-05, "loss": 0.18316370248794556, "memory(GiB)": 153.57, "step": 7455, "token_acc": 0.9175384615384615, "train_speed(iter/s)": 0.314717 }, { "epoch": 2.8429878048780486, "grad_norm": 0.8774563670158386, "learning_rate": 3.930793229808264e-05, "loss": 0.1284467339515686, "memory(GiB)": 153.57, "step": 7460, "token_acc": 0.941837076768354, "train_speed(iter/s)": 0.314725 }, { "epoch": 2.8448932926829267, "grad_norm": 1.222671389579773, "learning_rate": 3.9249462029331626e-05, "loss": 0.14555695056915283, "memory(GiB)": 153.57, "step": 7465, "token_acc": 0.9295621430514006, "train_speed(iter/s)": 0.314738 }, { "epoch": 2.8467987804878048, "grad_norm": 1.1765981912612915, "learning_rate": 3.91910071705661e-05, "loss": 0.17329647541046142, "memory(GiB)": 153.57, "step": 7470, "token_acc": 0.9274685816876123, "train_speed(iter/s)": 0.314755 }, { "epoch": 2.848704268292683, "grad_norm": 0.732787549495697, "learning_rate": 3.9132567805576134e-05, "loss": 0.16469340324401854, "memory(GiB)": 153.57, "step": 7475, "token_acc": 0.9281275330991624, "train_speed(iter/s)": 0.314769 }, { "epoch": 2.850609756097561, "grad_norm": 0.8532246947288513, "learning_rate": 3.907414401812963e-05, "loss": 0.15486416816711426, "memory(GiB)": 153.57, "step": 7480, "token_acc": 0.9376371043560012, "train_speed(iter/s)": 0.314771 }, { "epoch": 2.852515243902439, "grad_norm": 1.2615472078323364, "learning_rate": 3.901573589197209e-05, "loss": 0.12922234535217286, "memory(GiB)": 153.57, "step": 7485, "token_acc": 0.9379892871858261, "train_speed(iter/s)": 0.314774 }, { "epoch": 2.854420731707317, "grad_norm": 1.3806835412979126, "learning_rate": 3.895734351082668e-05, "loss": 0.15670950412750245, "memory(GiB)": 153.57, "step": 7490, "token_acc": 0.9375294672324376, "train_speed(iter/s)": 0.314793 }, { "epoch": 2.8563262195121952, "grad_norm": 0.7569798827171326, "learning_rate": 3.88989669583939e-05, "loss": 0.1429172158241272, "memory(GiB)": 153.57, "step": 7495, "token_acc": 0.9374204497242257, "train_speed(iter/s)": 0.314791 }, { "epoch": 2.8582317073170733, "grad_norm": 1.3138107061386108, "learning_rate": 3.884060631835159e-05, "loss": 0.1586222767829895, "memory(GiB)": 153.57, "step": 7500, "token_acc": 0.9217687074829932, "train_speed(iter/s)": 0.314816 }, { "epoch": 2.8601371951219514, "grad_norm": 1.1436485052108765, "learning_rate": 3.8782261674354794e-05, "loss": 0.15196173191070556, "memory(GiB)": 153.57, "step": 7505, "token_acc": 0.9220667384284177, "train_speed(iter/s)": 0.314826 }, { "epoch": 2.8620426829268295, "grad_norm": 1.3556512594223022, "learning_rate": 3.872393311003561e-05, "loss": 0.17479562759399414, "memory(GiB)": 153.57, "step": 7510, "token_acc": 0.9305242843357993, "train_speed(iter/s)": 0.31484 }, { "epoch": 2.863948170731707, "grad_norm": 1.2866015434265137, "learning_rate": 3.8665620709003093e-05, "loss": 0.14903578758239747, "memory(GiB)": 153.57, "step": 7515, "token_acc": 0.9310092637737689, "train_speed(iter/s)": 0.314864 }, { "epoch": 2.8658536585365852, "grad_norm": 0.917425274848938, "learning_rate": 3.8607324554843136e-05, "loss": 0.13235096931457518, "memory(GiB)": 153.57, "step": 7520, "token_acc": 0.9468188779061674, "train_speed(iter/s)": 0.314863 }, { "epoch": 2.8677591463414633, "grad_norm": 1.4277573823928833, "learning_rate": 3.8549044731118305e-05, "loss": 0.16970160007476806, "memory(GiB)": 153.57, "step": 7525, "token_acc": 0.9259576901086335, "train_speed(iter/s)": 0.314872 }, { "epoch": 2.8696646341463414, "grad_norm": 0.885715126991272, "learning_rate": 3.8490781321367846e-05, "loss": 0.1304342269897461, "memory(GiB)": 153.57, "step": 7530, "token_acc": 0.9352913085004776, "train_speed(iter/s)": 0.31489 }, { "epoch": 2.8715701219512195, "grad_norm": 1.025346040725708, "learning_rate": 3.843253440910739e-05, "loss": 0.1698522686958313, "memory(GiB)": 153.57, "step": 7535, "token_acc": 0.9321678321678322, "train_speed(iter/s)": 0.314862 }, { "epoch": 2.8734756097560976, "grad_norm": 1.3589026927947998, "learning_rate": 3.837430407782896e-05, "loss": 0.18797721862792968, "memory(GiB)": 153.57, "step": 7540, "token_acc": 0.9212735166425471, "train_speed(iter/s)": 0.314889 }, { "epoch": 2.8753810975609757, "grad_norm": 1.3008321523666382, "learning_rate": 3.831609041100079e-05, "loss": 0.19043108224868774, "memory(GiB)": 153.57, "step": 7545, "token_acc": 0.9222196424425354, "train_speed(iter/s)": 0.314853 }, { "epoch": 2.877286585365854, "grad_norm": 0.8070974946022034, "learning_rate": 3.825789349206726e-05, "loss": 0.15475962162017823, "memory(GiB)": 153.57, "step": 7550, "token_acc": 0.93600812595226, "train_speed(iter/s)": 0.314812 }, { "epoch": 2.8791920731707314, "grad_norm": 1.125431776046753, "learning_rate": 3.819971340444871e-05, "loss": 0.13982982635498048, "memory(GiB)": 153.57, "step": 7555, "token_acc": 0.9358249323825916, "train_speed(iter/s)": 0.314828 }, { "epoch": 2.8810975609756095, "grad_norm": 1.3956855535507202, "learning_rate": 3.814155023154136e-05, "loss": 0.14261378049850465, "memory(GiB)": 153.57, "step": 7560, "token_acc": 0.9325554259043174, "train_speed(iter/s)": 0.314836 }, { "epoch": 2.8830030487804876, "grad_norm": 1.4721531867980957, "learning_rate": 3.8083404056717183e-05, "loss": 0.15512707233428955, "memory(GiB)": 153.57, "step": 7565, "token_acc": 0.9350300020691082, "train_speed(iter/s)": 0.314842 }, { "epoch": 2.8849085365853657, "grad_norm": 1.462424635887146, "learning_rate": 3.802527496332384e-05, "loss": 0.15968625545501708, "memory(GiB)": 153.57, "step": 7570, "token_acc": 0.9270216962524654, "train_speed(iter/s)": 0.31485 }, { "epoch": 2.886814024390244, "grad_norm": 2.5867624282836914, "learning_rate": 3.796716303468443e-05, "loss": 0.15775299072265625, "memory(GiB)": 153.57, "step": 7575, "token_acc": 0.9403485254691689, "train_speed(iter/s)": 0.31487 }, { "epoch": 2.888719512195122, "grad_norm": 1.096974492073059, "learning_rate": 3.790906835409749e-05, "loss": 0.15516024827957153, "memory(GiB)": 153.57, "step": 7580, "token_acc": 0.9297069825436409, "train_speed(iter/s)": 0.314869 }, { "epoch": 2.890625, "grad_norm": 0.8545968532562256, "learning_rate": 3.785099100483681e-05, "loss": 0.15936063528060912, "memory(GiB)": 153.57, "step": 7585, "token_acc": 0.9309159112222569, "train_speed(iter/s)": 0.314892 }, { "epoch": 2.892530487804878, "grad_norm": 1.35421884059906, "learning_rate": 3.7792931070151364e-05, "loss": 0.11341977119445801, "memory(GiB)": 153.57, "step": 7590, "token_acc": 0.9418574059458037, "train_speed(iter/s)": 0.314915 }, { "epoch": 2.894435975609756, "grad_norm": 1.3671538829803467, "learning_rate": 3.773488863326513e-05, "loss": 0.16036083698272705, "memory(GiB)": 153.57, "step": 7595, "token_acc": 0.9290563475899525, "train_speed(iter/s)": 0.31495 }, { "epoch": 2.8963414634146343, "grad_norm": 1.182799220085144, "learning_rate": 3.7676863777377054e-05, "loss": 0.183322274684906, "memory(GiB)": 153.57, "step": 7600, "token_acc": 0.922929120409906, "train_speed(iter/s)": 0.314969 }, { "epoch": 2.8982469512195124, "grad_norm": 1.1061089038848877, "learning_rate": 3.7618856585660826e-05, "loss": 0.1948955774307251, "memory(GiB)": 153.57, "step": 7605, "token_acc": 0.9193460490463216, "train_speed(iter/s)": 0.314925 }, { "epoch": 2.9001524390243905, "grad_norm": 2.020601987838745, "learning_rate": 3.756086714126483e-05, "loss": 0.13007745742797852, "memory(GiB)": 153.57, "step": 7610, "token_acc": 0.9401431359791802, "train_speed(iter/s)": 0.314894 }, { "epoch": 2.9020579268292686, "grad_norm": 1.4242448806762695, "learning_rate": 3.750289552731208e-05, "loss": 0.15245245695114135, "memory(GiB)": 153.57, "step": 7615, "token_acc": 0.9320409496841646, "train_speed(iter/s)": 0.314916 }, { "epoch": 2.903963414634146, "grad_norm": 0.9432724714279175, "learning_rate": 3.744494182689992e-05, "loss": 0.16509283781051637, "memory(GiB)": 153.57, "step": 7620, "token_acc": 0.9316037735849056, "train_speed(iter/s)": 0.314876 }, { "epoch": 2.9058689024390243, "grad_norm": 0.8756943941116333, "learning_rate": 3.7387006123100094e-05, "loss": 0.1477197527885437, "memory(GiB)": 153.57, "step": 7625, "token_acc": 0.9304001988565747, "train_speed(iter/s)": 0.314845 }, { "epoch": 2.9077743902439024, "grad_norm": 1.7793807983398438, "learning_rate": 3.732908849895852e-05, "loss": 0.16417135000228883, "memory(GiB)": 153.57, "step": 7630, "token_acc": 0.9411159151739757, "train_speed(iter/s)": 0.3148 }, { "epoch": 2.9096798780487805, "grad_norm": 1.2425097227096558, "learning_rate": 3.72711890374952e-05, "loss": 0.143135142326355, "memory(GiB)": 153.57, "step": 7635, "token_acc": 0.9462763986661726, "train_speed(iter/s)": 0.314777 }, { "epoch": 2.9115853658536586, "grad_norm": 1.1648540496826172, "learning_rate": 3.721330782170411e-05, "loss": 0.11886788606643676, "memory(GiB)": 153.57, "step": 7640, "token_acc": 0.9454597372929754, "train_speed(iter/s)": 0.314804 }, { "epoch": 2.9134908536585367, "grad_norm": 1.250893235206604, "learning_rate": 3.715544493455308e-05, "loss": 0.14556553363800048, "memory(GiB)": 153.57, "step": 7645, "token_acc": 0.9390450389434474, "train_speed(iter/s)": 0.314831 }, { "epoch": 2.9153963414634148, "grad_norm": 1.1875083446502686, "learning_rate": 3.7097600458983636e-05, "loss": 0.12084689140319824, "memory(GiB)": 153.57, "step": 7650, "token_acc": 0.9432847577786531, "train_speed(iter/s)": 0.314866 }, { "epoch": 2.917301829268293, "grad_norm": 0.9369703531265259, "learning_rate": 3.7039774477910944e-05, "loss": 0.146729576587677, "memory(GiB)": 153.57, "step": 7655, "token_acc": 0.9385460251046025, "train_speed(iter/s)": 0.314868 }, { "epoch": 2.9192073170731705, "grad_norm": 0.9552258849143982, "learning_rate": 3.698196707422366e-05, "loss": 0.1319243550300598, "memory(GiB)": 153.57, "step": 7660, "token_acc": 0.946174142480211, "train_speed(iter/s)": 0.314911 }, { "epoch": 2.9211128048780486, "grad_norm": 1.072961688041687, "learning_rate": 3.6924178330783796e-05, "loss": 0.22068610191345214, "memory(GiB)": 153.57, "step": 7665, "token_acc": 0.9214512810922012, "train_speed(iter/s)": 0.314845 }, { "epoch": 2.9230182926829267, "grad_norm": 1.423834204673767, "learning_rate": 3.6866408330426616e-05, "loss": 0.18621882200241088, "memory(GiB)": 153.57, "step": 7670, "token_acc": 0.9243347226675216, "train_speed(iter/s)": 0.314871 }, { "epoch": 2.9249237804878048, "grad_norm": 1.2729437351226807, "learning_rate": 3.680865715596052e-05, "loss": 0.20100550651550292, "memory(GiB)": 153.57, "step": 7675, "token_acc": 0.912889935256033, "train_speed(iter/s)": 0.314842 }, { "epoch": 2.926829268292683, "grad_norm": 1.2502052783966064, "learning_rate": 3.675092489016693e-05, "loss": 0.20016698837280272, "memory(GiB)": 153.57, "step": 7680, "token_acc": 0.9157142857142857, "train_speed(iter/s)": 0.314816 }, { "epoch": 2.928734756097561, "grad_norm": 1.0773290395736694, "learning_rate": 3.669321161580014e-05, "loss": 0.1461755871772766, "memory(GiB)": 153.57, "step": 7685, "token_acc": 0.9313279240055412, "train_speed(iter/s)": 0.31482 }, { "epoch": 2.930640243902439, "grad_norm": 1.2635338306427002, "learning_rate": 3.663551741558726e-05, "loss": 0.16809768676757814, "memory(GiB)": 153.57, "step": 7690, "token_acc": 0.928035565434843, "train_speed(iter/s)": 0.314839 }, { "epoch": 2.932545731707317, "grad_norm": 0.9528648853302002, "learning_rate": 3.6577842372228035e-05, "loss": 0.17850557565689087, "memory(GiB)": 153.57, "step": 7695, "token_acc": 0.9338972431077694, "train_speed(iter/s)": 0.314841 }, { "epoch": 2.9344512195121952, "grad_norm": 1.2672041654586792, "learning_rate": 3.652018656839474e-05, "loss": 0.1723092794418335, "memory(GiB)": 153.57, "step": 7700, "token_acc": 0.9264448336252189, "train_speed(iter/s)": 0.314822 }, { "epoch": 2.9363567073170733, "grad_norm": 1.1954395771026611, "learning_rate": 3.646255008673212e-05, "loss": 0.17313584089279174, "memory(GiB)": 153.57, "step": 7705, "token_acc": 0.9323741007194245, "train_speed(iter/s)": 0.314834 }, { "epoch": 2.9382621951219514, "grad_norm": 1.0528703927993774, "learning_rate": 3.640493300985716e-05, "loss": 0.22199783325195313, "memory(GiB)": 153.57, "step": 7710, "token_acc": 0.9123274400581255, "train_speed(iter/s)": 0.31485 }, { "epoch": 2.9401676829268295, "grad_norm": 1.7556239366531372, "learning_rate": 3.6347335420359066e-05, "loss": 0.1441631317138672, "memory(GiB)": 153.57, "step": 7715, "token_acc": 0.9324324324324325, "train_speed(iter/s)": 0.314864 }, { "epoch": 2.942073170731707, "grad_norm": 1.7887780666351318, "learning_rate": 3.62897574007991e-05, "loss": 0.19523613452911376, "memory(GiB)": 153.57, "step": 7720, "token_acc": 0.9318339902619986, "train_speed(iter/s)": 0.314879 }, { "epoch": 2.9439786585365852, "grad_norm": 1.1234270334243774, "learning_rate": 3.6232199033710474e-05, "loss": 0.21367812156677246, "memory(GiB)": 153.57, "step": 7725, "token_acc": 0.9182628062360801, "train_speed(iter/s)": 0.314888 }, { "epoch": 2.9458841463414633, "grad_norm": 1.1947578191757202, "learning_rate": 3.6174660401598224e-05, "loss": 0.22943968772888185, "memory(GiB)": 153.57, "step": 7730, "token_acc": 0.9178743961352657, "train_speed(iter/s)": 0.314842 }, { "epoch": 2.9477896341463414, "grad_norm": 1.3785741329193115, "learning_rate": 3.611714158693914e-05, "loss": 0.21016550064086914, "memory(GiB)": 153.57, "step": 7735, "token_acc": 0.9138643067846608, "train_speed(iter/s)": 0.314853 }, { "epoch": 2.9496951219512195, "grad_norm": 1.1463574171066284, "learning_rate": 3.605964267218154e-05, "loss": 0.16838927268981935, "memory(GiB)": 153.57, "step": 7740, "token_acc": 0.9209175973437972, "train_speed(iter/s)": 0.31482 }, { "epoch": 2.9516006097560976, "grad_norm": 1.2142614126205444, "learning_rate": 3.6002163739745245e-05, "loss": 0.14431146383285523, "memory(GiB)": 153.57, "step": 7745, "token_acc": 0.9308717379233759, "train_speed(iter/s)": 0.314835 }, { "epoch": 2.9535060975609757, "grad_norm": 1.2841778993606567, "learning_rate": 3.594470487202145e-05, "loss": 0.13392993211746215, "memory(GiB)": 153.57, "step": 7750, "token_acc": 0.9444305381727159, "train_speed(iter/s)": 0.314854 }, { "epoch": 2.955411585365854, "grad_norm": 1.2664353847503662, "learning_rate": 3.5887266151372565e-05, "loss": 0.13588943481445312, "memory(GiB)": 153.57, "step": 7755, "token_acc": 0.936529933481153, "train_speed(iter/s)": 0.314875 }, { "epoch": 2.9573170731707314, "grad_norm": 1.0632102489471436, "learning_rate": 3.582984766013215e-05, "loss": 0.13404110670089722, "memory(GiB)": 153.57, "step": 7760, "token_acc": 0.9377806286080821, "train_speed(iter/s)": 0.314918 }, { "epoch": 2.9592225609756095, "grad_norm": 1.0012454986572266, "learning_rate": 3.5772449480604716e-05, "loss": 0.1562551736831665, "memory(GiB)": 153.57, "step": 7765, "token_acc": 0.9377793670659753, "train_speed(iter/s)": 0.314908 }, { "epoch": 2.9611280487804876, "grad_norm": 1.319711685180664, "learning_rate": 3.571507169506571e-05, "loss": 0.21732621192932128, "memory(GiB)": 153.57, "step": 7770, "token_acc": 0.905788876276958, "train_speed(iter/s)": 0.314821 }, { "epoch": 2.9630335365853657, "grad_norm": 1.0834152698516846, "learning_rate": 3.565771438576133e-05, "loss": 0.1238470196723938, "memory(GiB)": 153.57, "step": 7775, "token_acc": 0.9436855978912053, "train_speed(iter/s)": 0.314829 }, { "epoch": 2.964939024390244, "grad_norm": 1.3834148645401, "learning_rate": 3.5600377634908415e-05, "loss": 0.20724754333496093, "memory(GiB)": 153.57, "step": 7780, "token_acc": 0.9091620986687549, "train_speed(iter/s)": 0.314868 }, { "epoch": 2.966844512195122, "grad_norm": 1.4072850942611694, "learning_rate": 3.5543061524694344e-05, "loss": 0.17278780937194824, "memory(GiB)": 153.57, "step": 7785, "token_acc": 0.9325412035262552, "train_speed(iter/s)": 0.314878 }, { "epoch": 2.96875, "grad_norm": 2.13055419921875, "learning_rate": 3.5485766137276894e-05, "loss": 0.18264223337173463, "memory(GiB)": 153.57, "step": 7790, "token_acc": 0.9173240769958978, "train_speed(iter/s)": 0.314859 }, { "epoch": 2.970655487804878, "grad_norm": 1.0184941291809082, "learning_rate": 3.542849155478415e-05, "loss": 0.2192664384841919, "memory(GiB)": 153.57, "step": 7795, "token_acc": 0.9071494893221913, "train_speed(iter/s)": 0.314883 }, { "epoch": 2.972560975609756, "grad_norm": 1.538872480392456, "learning_rate": 3.537123785931439e-05, "loss": 0.15506448745727539, "memory(GiB)": 153.57, "step": 7800, "token_acc": 0.9340774328566446, "train_speed(iter/s)": 0.314856 }, { "epoch": 2.9744664634146343, "grad_norm": 1.990516185760498, "learning_rate": 3.531400513293592e-05, "loss": 0.17681190967559815, "memory(GiB)": 153.57, "step": 7805, "token_acc": 0.9266559706623883, "train_speed(iter/s)": 0.314878 }, { "epoch": 2.9763719512195124, "grad_norm": 1.5209563970565796, "learning_rate": 3.525679345768703e-05, "loss": 0.13485100269317626, "memory(GiB)": 153.57, "step": 7810, "token_acc": 0.9418490045338064, "train_speed(iter/s)": 0.314891 }, { "epoch": 2.9782774390243905, "grad_norm": 1.972530484199524, "learning_rate": 3.519960291557577e-05, "loss": 0.17600021362304688, "memory(GiB)": 153.57, "step": 7815, "token_acc": 0.9158743877845001, "train_speed(iter/s)": 0.314914 }, { "epoch": 2.9801829268292686, "grad_norm": 0.7482591271400452, "learning_rate": 3.514243358857997e-05, "loss": 0.13174020051956176, "memory(GiB)": 153.57, "step": 7820, "token_acc": 0.9382369455362156, "train_speed(iter/s)": 0.314883 }, { "epoch": 2.982088414634146, "grad_norm": 1.3603788614273071, "learning_rate": 3.508528555864701e-05, "loss": 0.2177196264266968, "memory(GiB)": 153.57, "step": 7825, "token_acc": 0.9198329023274319, "train_speed(iter/s)": 0.314857 }, { "epoch": 2.9839939024390243, "grad_norm": 1.3848273754119873, "learning_rate": 3.502815890769374e-05, "loss": 0.12065949440002441, "memory(GiB)": 153.57, "step": 7830, "token_acc": 0.9487415592387968, "train_speed(iter/s)": 0.314881 }, { "epoch": 2.9858993902439024, "grad_norm": 0.8771776556968689, "learning_rate": 3.497105371760639e-05, "loss": 0.1315540075302124, "memory(GiB)": 153.57, "step": 7835, "token_acc": 0.9503233392122281, "train_speed(iter/s)": 0.314871 }, { "epoch": 2.9878048780487805, "grad_norm": 1.7480366230010986, "learning_rate": 3.4913970070240386e-05, "loss": 0.20479800701141357, "memory(GiB)": 153.57, "step": 7840, "token_acc": 0.9190283400809717, "train_speed(iter/s)": 0.31488 }, { "epoch": 2.9897103658536586, "grad_norm": 1.0063116550445557, "learning_rate": 3.485690804742034e-05, "loss": 0.1590970754623413, "memory(GiB)": 153.57, "step": 7845, "token_acc": 0.9293126785982855, "train_speed(iter/s)": 0.314888 }, { "epoch": 2.9916158536585367, "grad_norm": 1.0598825216293335, "learning_rate": 3.479986773093979e-05, "loss": 0.12708905935287476, "memory(GiB)": 153.57, "step": 7850, "token_acc": 0.9428017450315075, "train_speed(iter/s)": 0.314892 }, { "epoch": 2.9935213414634148, "grad_norm": 1.4012819528579712, "learning_rate": 3.4742849202561224e-05, "loss": 0.14527931213378906, "memory(GiB)": 153.57, "step": 7855, "token_acc": 0.9386446886446886, "train_speed(iter/s)": 0.314934 }, { "epoch": 2.995426829268293, "grad_norm": 1.0924241542816162, "learning_rate": 3.468585254401586e-05, "loss": 0.13999804258346557, "memory(GiB)": 153.57, "step": 7860, "token_acc": 0.9393560487652391, "train_speed(iter/s)": 0.314956 }, { "epoch": 2.9973323170731705, "grad_norm": 1.5290284156799316, "learning_rate": 3.462887783700358e-05, "loss": 0.13179442882537842, "memory(GiB)": 153.57, "step": 7865, "token_acc": 0.9380073800738007, "train_speed(iter/s)": 0.314947 }, { "epoch": 2.9992378048780486, "grad_norm": 1.4904060363769531, "learning_rate": 3.457192516319281e-05, "loss": 0.18944761753082276, "memory(GiB)": 153.57, "step": 7870, "token_acc": 0.932283208834037, "train_speed(iter/s)": 0.314942 }, { "epoch": 3.0011432926829267, "grad_norm": 1.3481941223144531, "learning_rate": 3.4514994604220376e-05, "loss": 0.15427268743515016, "memory(GiB)": 153.57, "step": 7875, "token_acc": 0.9389269406392694, "train_speed(iter/s)": 0.314988 }, { "epoch": 3.0030487804878048, "grad_norm": 1.2083715200424194, "learning_rate": 3.4458086241691415e-05, "loss": 0.14157024621963502, "memory(GiB)": 153.57, "step": 7880, "token_acc": 0.9460764587525151, "train_speed(iter/s)": 0.314991 }, { "epoch": 3.004954268292683, "grad_norm": 0.9505358338356018, "learning_rate": 3.440120015717921e-05, "loss": 0.10463219881057739, "memory(GiB)": 153.57, "step": 7885, "token_acc": 0.9552768575485092, "train_speed(iter/s)": 0.315006 }, { "epoch": 3.006859756097561, "grad_norm": 0.9766368269920349, "learning_rate": 3.4344336432225207e-05, "loss": 0.1600177049636841, "memory(GiB)": 153.57, "step": 7890, "token_acc": 0.9505783385909569, "train_speed(iter/s)": 0.315033 }, { "epoch": 3.008765243902439, "grad_norm": 1.4488446712493896, "learning_rate": 3.428749514833869e-05, "loss": 0.18250198364257814, "memory(GiB)": 153.57, "step": 7895, "token_acc": 0.9279427942794279, "train_speed(iter/s)": 0.315059 }, { "epoch": 3.010670731707317, "grad_norm": 1.1097030639648438, "learning_rate": 3.423067638699684e-05, "loss": 0.1031195878982544, "memory(GiB)": 153.57, "step": 7900, "token_acc": 0.9567730004623208, "train_speed(iter/s)": 0.315034 }, { "epoch": 3.0125762195121952, "grad_norm": 1.654984712600708, "learning_rate": 3.417388022964455e-05, "loss": 0.14905967712402343, "memory(GiB)": 153.57, "step": 7905, "token_acc": 0.9282027217268888, "train_speed(iter/s)": 0.315051 }, { "epoch": 3.0144817073170733, "grad_norm": 1.182080864906311, "learning_rate": 3.4117106757694284e-05, "loss": 0.16429036855697632, "memory(GiB)": 153.57, "step": 7910, "token_acc": 0.9291068082448469, "train_speed(iter/s)": 0.31505 }, { "epoch": 3.0163871951219514, "grad_norm": 1.1829001903533936, "learning_rate": 3.406035605252601e-05, "loss": 0.14197063446044922, "memory(GiB)": 153.57, "step": 7915, "token_acc": 0.9408440090429541, "train_speed(iter/s)": 0.315069 }, { "epoch": 3.018292682926829, "grad_norm": 2.760037422180176, "learning_rate": 3.4003628195487057e-05, "loss": 0.15021626949310302, "memory(GiB)": 153.57, "step": 7920, "token_acc": 0.9396967608545831, "train_speed(iter/s)": 0.315034 }, { "epoch": 3.020198170731707, "grad_norm": 1.2511239051818848, "learning_rate": 3.3946923267892006e-05, "loss": 0.14819034337997436, "memory(GiB)": 153.57, "step": 7925, "token_acc": 0.9363685248252375, "train_speed(iter/s)": 0.315034 }, { "epoch": 3.0221036585365852, "grad_norm": 0.8276436924934387, "learning_rate": 3.3890241351022544e-05, "loss": 0.13615368604660033, "memory(GiB)": 153.57, "step": 7930, "token_acc": 0.9351559761654399, "train_speed(iter/s)": 0.315061 }, { "epoch": 3.0240091463414633, "grad_norm": 1.975001573562622, "learning_rate": 3.383358252612742e-05, "loss": 0.16136631965637208, "memory(GiB)": 153.57, "step": 7935, "token_acc": 0.9253373966042664, "train_speed(iter/s)": 0.315091 }, { "epoch": 3.0259146341463414, "grad_norm": 1.4318139553070068, "learning_rate": 3.3776946874422263e-05, "loss": 0.12712949514389038, "memory(GiB)": 153.57, "step": 7940, "token_acc": 0.9483317886932344, "train_speed(iter/s)": 0.315114 }, { "epoch": 3.0278201219512195, "grad_norm": 1.2976945638656616, "learning_rate": 3.3720334477089474e-05, "loss": 0.15361992120742798, "memory(GiB)": 153.57, "step": 7945, "token_acc": 0.9336645236703682, "train_speed(iter/s)": 0.31514 }, { "epoch": 3.0297256097560976, "grad_norm": 1.1214057207107544, "learning_rate": 3.3663745415278134e-05, "loss": 0.18843014240264894, "memory(GiB)": 153.57, "step": 7950, "token_acc": 0.9180487804878049, "train_speed(iter/s)": 0.315129 }, { "epoch": 3.0316310975609757, "grad_norm": 1.6580244302749634, "learning_rate": 3.360717977010387e-05, "loss": 0.1180912971496582, "memory(GiB)": 153.57, "step": 7955, "token_acc": 0.9499147485080989, "train_speed(iter/s)": 0.315094 }, { "epoch": 3.033536585365854, "grad_norm": 1.9114004373550415, "learning_rate": 3.355063762264873e-05, "loss": 0.15991337299346925, "memory(GiB)": 153.57, "step": 7960, "token_acc": 0.934243803743045, "train_speed(iter/s)": 0.315128 }, { "epoch": 3.035442073170732, "grad_norm": 1.1684789657592773, "learning_rate": 3.3494119053961095e-05, "loss": 0.15416553020477294, "memory(GiB)": 153.57, "step": 7965, "token_acc": 0.938953488372093, "train_speed(iter/s)": 0.315142 }, { "epoch": 3.0373475609756095, "grad_norm": 1.5750123262405396, "learning_rate": 3.3437624145055557e-05, "loss": 0.12900123596191407, "memory(GiB)": 153.57, "step": 7970, "token_acc": 0.9482235701906413, "train_speed(iter/s)": 0.31515 }, { "epoch": 3.0392530487804876, "grad_norm": 0.9839969277381897, "learning_rate": 3.338115297691276e-05, "loss": 0.14492709636688234, "memory(GiB)": 153.57, "step": 7975, "token_acc": 0.9493830269453538, "train_speed(iter/s)": 0.31517 }, { "epoch": 3.0411585365853657, "grad_norm": 1.1869875192642212, "learning_rate": 3.3324705630479355e-05, "loss": 0.13827348947525026, "memory(GiB)": 153.57, "step": 7980, "token_acc": 0.9402926829268292, "train_speed(iter/s)": 0.315188 }, { "epoch": 3.043064024390244, "grad_norm": 0.47522690892219543, "learning_rate": 3.326828218666785e-05, "loss": 0.10583827495574952, "memory(GiB)": 153.57, "step": 7985, "token_acc": 0.9509594882729211, "train_speed(iter/s)": 0.315236 }, { "epoch": 3.044969512195122, "grad_norm": 1.5176221132278442, "learning_rate": 3.3211882726356445e-05, "loss": 0.1490524172782898, "memory(GiB)": 153.57, "step": 7990, "token_acc": 0.945695992411667, "train_speed(iter/s)": 0.315246 }, { "epoch": 3.046875, "grad_norm": 2.161252737045288, "learning_rate": 3.3155507330389e-05, "loss": 0.14789869785308837, "memory(GiB)": 153.57, "step": 7995, "token_acc": 0.9422786991169503, "train_speed(iter/s)": 0.315248 }, { "epoch": 3.048780487804878, "grad_norm": 1.258626937866211, "learning_rate": 3.309915607957487e-05, "loss": 0.13133716583251953, "memory(GiB)": 153.57, "step": 8000, "token_acc": 0.9448646011222249, "train_speed(iter/s)": 0.315215 }, { "epoch": 3.048780487804878, "eval_loss": 0.16437658667564392, "eval_runtime": 32.7771, "eval_samples_per_second": 3.234, "eval_steps_per_second": 3.234, "eval_token_acc": 0.9194870505406085, "step": 8000 }, { "epoch": 3.050685975609756, "grad_norm": 2.1694552898406982, "learning_rate": 3.30428290546888e-05, "loss": 0.17632220983505248, "memory(GiB)": 153.57, "step": 8005, "token_acc": 0.9215686274509803, "train_speed(iter/s)": 0.314811 }, { "epoch": 3.0525914634146343, "grad_norm": 1.8267829418182373, "learning_rate": 3.298652633647079e-05, "loss": 0.16488754749298096, "memory(GiB)": 153.57, "step": 8010, "token_acc": 0.9415011037527594, "train_speed(iter/s)": 0.314802 }, { "epoch": 3.0544969512195124, "grad_norm": 1.735381007194519, "learning_rate": 3.2930248005626044e-05, "loss": 0.17589170932769777, "memory(GiB)": 153.57, "step": 8015, "token_acc": 0.9346153846153846, "train_speed(iter/s)": 0.314769 }, { "epoch": 3.0564024390243905, "grad_norm": 1.2624131441116333, "learning_rate": 3.287399414282474e-05, "loss": 0.1490364670753479, "memory(GiB)": 153.57, "step": 8020, "token_acc": 0.9363817097415507, "train_speed(iter/s)": 0.314792 }, { "epoch": 3.058307926829268, "grad_norm": 1.6159719228744507, "learning_rate": 3.281776482870208e-05, "loss": 0.14982931613922118, "memory(GiB)": 153.57, "step": 8025, "token_acc": 0.935611510791367, "train_speed(iter/s)": 0.31475 }, { "epoch": 3.060213414634146, "grad_norm": 1.1534347534179688, "learning_rate": 3.2761560143857994e-05, "loss": 0.16027828454971313, "memory(GiB)": 153.57, "step": 8030, "token_acc": 0.9341432225063938, "train_speed(iter/s)": 0.314713 }, { "epoch": 3.0621189024390243, "grad_norm": 1.4878915548324585, "learning_rate": 3.270538016885715e-05, "loss": 0.144379186630249, "memory(GiB)": 153.57, "step": 8035, "token_acc": 0.9459157030958597, "train_speed(iter/s)": 0.314717 }, { "epoch": 3.0640243902439024, "grad_norm": 1.2649809122085571, "learning_rate": 3.2649224984228756e-05, "loss": 0.19886473417282105, "memory(GiB)": 153.57, "step": 8040, "token_acc": 0.919848018816718, "train_speed(iter/s)": 0.314722 }, { "epoch": 3.0659298780487805, "grad_norm": 0.8512699604034424, "learning_rate": 3.259309467046654e-05, "loss": 0.11163039207458496, "memory(GiB)": 153.57, "step": 8045, "token_acc": 0.9521960689153118, "train_speed(iter/s)": 0.314734 }, { "epoch": 3.0678353658536586, "grad_norm": 0.944353461265564, "learning_rate": 3.253698930802853e-05, "loss": 0.14583446979522705, "memory(GiB)": 153.57, "step": 8050, "token_acc": 0.937844217151849, "train_speed(iter/s)": 0.314658 }, { "epoch": 3.0697408536585367, "grad_norm": 0.7137839198112488, "learning_rate": 3.248090897733703e-05, "loss": 0.1522064208984375, "memory(GiB)": 153.57, "step": 8055, "token_acc": 0.9534261241970021, "train_speed(iter/s)": 0.314619 }, { "epoch": 3.0716463414634148, "grad_norm": 1.8662729263305664, "learning_rate": 3.242485375877841e-05, "loss": 0.1096563458442688, "memory(GiB)": 153.57, "step": 8060, "token_acc": 0.9564891222805701, "train_speed(iter/s)": 0.314653 }, { "epoch": 3.073551829268293, "grad_norm": 1.8564366102218628, "learning_rate": 3.236882373270313e-05, "loss": 0.1251922845840454, "memory(GiB)": 153.57, "step": 8065, "token_acc": 0.9349046015712682, "train_speed(iter/s)": 0.314647 }, { "epoch": 3.075457317073171, "grad_norm": 1.6207356452941895, "learning_rate": 3.231281897942544e-05, "loss": 0.16120537519454955, "memory(GiB)": 153.57, "step": 8070, "token_acc": 0.9410569105691057, "train_speed(iter/s)": 0.314638 }, { "epoch": 3.0773628048780486, "grad_norm": 1.666310429573059, "learning_rate": 3.225683957922344e-05, "loss": 0.15794849395751953, "memory(GiB)": 153.57, "step": 8075, "token_acc": 0.9376310272536688, "train_speed(iter/s)": 0.314656 }, { "epoch": 3.0792682926829267, "grad_norm": 2.3046607971191406, "learning_rate": 3.2200885612338845e-05, "loss": 0.12806957960128784, "memory(GiB)": 153.57, "step": 8080, "token_acc": 0.9537237888647867, "train_speed(iter/s)": 0.314627 }, { "epoch": 3.0811737804878048, "grad_norm": 1.0943052768707275, "learning_rate": 3.214495715897692e-05, "loss": 0.11838483810424805, "memory(GiB)": 153.57, "step": 8085, "token_acc": 0.9544732662784542, "train_speed(iter/s)": 0.314566 }, { "epoch": 3.083079268292683, "grad_norm": 1.6671109199523926, "learning_rate": 3.2089054299306375e-05, "loss": 0.16599538326263427, "memory(GiB)": 153.57, "step": 8090, "token_acc": 0.9296593741345888, "train_speed(iter/s)": 0.314587 }, { "epoch": 3.084984756097561, "grad_norm": 0.9915124773979187, "learning_rate": 3.2033177113459224e-05, "loss": 0.16327725648880004, "memory(GiB)": 153.57, "step": 8095, "token_acc": 0.9417509591907918, "train_speed(iter/s)": 0.314619 }, { "epoch": 3.086890243902439, "grad_norm": 1.1944289207458496, "learning_rate": 3.197732568153064e-05, "loss": 0.1545802354812622, "memory(GiB)": 153.57, "step": 8100, "token_acc": 0.9375446960667462, "train_speed(iter/s)": 0.31458 }, { "epoch": 3.088795731707317, "grad_norm": 1.886488437652588, "learning_rate": 3.192150008357899e-05, "loss": 0.13373126983642578, "memory(GiB)": 153.57, "step": 8105, "token_acc": 0.9587843463780183, "train_speed(iter/s)": 0.314612 }, { "epoch": 3.0907012195121952, "grad_norm": 1.1524460315704346, "learning_rate": 3.186570039962551e-05, "loss": 0.11454150676727295, "memory(GiB)": 153.57, "step": 8110, "token_acc": 0.9372044140830268, "train_speed(iter/s)": 0.314631 }, { "epoch": 3.0926067073170733, "grad_norm": 1.1783227920532227, "learning_rate": 3.1809926709654306e-05, "loss": 0.11858440637588501, "memory(GiB)": 153.57, "step": 8115, "token_acc": 0.9422207876049064, "train_speed(iter/s)": 0.314663 }, { "epoch": 3.0945121951219514, "grad_norm": 1.629910945892334, "learning_rate": 3.175417909361225e-05, "loss": 0.15671241283416748, "memory(GiB)": 153.57, "step": 8120, "token_acc": 0.9403374499913029, "train_speed(iter/s)": 0.314667 }, { "epoch": 3.096417682926829, "grad_norm": 1.4178234338760376, "learning_rate": 3.1698457631408815e-05, "loss": 0.10133224725723267, "memory(GiB)": 153.57, "step": 8125, "token_acc": 0.9593805534399594, "train_speed(iter/s)": 0.314683 }, { "epoch": 3.098323170731707, "grad_norm": 1.1428158283233643, "learning_rate": 3.1642762402916006e-05, "loss": 0.1486278295516968, "memory(GiB)": 153.57, "step": 8130, "token_acc": 0.9400299850074962, "train_speed(iter/s)": 0.314679 }, { "epoch": 3.1002286585365852, "grad_norm": 1.7167911529541016, "learning_rate": 3.158709348796822e-05, "loss": 0.14194782972335815, "memory(GiB)": 153.57, "step": 8135, "token_acc": 0.9284287616511319, "train_speed(iter/s)": 0.314708 }, { "epoch": 3.1021341463414633, "grad_norm": 0.7500281929969788, "learning_rate": 3.1531450966362106e-05, "loss": 0.12886565923690796, "memory(GiB)": 153.57, "step": 8140, "token_acc": 0.9498478040295695, "train_speed(iter/s)": 0.314717 }, { "epoch": 3.1040396341463414, "grad_norm": 1.1331907510757446, "learning_rate": 3.147583491785654e-05, "loss": 0.11563223600387573, "memory(GiB)": 153.57, "step": 8145, "token_acc": 0.9496551724137932, "train_speed(iter/s)": 0.314679 }, { "epoch": 3.1059451219512195, "grad_norm": 1.4421114921569824, "learning_rate": 3.14202454221724e-05, "loss": 0.17192047834396362, "memory(GiB)": 153.57, "step": 8150, "token_acc": 0.9240271772699197, "train_speed(iter/s)": 0.314684 }, { "epoch": 3.1078506097560976, "grad_norm": 1.463215708732605, "learning_rate": 3.136468255899253e-05, "loss": 0.13079700469970704, "memory(GiB)": 153.57, "step": 8155, "token_acc": 0.9456054542759745, "train_speed(iter/s)": 0.314673 }, { "epoch": 3.1097560975609757, "grad_norm": 1.4742263555526733, "learning_rate": 3.130914640796157e-05, "loss": 0.14725289344787598, "memory(GiB)": 153.57, "step": 8160, "token_acc": 0.9462978030919447, "train_speed(iter/s)": 0.314696 }, { "epoch": 3.111661585365854, "grad_norm": 1.402984619140625, "learning_rate": 3.125363704868589e-05, "loss": 0.1588578462600708, "memory(GiB)": 153.57, "step": 8165, "token_acc": 0.9402825913089843, "train_speed(iter/s)": 0.314707 }, { "epoch": 3.113567073170732, "grad_norm": 1.4653269052505493, "learning_rate": 3.119815456073346e-05, "loss": 0.11840912103652954, "memory(GiB)": 153.57, "step": 8170, "token_acc": 0.9415169052695705, "train_speed(iter/s)": 0.314729 }, { "epoch": 3.1154725609756095, "grad_norm": 1.3060208559036255, "learning_rate": 3.114269902363374e-05, "loss": 0.1591560125350952, "memory(GiB)": 153.57, "step": 8175, "token_acc": 0.9267657023716445, "train_speed(iter/s)": 0.314754 }, { "epoch": 3.1173780487804876, "grad_norm": 1.629056692123413, "learning_rate": 3.108727051687749e-05, "loss": 0.15592230558395387, "memory(GiB)": 153.57, "step": 8180, "token_acc": 0.9354769560557342, "train_speed(iter/s)": 0.314779 }, { "epoch": 3.1192835365853657, "grad_norm": 1.4453233480453491, "learning_rate": 3.103186911991685e-05, "loss": 0.1274026155471802, "memory(GiB)": 153.57, "step": 8185, "token_acc": 0.9532438478747204, "train_speed(iter/s)": 0.314781 }, { "epoch": 3.121189024390244, "grad_norm": 1.5895839929580688, "learning_rate": 3.0976494912165e-05, "loss": 0.13347626924514772, "memory(GiB)": 153.57, "step": 8190, "token_acc": 0.949584487534626, "train_speed(iter/s)": 0.314806 }, { "epoch": 3.123094512195122, "grad_norm": 1.027300238609314, "learning_rate": 3.0921147972996165e-05, "loss": 0.15004931688308715, "memory(GiB)": 153.57, "step": 8195, "token_acc": 0.9350732017823042, "train_speed(iter/s)": 0.314799 }, { "epoch": 3.125, "grad_norm": 1.2215323448181152, "learning_rate": 3.086582838174551e-05, "loss": 0.10912497043609619, "memory(GiB)": 153.57, "step": 8200, "token_acc": 0.9564204148217199, "train_speed(iter/s)": 0.314772 }, { "epoch": 3.126905487804878, "grad_norm": 1.0116827487945557, "learning_rate": 3.0810536217708986e-05, "loss": 0.09798255562782288, "memory(GiB)": 153.57, "step": 8205, "token_acc": 0.9630660727542687, "train_speed(iter/s)": 0.314772 }, { "epoch": 3.128810975609756, "grad_norm": 1.5931872129440308, "learning_rate": 3.075527156014321e-05, "loss": 0.12041897773742676, "memory(GiB)": 153.57, "step": 8210, "token_acc": 0.9514853601196837, "train_speed(iter/s)": 0.314782 }, { "epoch": 3.1307164634146343, "grad_norm": 2.8452210426330566, "learning_rate": 3.070003448826538e-05, "loss": 0.12846100330352783, "memory(GiB)": 153.57, "step": 8215, "token_acc": 0.9472770666088088, "train_speed(iter/s)": 0.314793 }, { "epoch": 3.1326219512195124, "grad_norm": 1.704617977142334, "learning_rate": 3.0644825081253184e-05, "loss": 0.17345850467681884, "memory(GiB)": 153.57, "step": 8220, "token_acc": 0.9270045385779122, "train_speed(iter/s)": 0.314824 }, { "epoch": 3.1345274390243905, "grad_norm": 1.8838750123977661, "learning_rate": 3.05896434182446e-05, "loss": 0.13260502815246583, "memory(GiB)": 153.57, "step": 8225, "token_acc": 0.9492864424057085, "train_speed(iter/s)": 0.314843 }, { "epoch": 3.136432926829268, "grad_norm": 2.10785174369812, "learning_rate": 3.05344895783379e-05, "loss": 0.10958900451660156, "memory(GiB)": 153.57, "step": 8230, "token_acc": 0.9463850528025995, "train_speed(iter/s)": 0.314836 }, { "epoch": 3.138338414634146, "grad_norm": 1.6980335712432861, "learning_rate": 3.0479363640591435e-05, "loss": 0.16044130325317382, "memory(GiB)": 153.57, "step": 8235, "token_acc": 0.926377022359571, "train_speed(iter/s)": 0.314851 }, { "epoch": 3.1402439024390243, "grad_norm": 2.3035855293273926, "learning_rate": 3.0424265684023558e-05, "loss": 0.12374262809753418, "memory(GiB)": 153.57, "step": 8240, "token_acc": 0.949264917843759, "train_speed(iter/s)": 0.314874 }, { "epoch": 3.1421493902439024, "grad_norm": 1.37135648727417, "learning_rate": 3.036919578761252e-05, "loss": 0.08393791913986207, "memory(GiB)": 153.57, "step": 8245, "token_acc": 0.9545222278998468, "train_speed(iter/s)": 0.314915 }, { "epoch": 3.1440548780487805, "grad_norm": 2.28181529045105, "learning_rate": 3.031415403029636e-05, "loss": 0.1259385585784912, "memory(GiB)": 153.57, "step": 8250, "token_acc": 0.9512365029606409, "train_speed(iter/s)": 0.314936 }, { "epoch": 3.1459603658536586, "grad_norm": 1.3995842933654785, "learning_rate": 3.025914049097277e-05, "loss": 0.13892409801483155, "memory(GiB)": 153.57, "step": 8255, "token_acc": 0.9396597554492291, "train_speed(iter/s)": 0.314903 }, { "epoch": 3.1478658536585367, "grad_norm": 1.2366081476211548, "learning_rate": 3.0204155248498994e-05, "loss": 0.12374976873397828, "memory(GiB)": 153.57, "step": 8260, "token_acc": 0.9476275738585497, "train_speed(iter/s)": 0.31491 }, { "epoch": 3.1497713414634148, "grad_norm": 0.9793564677238464, "learning_rate": 3.014919838169171e-05, "loss": 0.14572383165359498, "memory(GiB)": 153.57, "step": 8265, "token_acc": 0.9396349177661305, "train_speed(iter/s)": 0.314863 }, { "epoch": 3.151676829268293, "grad_norm": 1.0064960718154907, "learning_rate": 3.009426996932696e-05, "loss": 0.0983627200126648, "memory(GiB)": 153.57, "step": 8270, "token_acc": 0.9591590556608651, "train_speed(iter/s)": 0.314874 }, { "epoch": 3.153582317073171, "grad_norm": 1.1551921367645264, "learning_rate": 3.0039370090139956e-05, "loss": 0.09956080317497254, "memory(GiB)": 153.57, "step": 8275, "token_acc": 0.9534227240649259, "train_speed(iter/s)": 0.314843 }, { "epoch": 3.1554878048780486, "grad_norm": 2.108736515045166, "learning_rate": 2.9984498822825025e-05, "loss": 0.1430426836013794, "memory(GiB)": 153.57, "step": 8280, "token_acc": 0.9439085387638443, "train_speed(iter/s)": 0.314874 }, { "epoch": 3.1573932926829267, "grad_norm": 1.5179682970046997, "learning_rate": 2.9929656246035477e-05, "loss": 0.11926083564758301, "memory(GiB)": 153.57, "step": 8285, "token_acc": 0.9488912925905895, "train_speed(iter/s)": 0.314886 }, { "epoch": 3.1592987804878048, "grad_norm": 1.2660516500473022, "learning_rate": 2.9874842438383503e-05, "loss": 0.1342916488647461, "memory(GiB)": 153.57, "step": 8290, "token_acc": 0.9321888412017167, "train_speed(iter/s)": 0.314924 }, { "epoch": 3.161204268292683, "grad_norm": 1.6175800561904907, "learning_rate": 2.9820057478440057e-05, "loss": 0.16856299638748168, "memory(GiB)": 153.57, "step": 8295, "token_acc": 0.9369209809264305, "train_speed(iter/s)": 0.314926 }, { "epoch": 3.163109756097561, "grad_norm": 0.8274098038673401, "learning_rate": 2.9765301444734727e-05, "loss": 0.11068623065948487, "memory(GiB)": 153.57, "step": 8300, "token_acc": 0.9567123287671233, "train_speed(iter/s)": 0.314954 }, { "epoch": 3.165015243902439, "grad_norm": 1.250878930091858, "learning_rate": 2.971057441575563e-05, "loss": 0.15824922323226928, "memory(GiB)": 153.57, "step": 8305, "token_acc": 0.9291590493601463, "train_speed(iter/s)": 0.314948 }, { "epoch": 3.166920731707317, "grad_norm": 1.4493560791015625, "learning_rate": 2.965587646994938e-05, "loss": 0.1450185775756836, "memory(GiB)": 153.57, "step": 8310, "token_acc": 0.9345195729537367, "train_speed(iter/s)": 0.31497 }, { "epoch": 3.1688262195121952, "grad_norm": 1.4039074182510376, "learning_rate": 2.960120768572081e-05, "loss": 0.11922601461410523, "memory(GiB)": 153.57, "step": 8315, "token_acc": 0.9413716814159292, "train_speed(iter/s)": 0.314982 }, { "epoch": 3.1707317073170733, "grad_norm": 0.8588539958000183, "learning_rate": 2.9546568141433006e-05, "loss": 0.10843498706817627, "memory(GiB)": 153.57, "step": 8320, "token_acc": 0.9509008389492505, "train_speed(iter/s)": 0.314969 }, { "epoch": 3.1726371951219514, "grad_norm": 1.8766322135925293, "learning_rate": 2.949195791540712e-05, "loss": 0.16247453689575195, "memory(GiB)": 153.57, "step": 8325, "token_acc": 0.9373964889036105, "train_speed(iter/s)": 0.314981 }, { "epoch": 3.174542682926829, "grad_norm": 1.6049293279647827, "learning_rate": 2.9437377085922284e-05, "loss": 0.17208940982818605, "memory(GiB)": 153.57, "step": 8330, "token_acc": 0.9338638373121132, "train_speed(iter/s)": 0.314945 }, { "epoch": 3.176448170731707, "grad_norm": 1.2329996824264526, "learning_rate": 2.93828257312155e-05, "loss": 0.12635233402252197, "memory(GiB)": 153.57, "step": 8335, "token_acc": 0.9489160764112471, "train_speed(iter/s)": 0.314963 }, { "epoch": 3.1783536585365852, "grad_norm": 1.418857216835022, "learning_rate": 2.9328303929481505e-05, "loss": 0.1505746364593506, "memory(GiB)": 153.57, "step": 8340, "token_acc": 0.9301092043681747, "train_speed(iter/s)": 0.314988 }, { "epoch": 3.1802591463414633, "grad_norm": 1.2118350267410278, "learning_rate": 2.927381175887266e-05, "loss": 0.12947956323623658, "memory(GiB)": 153.57, "step": 8345, "token_acc": 0.9419725621060437, "train_speed(iter/s)": 0.314988 }, { "epoch": 3.1821646341463414, "grad_norm": 1.5647876262664795, "learning_rate": 2.9219349297498934e-05, "loss": 0.12498061656951905, "memory(GiB)": 153.57, "step": 8350, "token_acc": 0.9495104039167687, "train_speed(iter/s)": 0.314969 }, { "epoch": 3.1840701219512195, "grad_norm": 1.2270437479019165, "learning_rate": 2.9164916623427623e-05, "loss": 0.13141329288482667, "memory(GiB)": 153.57, "step": 8355, "token_acc": 0.952263779527559, "train_speed(iter/s)": 0.314933 }, { "epoch": 3.1859756097560976, "grad_norm": 0.9250782132148743, "learning_rate": 2.9110513814683333e-05, "loss": 0.1288602352142334, "memory(GiB)": 153.57, "step": 8360, "token_acc": 0.9553599497013517, "train_speed(iter/s)": 0.31495 }, { "epoch": 3.1878810975609757, "grad_norm": 1.992962121963501, "learning_rate": 2.9056140949247917e-05, "loss": 0.13143444061279297, "memory(GiB)": 153.57, "step": 8365, "token_acc": 0.9479944029850746, "train_speed(iter/s)": 0.314865 }, { "epoch": 3.189786585365854, "grad_norm": 1.2608939409255981, "learning_rate": 2.900179810506024e-05, "loss": 0.1160628318786621, "memory(GiB)": 153.57, "step": 8370, "token_acc": 0.9542369170133567, "train_speed(iter/s)": 0.314869 }, { "epoch": 3.191692073170732, "grad_norm": 1.7254542112350464, "learning_rate": 2.894748536001619e-05, "loss": 0.16263707876205444, "memory(GiB)": 153.57, "step": 8375, "token_acc": 0.9313497042480732, "train_speed(iter/s)": 0.314862 }, { "epoch": 3.1935975609756095, "grad_norm": 1.475400447845459, "learning_rate": 2.8893202791968466e-05, "loss": 0.10168074369430542, "memory(GiB)": 153.57, "step": 8380, "token_acc": 0.95618885580604, "train_speed(iter/s)": 0.314874 }, { "epoch": 3.1955030487804876, "grad_norm": 1.0769731998443604, "learning_rate": 2.883895047872654e-05, "loss": 0.12550333738327027, "memory(GiB)": 153.57, "step": 8385, "token_acc": 0.9446958270487682, "train_speed(iter/s)": 0.314839 }, { "epoch": 3.1974085365853657, "grad_norm": 2.5158908367156982, "learning_rate": 2.8784728498056507e-05, "loss": 0.14844528436660767, "memory(GiB)": 153.57, "step": 8390, "token_acc": 0.9249220436555529, "train_speed(iter/s)": 0.31481 }, { "epoch": 3.199314024390244, "grad_norm": 1.2765144109725952, "learning_rate": 2.8730536927681008e-05, "loss": 0.11856961250305176, "memory(GiB)": 153.57, "step": 8395, "token_acc": 0.9434433541480821, "train_speed(iter/s)": 0.31482 }, { "epoch": 3.201219512195122, "grad_norm": 2.060453414916992, "learning_rate": 2.8676375845279013e-05, "loss": 0.09891250133514404, "memory(GiB)": 153.57, "step": 8400, "token_acc": 0.9583685545224007, "train_speed(iter/s)": 0.314819 }, { "epoch": 3.203125, "grad_norm": 1.0698132514953613, "learning_rate": 2.8622245328485907e-05, "loss": 0.13973097801208495, "memory(GiB)": 153.57, "step": 8405, "token_acc": 0.9486948694869487, "train_speed(iter/s)": 0.314848 }, { "epoch": 3.205030487804878, "grad_norm": 1.1809558868408203, "learning_rate": 2.8568145454893147e-05, "loss": 0.1335063099861145, "memory(GiB)": 153.57, "step": 8410, "token_acc": 0.9434921557234166, "train_speed(iter/s)": 0.31483 }, { "epoch": 3.206935975609756, "grad_norm": 1.7437108755111694, "learning_rate": 2.8514076302048364e-05, "loss": 0.14977171421051025, "memory(GiB)": 153.57, "step": 8415, "token_acc": 0.9354395604395604, "train_speed(iter/s)": 0.314851 }, { "epoch": 3.2088414634146343, "grad_norm": 1.8003849983215332, "learning_rate": 2.846003794745507e-05, "loss": 0.1374466300010681, "memory(GiB)": 153.57, "step": 8420, "token_acc": 0.9454789786907276, "train_speed(iter/s)": 0.31487 }, { "epoch": 3.2107469512195124, "grad_norm": 1.228039026260376, "learning_rate": 2.8406030468572698e-05, "loss": 0.16113882064819335, "memory(GiB)": 153.57, "step": 8425, "token_acc": 0.9348866900734121, "train_speed(iter/s)": 0.3149 }, { "epoch": 3.2126524390243905, "grad_norm": 1.419751524925232, "learning_rate": 2.835205394281635e-05, "loss": 0.10150434970855712, "memory(GiB)": 153.57, "step": 8430, "token_acc": 0.9546652609383237, "train_speed(iter/s)": 0.314865 }, { "epoch": 3.214557926829268, "grad_norm": 1.9332129955291748, "learning_rate": 2.829810844755687e-05, "loss": 0.08846228122711182, "memory(GiB)": 153.57, "step": 8435, "token_acc": 0.9571242962321351, "train_speed(iter/s)": 0.3149 }, { "epoch": 3.216463414634146, "grad_norm": 1.7012300491333008, "learning_rate": 2.8244194060120498e-05, "loss": 0.12764008045196534, "memory(GiB)": 153.57, "step": 8440, "token_acc": 0.9483589982890029, "train_speed(iter/s)": 0.314899 }, { "epoch": 3.2183689024390243, "grad_norm": 1.1854345798492432, "learning_rate": 2.8190310857788986e-05, "loss": 0.12581162452697753, "memory(GiB)": 153.57, "step": 8445, "token_acc": 0.9429523214657506, "train_speed(iter/s)": 0.314911 }, { "epoch": 3.2202743902439024, "grad_norm": 1.229865312576294, "learning_rate": 2.8136458917799303e-05, "loss": 0.12729430198669434, "memory(GiB)": 153.57, "step": 8450, "token_acc": 0.9475132275132275, "train_speed(iter/s)": 0.314864 }, { "epoch": 3.2221798780487805, "grad_norm": 1.691948652267456, "learning_rate": 2.8082638317343667e-05, "loss": 0.14034258127212523, "memory(GiB)": 153.57, "step": 8455, "token_acc": 0.9436392914653784, "train_speed(iter/s)": 0.314874 }, { "epoch": 3.2240853658536586, "grad_norm": 1.236995816230774, "learning_rate": 2.8028849133569322e-05, "loss": 0.07277644872665405, "memory(GiB)": 153.57, "step": 8460, "token_acc": 0.9645977011494253, "train_speed(iter/s)": 0.314855 }, { "epoch": 3.2259908536585367, "grad_norm": 2.0392749309539795, "learning_rate": 2.797509144357855e-05, "loss": 0.1522086501121521, "memory(GiB)": 153.57, "step": 8465, "token_acc": 0.9358520267611177, "train_speed(iter/s)": 0.314819 }, { "epoch": 3.2278963414634148, "grad_norm": 2.0052545070648193, "learning_rate": 2.792136532442838e-05, "loss": 0.1549181818962097, "memory(GiB)": 153.57, "step": 8470, "token_acc": 0.9349112426035503, "train_speed(iter/s)": 0.314837 }, { "epoch": 3.229801829268293, "grad_norm": 1.2279058694839478, "learning_rate": 2.7867670853130723e-05, "loss": 0.13125483989715575, "memory(GiB)": 153.57, "step": 8475, "token_acc": 0.9408438451500652, "train_speed(iter/s)": 0.314806 }, { "epoch": 3.231707317073171, "grad_norm": 1.2908215522766113, "learning_rate": 2.7814008106652012e-05, "loss": 0.10770713090896607, "memory(GiB)": 153.57, "step": 8480, "token_acc": 0.950524246395806, "train_speed(iter/s)": 0.314834 }, { "epoch": 3.2336128048780486, "grad_norm": 1.4749466180801392, "learning_rate": 2.7760377161913282e-05, "loss": 0.13706870079040528, "memory(GiB)": 153.57, "step": 8485, "token_acc": 0.9383792909397861, "train_speed(iter/s)": 0.314862 }, { "epoch": 3.2355182926829267, "grad_norm": 0.6129987835884094, "learning_rate": 2.7706778095789903e-05, "loss": 0.09979647397994995, "memory(GiB)": 153.57, "step": 8490, "token_acc": 0.9596480582524272, "train_speed(iter/s)": 0.314876 }, { "epoch": 3.2374237804878048, "grad_norm": 1.184202790260315, "learning_rate": 2.7653210985111642e-05, "loss": 0.10745254755020142, "memory(GiB)": 153.57, "step": 8495, "token_acc": 0.9592833876221498, "train_speed(iter/s)": 0.314898 }, { "epoch": 3.239329268292683, "grad_norm": 1.3827711343765259, "learning_rate": 2.759967590666238e-05, "loss": 0.14803709983825683, "memory(GiB)": 153.57, "step": 8500, "token_acc": 0.9405461458693398, "train_speed(iter/s)": 0.314915 }, { "epoch": 3.241234756097561, "grad_norm": 1.9228706359863281, "learning_rate": 2.7546172937180144e-05, "loss": 0.1230665922164917, "memory(GiB)": 153.57, "step": 8505, "token_acc": 0.9541300224176582, "train_speed(iter/s)": 0.314867 }, { "epoch": 3.243140243902439, "grad_norm": 1.113930344581604, "learning_rate": 2.7492702153356854e-05, "loss": 0.11261096000671386, "memory(GiB)": 153.57, "step": 8510, "token_acc": 0.9476309226932669, "train_speed(iter/s)": 0.314875 }, { "epoch": 3.245045731707317, "grad_norm": 0.9313448667526245, "learning_rate": 2.743926363183843e-05, "loss": 0.16903659105300903, "memory(GiB)": 153.57, "step": 8515, "token_acc": 0.9447148140938291, "train_speed(iter/s)": 0.314879 }, { "epoch": 3.2469512195121952, "grad_norm": 1.8159797191619873, "learning_rate": 2.7385857449224405e-05, "loss": 0.10811780691146851, "memory(GiB)": 153.57, "step": 8520, "token_acc": 0.9508009153318078, "train_speed(iter/s)": 0.314909 }, { "epoch": 3.2488567073170733, "grad_norm": 0.9026777148246765, "learning_rate": 2.7332483682068056e-05, "loss": 0.11574270725250244, "memory(GiB)": 153.57, "step": 8525, "token_acc": 0.939161414086051, "train_speed(iter/s)": 0.314919 }, { "epoch": 3.2507621951219514, "grad_norm": 1.141029953956604, "learning_rate": 2.7279142406876124e-05, "loss": 0.1655093550682068, "memory(GiB)": 153.57, "step": 8530, "token_acc": 0.9302954238269936, "train_speed(iter/s)": 0.314928 }, { "epoch": 3.2526676829268295, "grad_norm": 1.3183574676513672, "learning_rate": 2.7225833700108838e-05, "loss": 0.10641398429870605, "memory(GiB)": 153.57, "step": 8535, "token_acc": 0.9593039573393208, "train_speed(iter/s)": 0.314902 }, { "epoch": 3.254573170731707, "grad_norm": 1.2607579231262207, "learning_rate": 2.7172557638179675e-05, "loss": 0.11205546855926514, "memory(GiB)": 153.57, "step": 8540, "token_acc": 0.9475396502643351, "train_speed(iter/s)": 0.314906 }, { "epoch": 3.2564786585365852, "grad_norm": 1.159287929534912, "learning_rate": 2.7119314297455407e-05, "loss": 0.14348719120025635, "memory(GiB)": 153.57, "step": 8545, "token_acc": 0.9477188173095751, "train_speed(iter/s)": 0.314916 }, { "epoch": 3.2583841463414633, "grad_norm": 1.0320243835449219, "learning_rate": 2.706610375425578e-05, "loss": 0.16240758895874025, "memory(GiB)": 153.57, "step": 8550, "token_acc": 0.9353565260301011, "train_speed(iter/s)": 0.314927 }, { "epoch": 3.2602896341463414, "grad_norm": 1.2057080268859863, "learning_rate": 2.7012926084853685e-05, "loss": 0.109012770652771, "memory(GiB)": 153.57, "step": 8555, "token_acc": 0.9454369869628199, "train_speed(iter/s)": 0.31489 }, { "epoch": 3.2621951219512195, "grad_norm": 2.2132747173309326, "learning_rate": 2.6959781365474758e-05, "loss": 0.1627878427505493, "memory(GiB)": 153.57, "step": 8560, "token_acc": 0.9326589595375723, "train_speed(iter/s)": 0.314918 }, { "epoch": 3.2641006097560976, "grad_norm": 2.263517141342163, "learning_rate": 2.690666967229747e-05, "loss": 0.13463023900985718, "memory(GiB)": 153.57, "step": 8565, "token_acc": 0.9426424050632911, "train_speed(iter/s)": 0.314952 }, { "epoch": 3.2660060975609757, "grad_norm": 0.8981589674949646, "learning_rate": 2.6853591081452906e-05, "loss": 0.11674795150756836, "memory(GiB)": 153.57, "step": 8570, "token_acc": 0.9545366232756946, "train_speed(iter/s)": 0.314955 }, { "epoch": 3.267911585365854, "grad_norm": 0.4085671901702881, "learning_rate": 2.6800545669024767e-05, "loss": 0.13669270277023315, "memory(GiB)": 153.57, "step": 8575, "token_acc": 0.9445100354191264, "train_speed(iter/s)": 0.314985 }, { "epoch": 3.269817073170732, "grad_norm": 1.3795597553253174, "learning_rate": 2.6747533511049127e-05, "loss": 0.1297823667526245, "memory(GiB)": 153.57, "step": 8580, "token_acc": 0.9453562850280224, "train_speed(iter/s)": 0.314998 }, { "epoch": 3.2717225609756095, "grad_norm": 1.1336694955825806, "learning_rate": 2.669455468351445e-05, "loss": 0.13530502319335938, "memory(GiB)": 153.57, "step": 8585, "token_acc": 0.9416843595187544, "train_speed(iter/s)": 0.31498 }, { "epoch": 3.2736280487804876, "grad_norm": 1.6336407661437988, "learning_rate": 2.6641609262361344e-05, "loss": 0.15506129264831542, "memory(GiB)": 153.57, "step": 8590, "token_acc": 0.9393455706304868, "train_speed(iter/s)": 0.314961 }, { "epoch": 3.2755335365853657, "grad_norm": 1.500081181526184, "learning_rate": 2.658869732348267e-05, "loss": 0.15644536018371583, "memory(GiB)": 153.57, "step": 8595, "token_acc": 0.9411576078242745, "train_speed(iter/s)": 0.314967 }, { "epoch": 3.277439024390244, "grad_norm": 1.0946706533432007, "learning_rate": 2.6535818942723144e-05, "loss": 0.09688674211502075, "memory(GiB)": 153.57, "step": 8600, "token_acc": 0.9611853088480802, "train_speed(iter/s)": 0.314976 }, { "epoch": 3.279344512195122, "grad_norm": 1.4629669189453125, "learning_rate": 2.6482974195879485e-05, "loss": 0.125210440158844, "memory(GiB)": 153.57, "step": 8605, "token_acc": 0.9482959268495428, "train_speed(iter/s)": 0.314979 }, { "epoch": 3.28125, "grad_norm": 1.1192978620529175, "learning_rate": 2.6430163158700115e-05, "loss": 0.09342519044876099, "memory(GiB)": 153.57, "step": 8610, "token_acc": 0.9554808903821923, "train_speed(iter/s)": 0.314989 }, { "epoch": 3.283155487804878, "grad_norm": 1.525815725326538, "learning_rate": 2.637738590688524e-05, "loss": 0.1250664472579956, "memory(GiB)": 153.57, "step": 8615, "token_acc": 0.9445300462249615, "train_speed(iter/s)": 0.315008 }, { "epoch": 3.285060975609756, "grad_norm": 0.9307700991630554, "learning_rate": 2.6324642516086523e-05, "loss": 0.11316391229629516, "memory(GiB)": 153.57, "step": 8620, "token_acc": 0.9546210106382979, "train_speed(iter/s)": 0.315001 }, { "epoch": 3.2869664634146343, "grad_norm": 1.3557524681091309, "learning_rate": 2.6271933061907194e-05, "loss": 0.18797240257263184, "memory(GiB)": 153.57, "step": 8625, "token_acc": 0.9238373399236127, "train_speed(iter/s)": 0.315017 }, { "epoch": 3.2888719512195124, "grad_norm": 1.9245398044586182, "learning_rate": 2.621925761990175e-05, "loss": 0.14327441453933715, "memory(GiB)": 153.57, "step": 8630, "token_acc": 0.9368964742869975, "train_speed(iter/s)": 0.315037 }, { "epoch": 3.2907774390243905, "grad_norm": 1.2906615734100342, "learning_rate": 2.6166616265575995e-05, "loss": 0.12863123416900635, "memory(GiB)": 153.57, "step": 8635, "token_acc": 0.9473439677868979, "train_speed(iter/s)": 0.315051 }, { "epoch": 3.292682926829268, "grad_norm": 0.8463602066040039, "learning_rate": 2.6114009074386846e-05, "loss": 0.15719593763351442, "memory(GiB)": 153.57, "step": 8640, "token_acc": 0.9443151298119964, "train_speed(iter/s)": 0.315058 }, { "epoch": 3.294588414634146, "grad_norm": 1.6721746921539307, "learning_rate": 2.6061436121742283e-05, "loss": 0.14289703369140624, "memory(GiB)": 153.57, "step": 8645, "token_acc": 0.9426377597109304, "train_speed(iter/s)": 0.315099 }, { "epoch": 3.2964939024390243, "grad_norm": 1.3738651275634766, "learning_rate": 2.6008897483001137e-05, "loss": 0.1436275362968445, "memory(GiB)": 153.57, "step": 8650, "token_acc": 0.9450954134583194, "train_speed(iter/s)": 0.315125 }, { "epoch": 3.2983993902439024, "grad_norm": 0.9880985021591187, "learning_rate": 2.595639323347313e-05, "loss": 0.09712170362472534, "memory(GiB)": 153.57, "step": 8655, "token_acc": 0.9473519272955186, "train_speed(iter/s)": 0.315155 }, { "epoch": 3.3003048780487805, "grad_norm": 1.9166243076324463, "learning_rate": 2.5903923448418633e-05, "loss": 0.10069873332977294, "memory(GiB)": 153.57, "step": 8660, "token_acc": 0.9582016157358623, "train_speed(iter/s)": 0.315184 }, { "epoch": 3.3022103658536586, "grad_norm": 0.7631657719612122, "learning_rate": 2.5851488203048656e-05, "loss": 0.1207470417022705, "memory(GiB)": 153.57, "step": 8665, "token_acc": 0.9395763656633221, "train_speed(iter/s)": 0.315202 }, { "epoch": 3.3041158536585367, "grad_norm": 1.0912914276123047, "learning_rate": 2.579908757252465e-05, "loss": 0.09209309816360474, "memory(GiB)": 153.57, "step": 8670, "token_acc": 0.9559055118110236, "train_speed(iter/s)": 0.315218 }, { "epoch": 3.3060213414634148, "grad_norm": 0.7610796689987183, "learning_rate": 2.5746721631958505e-05, "loss": 0.12740758657455445, "memory(GiB)": 153.57, "step": 8675, "token_acc": 0.9463712267180475, "train_speed(iter/s)": 0.315186 }, { "epoch": 3.307926829268293, "grad_norm": 1.3919028043746948, "learning_rate": 2.569439045641236e-05, "loss": 0.13619982004165648, "memory(GiB)": 153.57, "step": 8680, "token_acc": 0.9367924528301886, "train_speed(iter/s)": 0.315202 }, { "epoch": 3.3098323170731705, "grad_norm": 2.3099019527435303, "learning_rate": 2.5642094120898537e-05, "loss": 0.1676067590713501, "memory(GiB)": 153.57, "step": 8685, "token_acc": 0.9251766217084136, "train_speed(iter/s)": 0.315221 }, { "epoch": 3.3117378048780486, "grad_norm": 1.428125023841858, "learning_rate": 2.5589832700379368e-05, "loss": 0.14665160179138184, "memory(GiB)": 153.57, "step": 8690, "token_acc": 0.9376647834274953, "train_speed(iter/s)": 0.315224 }, { "epoch": 3.3136432926829267, "grad_norm": 2.365090847015381, "learning_rate": 2.55376062697672e-05, "loss": 0.11821606159210205, "memory(GiB)": 153.57, "step": 8695, "token_acc": 0.9540133779264214, "train_speed(iter/s)": 0.315232 }, { "epoch": 3.3155487804878048, "grad_norm": 1.289071798324585, "learning_rate": 2.5485414903924176e-05, "loss": 0.09445210099220276, "memory(GiB)": 153.57, "step": 8700, "token_acc": 0.9591335417597141, "train_speed(iter/s)": 0.315241 }, { "epoch": 3.317454268292683, "grad_norm": 1.1702053546905518, "learning_rate": 2.5433258677662226e-05, "loss": 0.14362895488739014, "memory(GiB)": 153.57, "step": 8705, "token_acc": 0.9439795046968403, "train_speed(iter/s)": 0.315248 }, { "epoch": 3.319359756097561, "grad_norm": 2.3561439514160156, "learning_rate": 2.5381137665742843e-05, "loss": 0.13832825422286987, "memory(GiB)": 153.57, "step": 8710, "token_acc": 0.9468403074295474, "train_speed(iter/s)": 0.315214 }, { "epoch": 3.321265243902439, "grad_norm": 1.7571706771850586, "learning_rate": 2.53290519428771e-05, "loss": 0.12100759744644166, "memory(GiB)": 153.57, "step": 8715, "token_acc": 0.9484831139095592, "train_speed(iter/s)": 0.315226 }, { "epoch": 3.323170731707317, "grad_norm": 2.05378794670105, "learning_rate": 2.527700158372548e-05, "loss": 0.12921528816223143, "memory(GiB)": 153.57, "step": 8720, "token_acc": 0.9519397799652577, "train_speed(iter/s)": 0.315237 }, { "epoch": 3.3250762195121952, "grad_norm": 1.3434348106384277, "learning_rate": 2.522498666289777e-05, "loss": 0.11704028844833374, "memory(GiB)": 153.57, "step": 8725, "token_acc": 0.9310699588477366, "train_speed(iter/s)": 0.315264 }, { "epoch": 3.3269817073170733, "grad_norm": 1.368567705154419, "learning_rate": 2.517300725495292e-05, "loss": 0.14859567880630492, "memory(GiB)": 153.57, "step": 8730, "token_acc": 0.9416635302973279, "train_speed(iter/s)": 0.315298 }, { "epoch": 3.3288871951219514, "grad_norm": 1.8689196109771729, "learning_rate": 2.512106343439905e-05, "loss": 0.12865076065063477, "memory(GiB)": 153.57, "step": 8735, "token_acc": 0.9477439259544929, "train_speed(iter/s)": 0.315289 }, { "epoch": 3.3307926829268295, "grad_norm": 1.025342583656311, "learning_rate": 2.506915527569318e-05, "loss": 0.08779987096786498, "memory(GiB)": 153.57, "step": 8740, "token_acc": 0.9597054612395173, "train_speed(iter/s)": 0.31529 }, { "epoch": 3.332698170731707, "grad_norm": 0.620232343673706, "learning_rate": 2.5017282853241298e-05, "loss": 0.10863759517669677, "memory(GiB)": 153.57, "step": 8745, "token_acc": 0.9650291423813488, "train_speed(iter/s)": 0.315333 }, { "epoch": 3.3346036585365852, "grad_norm": 1.2202972173690796, "learning_rate": 2.4965446241398087e-05, "loss": 0.11951466798782348, "memory(GiB)": 153.57, "step": 8750, "token_acc": 0.9510164879141988, "train_speed(iter/s)": 0.315329 }, { "epoch": 3.3365091463414633, "grad_norm": 1.223583459854126, "learning_rate": 2.491364551446697e-05, "loss": 0.1256582498550415, "memory(GiB)": 153.57, "step": 8755, "token_acc": 0.9433301496338745, "train_speed(iter/s)": 0.315363 }, { "epoch": 3.3384146341463414, "grad_norm": 1.61128568649292, "learning_rate": 2.486188074669989e-05, "loss": 0.1512984037399292, "memory(GiB)": 153.57, "step": 8760, "token_acc": 0.9428810253552522, "train_speed(iter/s)": 0.315374 }, { "epoch": 3.3403201219512195, "grad_norm": 0.8889504671096802, "learning_rate": 2.4810152012297282e-05, "loss": 0.11798864603042603, "memory(GiB)": 153.57, "step": 8765, "token_acc": 0.9498819684038496, "train_speed(iter/s)": 0.315383 }, { "epoch": 3.3422256097560976, "grad_norm": 1.4105138778686523, "learning_rate": 2.4758459385407863e-05, "loss": 0.11821213960647584, "memory(GiB)": 153.57, "step": 8770, "token_acc": 0.9487521432653839, "train_speed(iter/s)": 0.31531 }, { "epoch": 3.3441310975609757, "grad_norm": 1.3198580741882324, "learning_rate": 2.470680294012867e-05, "loss": 0.19114887714385986, "memory(GiB)": 153.57, "step": 8775, "token_acc": 0.913633355393779, "train_speed(iter/s)": 0.31525 }, { "epoch": 3.346036585365854, "grad_norm": 1.0317524671554565, "learning_rate": 2.4655182750504806e-05, "loss": 0.11405005455017089, "memory(GiB)": 153.57, "step": 8780, "token_acc": 0.9562113279390766, "train_speed(iter/s)": 0.315273 }, { "epoch": 3.347942073170732, "grad_norm": 1.596534013748169, "learning_rate": 2.460359889052948e-05, "loss": 0.10830960273742676, "memory(GiB)": 153.57, "step": 8785, "token_acc": 0.9534296028880866, "train_speed(iter/s)": 0.315297 }, { "epoch": 3.3498475609756095, "grad_norm": 1.2634305953979492, "learning_rate": 2.4552051434143746e-05, "loss": 0.11304295063018799, "memory(GiB)": 153.57, "step": 8790, "token_acc": 0.9554062309102016, "train_speed(iter/s)": 0.315244 }, { "epoch": 3.3517530487804876, "grad_norm": 1.1486423015594482, "learning_rate": 2.4500540455236536e-05, "loss": 0.10925298929214478, "memory(GiB)": 153.57, "step": 8795, "token_acc": 0.9547836211915389, "train_speed(iter/s)": 0.315252 }, { "epoch": 3.3536585365853657, "grad_norm": 1.0818092823028564, "learning_rate": 2.4449066027644475e-05, "loss": 0.11968621015548705, "memory(GiB)": 153.57, "step": 8800, "token_acc": 0.951138147566719, "train_speed(iter/s)": 0.315255 }, { "epoch": 3.3536585365853657, "eval_loss": 0.16340140998363495, "eval_runtime": 32.4279, "eval_samples_per_second": 3.269, "eval_steps_per_second": 3.269, "eval_token_acc": 0.9222026653256223, "step": 8800 }, { "epoch": 3.355564024390244, "grad_norm": 1.8637758493423462, "learning_rate": 2.4397628225151808e-05, "loss": 0.11699256896972657, "memory(GiB)": 153.57, "step": 8805, "token_acc": 0.9275028286468592, "train_speed(iter/s)": 0.314908 }, { "epoch": 3.357469512195122, "grad_norm": 1.4878095388412476, "learning_rate": 2.4346227121490235e-05, "loss": 0.1167555570602417, "memory(GiB)": 153.57, "step": 8810, "token_acc": 0.9467505241090147, "train_speed(iter/s)": 0.314938 }, { "epoch": 3.359375, "grad_norm": 2.073875665664673, "learning_rate": 2.4294862790338917e-05, "loss": 0.1768298029899597, "memory(GiB)": 153.57, "step": 8815, "token_acc": 0.9357528200098088, "train_speed(iter/s)": 0.314947 }, { "epoch": 3.361280487804878, "grad_norm": 1.8091835975646973, "learning_rate": 2.4243535305324246e-05, "loss": 0.12880494594573974, "memory(GiB)": 153.57, "step": 8820, "token_acc": 0.941972920696325, "train_speed(iter/s)": 0.314978 }, { "epoch": 3.363185975609756, "grad_norm": 1.738289475440979, "learning_rate": 2.4192244740019847e-05, "loss": 0.11323372125625611, "memory(GiB)": 153.57, "step": 8825, "token_acc": 0.9549085744718623, "train_speed(iter/s)": 0.314971 }, { "epoch": 3.3650914634146343, "grad_norm": 1.8465698957443237, "learning_rate": 2.414099116794637e-05, "loss": 0.14196680784225463, "memory(GiB)": 153.57, "step": 8830, "token_acc": 0.9472259810554804, "train_speed(iter/s)": 0.314991 }, { "epoch": 3.3669969512195124, "grad_norm": 1.2383906841278076, "learning_rate": 2.4089774662571502e-05, "loss": 0.1337056875228882, "memory(GiB)": 153.57, "step": 8835, "token_acc": 0.9487615806390622, "train_speed(iter/s)": 0.314958 }, { "epoch": 3.3689024390243905, "grad_norm": 1.256777286529541, "learning_rate": 2.4038595297309713e-05, "loss": 0.11765336990356445, "memory(GiB)": 153.57, "step": 8840, "token_acc": 0.9420317952694843, "train_speed(iter/s)": 0.314972 }, { "epoch": 3.370807926829268, "grad_norm": 1.6787757873535156, "learning_rate": 2.398745314552236e-05, "loss": 0.15144952535629272, "memory(GiB)": 153.57, "step": 8845, "token_acc": 0.9388538524814917, "train_speed(iter/s)": 0.314984 }, { "epoch": 3.372713414634146, "grad_norm": 1.9294884204864502, "learning_rate": 2.393634828051733e-05, "loss": 0.15544005632400512, "memory(GiB)": 153.57, "step": 8850, "token_acc": 0.9362582781456954, "train_speed(iter/s)": 0.314965 }, { "epoch": 3.3746189024390243, "grad_norm": 1.2445846796035767, "learning_rate": 2.3885280775549152e-05, "loss": 0.1342936396598816, "memory(GiB)": 153.57, "step": 8855, "token_acc": 0.939421175624663, "train_speed(iter/s)": 0.314935 }, { "epoch": 3.3765243902439024, "grad_norm": 1.4426624774932861, "learning_rate": 2.383425070381874e-05, "loss": 0.16345707178115845, "memory(GiB)": 153.57, "step": 8860, "token_acc": 0.9302765303710707, "train_speed(iter/s)": 0.314952 }, { "epoch": 3.3784298780487805, "grad_norm": 1.5787631273269653, "learning_rate": 2.378325813847341e-05, "loss": 0.14858741760253907, "memory(GiB)": 153.57, "step": 8865, "token_acc": 0.9476773864216437, "train_speed(iter/s)": 0.314979 }, { "epoch": 3.3803353658536586, "grad_norm": 1.5326404571533203, "learning_rate": 2.3732303152606638e-05, "loss": 0.1392128586769104, "memory(GiB)": 153.57, "step": 8870, "token_acc": 0.94349754844735, "train_speed(iter/s)": 0.31499 }, { "epoch": 3.3822408536585367, "grad_norm": 1.0936602354049683, "learning_rate": 2.3681385819258133e-05, "loss": 0.1455858826637268, "memory(GiB)": 153.57, "step": 8875, "token_acc": 0.9398090493980905, "train_speed(iter/s)": 0.315007 }, { "epoch": 3.3841463414634148, "grad_norm": 1.463751196861267, "learning_rate": 2.363050621141354e-05, "loss": 0.14095394611358641, "memory(GiB)": 153.57, "step": 8880, "token_acc": 0.9439484126984127, "train_speed(iter/s)": 0.315012 }, { "epoch": 3.386051829268293, "grad_norm": 0.9952589869499207, "learning_rate": 2.3579664402004475e-05, "loss": 0.1254573106765747, "memory(GiB)": 153.57, "step": 8885, "token_acc": 0.9520860495436767, "train_speed(iter/s)": 0.315038 }, { "epoch": 3.3879573170731705, "grad_norm": 1.14264714717865, "learning_rate": 2.3528860463908376e-05, "loss": 0.10351589918136597, "memory(GiB)": 153.57, "step": 8890, "token_acc": 0.9564765300059418, "train_speed(iter/s)": 0.315035 }, { "epoch": 3.3898628048780486, "grad_norm": 1.043481469154358, "learning_rate": 2.3478094469948392e-05, "loss": 0.1001779556274414, "memory(GiB)": 153.57, "step": 8895, "token_acc": 0.9590239410681399, "train_speed(iter/s)": 0.315046 }, { "epoch": 3.3917682926829267, "grad_norm": 1.559093952178955, "learning_rate": 2.3427366492893245e-05, "loss": 0.17002387046813966, "memory(GiB)": 153.57, "step": 8900, "token_acc": 0.9399387911247131, "train_speed(iter/s)": 0.315072 }, { "epoch": 3.3936737804878048, "grad_norm": 1.382629156112671, "learning_rate": 2.3376676605457233e-05, "loss": 0.17088181972503663, "memory(GiB)": 153.57, "step": 8905, "token_acc": 0.9350447296771685, "train_speed(iter/s)": 0.315095 }, { "epoch": 3.395579268292683, "grad_norm": 2.7601776123046875, "learning_rate": 2.332602488029997e-05, "loss": 0.13208487033843994, "memory(GiB)": 153.57, "step": 8910, "token_acc": 0.9430284857571214, "train_speed(iter/s)": 0.315123 }, { "epoch": 3.397484756097561, "grad_norm": 1.7196298837661743, "learning_rate": 2.3275411390026452e-05, "loss": 0.1446629524230957, "memory(GiB)": 153.57, "step": 8915, "token_acc": 0.940856313497823, "train_speed(iter/s)": 0.315153 }, { "epoch": 3.399390243902439, "grad_norm": 1.9929014444351196, "learning_rate": 2.32248362071868e-05, "loss": 0.14587805271148682, "memory(GiB)": 153.57, "step": 8920, "token_acc": 0.9414676135140951, "train_speed(iter/s)": 0.315117 }, { "epoch": 3.401295731707317, "grad_norm": 1.5845555067062378, "learning_rate": 2.3174299404276262e-05, "loss": 0.1029086947441101, "memory(GiB)": 153.57, "step": 8925, "token_acc": 0.9566708792200759, "train_speed(iter/s)": 0.31511 }, { "epoch": 3.4032012195121952, "grad_norm": 1.1310677528381348, "learning_rate": 2.312380105373508e-05, "loss": 0.1306721806526184, "memory(GiB)": 153.57, "step": 8930, "token_acc": 0.945136655948553, "train_speed(iter/s)": 0.315123 }, { "epoch": 3.4051067073170733, "grad_norm": 2.3968093395233154, "learning_rate": 2.3073341227948357e-05, "loss": 0.1329101800918579, "memory(GiB)": 153.57, "step": 8935, "token_acc": 0.9529856795460686, "train_speed(iter/s)": 0.315156 }, { "epoch": 3.4070121951219514, "grad_norm": 1.3431278467178345, "learning_rate": 2.3022919999245962e-05, "loss": 0.14033018350601195, "memory(GiB)": 153.57, "step": 8940, "token_acc": 0.9436270939548435, "train_speed(iter/s)": 0.315161 }, { "epoch": 3.4089176829268295, "grad_norm": 1.2122167348861694, "learning_rate": 2.2972537439902487e-05, "loss": 0.15018625259399415, "memory(GiB)": 153.57, "step": 8945, "token_acc": 0.9416787790697675, "train_speed(iter/s)": 0.315164 }, { "epoch": 3.410823170731707, "grad_norm": 1.712294101715088, "learning_rate": 2.2922193622137016e-05, "loss": 0.1319315791130066, "memory(GiB)": 153.57, "step": 8950, "token_acc": 0.9466584917228694, "train_speed(iter/s)": 0.315181 }, { "epoch": 3.4127286585365852, "grad_norm": 1.3839248418807983, "learning_rate": 2.2871888618113197e-05, "loss": 0.1254754424095154, "memory(GiB)": 153.57, "step": 8955, "token_acc": 0.9490066225165563, "train_speed(iter/s)": 0.315199 }, { "epoch": 3.4146341463414633, "grad_norm": 3.837480306625366, "learning_rate": 2.282162249993895e-05, "loss": 0.139333176612854, "memory(GiB)": 153.57, "step": 8960, "token_acc": 0.9372098705106279, "train_speed(iter/s)": 0.315224 }, { "epoch": 3.4165396341463414, "grad_norm": 1.299329161643982, "learning_rate": 2.2771395339666514e-05, "loss": 0.10019296407699585, "memory(GiB)": 153.57, "step": 8965, "token_acc": 0.9535911602209944, "train_speed(iter/s)": 0.315234 }, { "epoch": 3.4184451219512195, "grad_norm": 1.3530842065811157, "learning_rate": 2.2721207209292284e-05, "loss": 0.14037709236145018, "memory(GiB)": 153.57, "step": 8970, "token_acc": 0.9408369408369408, "train_speed(iter/s)": 0.31516 }, { "epoch": 3.4203506097560976, "grad_norm": 1.8900891542434692, "learning_rate": 2.2671058180756667e-05, "loss": 0.10397248268127442, "memory(GiB)": 153.57, "step": 8975, "token_acc": 0.9698843775208389, "train_speed(iter/s)": 0.315179 }, { "epoch": 3.4222560975609757, "grad_norm": 1.7675398588180542, "learning_rate": 2.262094832594405e-05, "loss": 0.1763313889503479, "memory(GiB)": 153.57, "step": 8980, "token_acc": 0.931566265060241, "train_speed(iter/s)": 0.315202 }, { "epoch": 3.424161585365854, "grad_norm": 1.706864595413208, "learning_rate": 2.2570877716682703e-05, "loss": 0.13844659328460693, "memory(GiB)": 153.57, "step": 8985, "token_acc": 0.9357694053563322, "train_speed(iter/s)": 0.315184 }, { "epoch": 3.426067073170732, "grad_norm": 1.202454686164856, "learning_rate": 2.2520846424744545e-05, "loss": 0.14239177703857422, "memory(GiB)": 153.57, "step": 8990, "token_acc": 0.9377005347593583, "train_speed(iter/s)": 0.315199 }, { "epoch": 3.4279725609756095, "grad_norm": 2.022578239440918, "learning_rate": 2.247085452184525e-05, "loss": 0.17486990690231324, "memory(GiB)": 153.57, "step": 8995, "token_acc": 0.9185969556585043, "train_speed(iter/s)": 0.315208 }, { "epoch": 3.4298780487804876, "grad_norm": 1.4716941118240356, "learning_rate": 2.242090207964393e-05, "loss": 0.13175266981124878, "memory(GiB)": 153.57, "step": 9000, "token_acc": 0.9414362519201229, "train_speed(iter/s)": 0.315211 }, { "epoch": 3.4317835365853657, "grad_norm": 1.2095222473144531, "learning_rate": 2.2370989169743197e-05, "loss": 0.13511110544204713, "memory(GiB)": 153.57, "step": 9005, "token_acc": 0.9409815696831643, "train_speed(iter/s)": 0.315214 }, { "epoch": 3.433689024390244, "grad_norm": 1.6322389841079712, "learning_rate": 2.2321115863689003e-05, "loss": 0.1496240973472595, "memory(GiB)": 153.57, "step": 9010, "token_acc": 0.9436889077773055, "train_speed(iter/s)": 0.315185 }, { "epoch": 3.435594512195122, "grad_norm": 2.1517622470855713, "learning_rate": 2.2271282232970458e-05, "loss": 0.13385870456695556, "memory(GiB)": 153.57, "step": 9015, "token_acc": 0.9456902592852138, "train_speed(iter/s)": 0.315214 }, { "epoch": 3.4375, "grad_norm": 1.7635605335235596, "learning_rate": 2.2221488349019903e-05, "loss": 0.11977691650390625, "memory(GiB)": 153.57, "step": 9020, "token_acc": 0.9529494382022472, "train_speed(iter/s)": 0.315234 }, { "epoch": 3.439405487804878, "grad_norm": 1.4232138395309448, "learning_rate": 2.2171734283212608e-05, "loss": 0.12821455001831056, "memory(GiB)": 153.57, "step": 9025, "token_acc": 0.9437994722955145, "train_speed(iter/s)": 0.315259 }, { "epoch": 3.441310975609756, "grad_norm": 2.2713234424591064, "learning_rate": 2.2122020106866824e-05, "loss": 0.1336764097213745, "memory(GiB)": 153.57, "step": 9030, "token_acc": 0.944393466691136, "train_speed(iter/s)": 0.315259 }, { "epoch": 3.4432164634146343, "grad_norm": 2.100065231323242, "learning_rate": 2.2072345891243644e-05, "loss": 0.12859123945236206, "memory(GiB)": 153.57, "step": 9035, "token_acc": 0.9498272287491362, "train_speed(iter/s)": 0.315254 }, { "epoch": 3.4451219512195124, "grad_norm": 0.8036344051361084, "learning_rate": 2.20227117075468e-05, "loss": 0.103058123588562, "memory(GiB)": 153.57, "step": 9040, "token_acc": 0.9560229445506692, "train_speed(iter/s)": 0.315244 }, { "epoch": 3.4470274390243905, "grad_norm": 1.8794971704483032, "learning_rate": 2.1973117626922713e-05, "loss": 0.12968981266021729, "memory(GiB)": 153.57, "step": 9045, "token_acc": 0.9434169969176575, "train_speed(iter/s)": 0.315264 }, { "epoch": 3.448932926829268, "grad_norm": 0.8568743467330933, "learning_rate": 2.1923563720460316e-05, "loss": 0.13257490396499633, "memory(GiB)": 153.57, "step": 9050, "token_acc": 0.9441329179646937, "train_speed(iter/s)": 0.315264 }, { "epoch": 3.450838414634146, "grad_norm": 1.0793404579162598, "learning_rate": 2.1874050059190894e-05, "loss": 0.14166663885116576, "memory(GiB)": 153.57, "step": 9055, "token_acc": 0.9407496977025392, "train_speed(iter/s)": 0.315223 }, { "epoch": 3.4527439024390243, "grad_norm": 1.0831483602523804, "learning_rate": 2.1824576714088123e-05, "loss": 0.10705820322036744, "memory(GiB)": 153.57, "step": 9060, "token_acc": 0.9548598471059337, "train_speed(iter/s)": 0.315195 }, { "epoch": 3.4546493902439024, "grad_norm": 1.4062113761901855, "learning_rate": 2.1775143756067824e-05, "loss": 0.13775646686553955, "memory(GiB)": 153.57, "step": 9065, "token_acc": 0.9421754953497776, "train_speed(iter/s)": 0.315189 }, { "epoch": 3.4565548780487805, "grad_norm": 1.8547838926315308, "learning_rate": 2.1725751255987963e-05, "loss": 0.13667328357696534, "memory(GiB)": 153.57, "step": 9070, "token_acc": 0.9351685104193466, "train_speed(iter/s)": 0.315203 }, { "epoch": 3.4584603658536586, "grad_norm": 1.1968729496002197, "learning_rate": 2.1676399284648524e-05, "loss": 0.10814080238342286, "memory(GiB)": 153.57, "step": 9075, "token_acc": 0.9545836837678722, "train_speed(iter/s)": 0.315235 }, { "epoch": 3.4603658536585367, "grad_norm": 2.155623435974121, "learning_rate": 2.162708791279135e-05, "loss": 0.1985549211502075, "memory(GiB)": 153.57, "step": 9080, "token_acc": 0.9142519219396806, "train_speed(iter/s)": 0.315257 }, { "epoch": 3.4622713414634148, "grad_norm": 1.2915196418762207, "learning_rate": 2.1577817211100128e-05, "loss": 0.15732212066650392, "memory(GiB)": 153.57, "step": 9085, "token_acc": 0.9431731502669718, "train_speed(iter/s)": 0.315252 }, { "epoch": 3.464176829268293, "grad_norm": 1.0970711708068848, "learning_rate": 2.1528587250200248e-05, "loss": 0.11228183507919312, "memory(GiB)": 153.57, "step": 9090, "token_acc": 0.960955710955711, "train_speed(iter/s)": 0.315205 }, { "epoch": 3.4660823170731705, "grad_norm": 2.12918758392334, "learning_rate": 2.147939810065866e-05, "loss": 0.15259385108947754, "memory(GiB)": 153.57, "step": 9095, "token_acc": 0.9380831212892281, "train_speed(iter/s)": 0.315245 }, { "epoch": 3.4679878048780486, "grad_norm": 1.3640626668930054, "learning_rate": 2.1430249832983873e-05, "loss": 0.12700788974761962, "memory(GiB)": 153.57, "step": 9100, "token_acc": 0.946676231468197, "train_speed(iter/s)": 0.31526 }, { "epoch": 3.4698932926829267, "grad_norm": 1.188637137413025, "learning_rate": 2.1381142517625736e-05, "loss": 0.10651180744171143, "memory(GiB)": 153.57, "step": 9105, "token_acc": 0.9532019704433498, "train_speed(iter/s)": 0.315265 }, { "epoch": 3.4717987804878048, "grad_norm": 1.9598772525787354, "learning_rate": 2.1332076224975457e-05, "loss": 0.17402836084365844, "memory(GiB)": 153.57, "step": 9110, "token_acc": 0.9332146037399822, "train_speed(iter/s)": 0.315236 }, { "epoch": 3.473704268292683, "grad_norm": 1.0841922760009766, "learning_rate": 2.128305102536537e-05, "loss": 0.13262943029403687, "memory(GiB)": 153.57, "step": 9115, "token_acc": 0.9459108048368222, "train_speed(iter/s)": 0.315231 }, { "epoch": 3.475609756097561, "grad_norm": 1.260262370109558, "learning_rate": 2.1234066989068972e-05, "loss": 0.12751134634017944, "memory(GiB)": 153.57, "step": 9120, "token_acc": 0.9436020218143123, "train_speed(iter/s)": 0.315259 }, { "epoch": 3.477515243902439, "grad_norm": 1.7203400135040283, "learning_rate": 2.1185124186300726e-05, "loss": 0.17690620422363282, "memory(GiB)": 153.57, "step": 9125, "token_acc": 0.9192286193404137, "train_speed(iter/s)": 0.315284 }, { "epoch": 3.479420731707317, "grad_norm": 1.5799440145492554, "learning_rate": 2.113622268721601e-05, "loss": 0.1133494257926941, "memory(GiB)": 153.57, "step": 9130, "token_acc": 0.9387052341597796, "train_speed(iter/s)": 0.315314 }, { "epoch": 3.4813262195121952, "grad_norm": 1.3387322425842285, "learning_rate": 2.108736256191095e-05, "loss": 0.16127970218658447, "memory(GiB)": 153.57, "step": 9135, "token_acc": 0.9327460249247959, "train_speed(iter/s)": 0.315332 }, { "epoch": 3.4832317073170733, "grad_norm": 1.1406440734863281, "learning_rate": 2.103854388042243e-05, "loss": 0.08425362110137939, "memory(GiB)": 153.57, "step": 9140, "token_acc": 0.9497008641701751, "train_speed(iter/s)": 0.315346 }, { "epoch": 3.4851371951219514, "grad_norm": 1.8120265007019043, "learning_rate": 2.0989766712727848e-05, "loss": 0.15765851736068726, "memory(GiB)": 153.57, "step": 9145, "token_acc": 0.9276157804459692, "train_speed(iter/s)": 0.315339 }, { "epoch": 3.4870426829268295, "grad_norm": 2.310845375061035, "learning_rate": 2.0941031128745196e-05, "loss": 0.16346664428710939, "memory(GiB)": 153.57, "step": 9150, "token_acc": 0.9344262295081968, "train_speed(iter/s)": 0.315363 }, { "epoch": 3.488948170731707, "grad_norm": 0.9319409728050232, "learning_rate": 2.0892337198332756e-05, "loss": 0.07884341478347778, "memory(GiB)": 153.57, "step": 9155, "token_acc": 0.9604588723456188, "train_speed(iter/s)": 0.315384 }, { "epoch": 3.4908536585365852, "grad_norm": 1.6170521974563599, "learning_rate": 2.0843684991289175e-05, "loss": 0.20753929615020753, "memory(GiB)": 153.57, "step": 9160, "token_acc": 0.9141615356754799, "train_speed(iter/s)": 0.315356 }, { "epoch": 3.4927591463414633, "grad_norm": 2.2304952144622803, "learning_rate": 2.079507457735327e-05, "loss": 0.16047122478485107, "memory(GiB)": 153.57, "step": 9165, "token_acc": 0.9296497584541062, "train_speed(iter/s)": 0.315384 }, { "epoch": 3.4946646341463414, "grad_norm": 1.1971718072891235, "learning_rate": 2.0746506026203972e-05, "loss": 0.0996301829814911, "memory(GiB)": 153.57, "step": 9170, "token_acc": 0.9546442151004889, "train_speed(iter/s)": 0.315366 }, { "epoch": 3.4965701219512195, "grad_norm": 1.2547656297683716, "learning_rate": 2.0697979407460137e-05, "loss": 0.13788585662841796, "memory(GiB)": 153.57, "step": 9175, "token_acc": 0.9413359148112295, "train_speed(iter/s)": 0.315373 }, { "epoch": 3.4984756097560976, "grad_norm": 1.0229442119598389, "learning_rate": 2.0649494790680617e-05, "loss": 0.13085498809814453, "memory(GiB)": 153.57, "step": 9180, "token_acc": 0.9466019417475728, "train_speed(iter/s)": 0.315362 }, { "epoch": 3.5003810975609757, "grad_norm": 1.3001161813735962, "learning_rate": 2.060105224536395e-05, "loss": 0.1482514262199402, "memory(GiB)": 153.57, "step": 9185, "token_acc": 0.9426893479141353, "train_speed(iter/s)": 0.315313 }, { "epoch": 3.502286585365854, "grad_norm": 0.9398359656333923, "learning_rate": 2.0552651840948472e-05, "loss": 0.0900155782699585, "memory(GiB)": 153.57, "step": 9190, "token_acc": 0.9575988565983802, "train_speed(iter/s)": 0.315325 }, { "epoch": 3.5041920731707314, "grad_norm": 1.1275955438613892, "learning_rate": 2.050429364681201e-05, "loss": 0.13178261518478393, "memory(GiB)": 153.57, "step": 9195, "token_acc": 0.9437795713888116, "train_speed(iter/s)": 0.315352 }, { "epoch": 3.5060975609756095, "grad_norm": 2.233680248260498, "learning_rate": 2.0455977732271993e-05, "loss": 0.15582740306854248, "memory(GiB)": 153.57, "step": 9200, "token_acc": 0.9471617863569443, "train_speed(iter/s)": 0.31531 }, { "epoch": 3.5080030487804876, "grad_norm": 1.8786373138427734, "learning_rate": 2.0407704166585118e-05, "loss": 0.13598541021347046, "memory(GiB)": 153.57, "step": 9205, "token_acc": 0.9516390782213567, "train_speed(iter/s)": 0.315324 }, { "epoch": 3.5099085365853657, "grad_norm": 1.2644833326339722, "learning_rate": 2.0359473018947537e-05, "loss": 0.11990734338760375, "memory(GiB)": 153.57, "step": 9210, "token_acc": 0.9474151714336472, "train_speed(iter/s)": 0.315333 }, { "epoch": 3.511814024390244, "grad_norm": 1.4301514625549316, "learning_rate": 2.0311284358494463e-05, "loss": 0.10223643779754639, "memory(GiB)": 153.57, "step": 9215, "token_acc": 0.9563675062078751, "train_speed(iter/s)": 0.315262 }, { "epoch": 3.513719512195122, "grad_norm": 2.0038771629333496, "learning_rate": 2.0263138254300283e-05, "loss": 0.1475292205810547, "memory(GiB)": 153.57, "step": 9220, "token_acc": 0.9342963653308481, "train_speed(iter/s)": 0.315292 }, { "epoch": 3.515625, "grad_norm": 1.3784104585647583, "learning_rate": 2.0215034775378332e-05, "loss": 0.12260894775390625, "memory(GiB)": 153.57, "step": 9225, "token_acc": 0.9476403720289356, "train_speed(iter/s)": 0.315324 }, { "epoch": 3.517530487804878, "grad_norm": 1.105149745941162, "learning_rate": 2.016697399068091e-05, "loss": 0.14450700283050538, "memory(GiB)": 153.57, "step": 9230, "token_acc": 0.9400134801168277, "train_speed(iter/s)": 0.315338 }, { "epoch": 3.519435975609756, "grad_norm": 1.1235864162445068, "learning_rate": 2.0118955969099033e-05, "loss": 0.125530207157135, "memory(GiB)": 153.57, "step": 9235, "token_acc": 0.9462254395036195, "train_speed(iter/s)": 0.315349 }, { "epoch": 3.5213414634146343, "grad_norm": 1.1017069816589355, "learning_rate": 2.007098077946251e-05, "loss": 0.09729762673377991, "memory(GiB)": 153.57, "step": 9240, "token_acc": 0.9566677326511033, "train_speed(iter/s)": 0.315345 }, { "epoch": 3.5232469512195124, "grad_norm": 1.6202446222305298, "learning_rate": 2.0023048490539676e-05, "loss": 0.12721346616744994, "memory(GiB)": 153.57, "step": 9245, "token_acc": 0.946376811594203, "train_speed(iter/s)": 0.315279 }, { "epoch": 3.5251524390243905, "grad_norm": 1.3536185026168823, "learning_rate": 1.997515917103743e-05, "loss": 0.11073062419891358, "memory(GiB)": 153.57, "step": 9250, "token_acc": 0.9584552369806905, "train_speed(iter/s)": 0.3153 }, { "epoch": 3.5270579268292686, "grad_norm": 1.2785640954971313, "learning_rate": 1.9927312889601036e-05, "loss": 0.1000365972518921, "memory(GiB)": 153.57, "step": 9255, "token_acc": 0.9467174119885823, "train_speed(iter/s)": 0.315322 }, { "epoch": 3.528963414634146, "grad_norm": 1.1043310165405273, "learning_rate": 1.9879509714814126e-05, "loss": 0.11559239625930787, "memory(GiB)": 153.57, "step": 9260, "token_acc": 0.9515564882826163, "train_speed(iter/s)": 0.31533 }, { "epoch": 3.5308689024390243, "grad_norm": 0.9448847770690918, "learning_rate": 1.983174971519845e-05, "loss": 0.12441079616546631, "memory(GiB)": 153.57, "step": 9265, "token_acc": 0.9526871841984382, "train_speed(iter/s)": 0.315323 }, { "epoch": 3.5327743902439024, "grad_norm": 2.1997787952423096, "learning_rate": 1.9784032959213967e-05, "loss": 0.13497296571731568, "memory(GiB)": 153.57, "step": 9270, "token_acc": 0.9466463414634146, "train_speed(iter/s)": 0.315321 }, { "epoch": 3.5346798780487805, "grad_norm": 1.001599669456482, "learning_rate": 1.9736359515258562e-05, "loss": 0.12213469743728637, "memory(GiB)": 153.57, "step": 9275, "token_acc": 0.9446524064171123, "train_speed(iter/s)": 0.315251 }, { "epoch": 3.5365853658536586, "grad_norm": 1.395804762840271, "learning_rate": 1.9688729451668114e-05, "loss": 0.13413703441619873, "memory(GiB)": 153.57, "step": 9280, "token_acc": 0.9362574944777532, "train_speed(iter/s)": 0.315275 }, { "epoch": 3.5384908536585367, "grad_norm": 1.6871453523635864, "learning_rate": 1.9641142836716258e-05, "loss": 0.11618226766586304, "memory(GiB)": 153.57, "step": 9285, "token_acc": 0.9507929104477612, "train_speed(iter/s)": 0.315256 }, { "epoch": 3.5403963414634148, "grad_norm": 1.511337161064148, "learning_rate": 1.9593599738614382e-05, "loss": 0.13625757694244384, "memory(GiB)": 153.57, "step": 9290, "token_acc": 0.9429520072441896, "train_speed(iter/s)": 0.315223 }, { "epoch": 3.542301829268293, "grad_norm": 1.1867032051086426, "learning_rate": 1.9546100225511493e-05, "loss": 0.15710177421569824, "memory(GiB)": 153.57, "step": 9295, "token_acc": 0.9473590292648109, "train_speed(iter/s)": 0.315213 }, { "epoch": 3.5442073170731705, "grad_norm": 1.3889262676239014, "learning_rate": 1.9498644365494144e-05, "loss": 0.10249950885772705, "memory(GiB)": 153.57, "step": 9300, "token_acc": 0.9494224164845457, "train_speed(iter/s)": 0.315239 }, { "epoch": 3.5461128048780486, "grad_norm": 1.442673921585083, "learning_rate": 1.945123222658625e-05, "loss": 0.10498849153518677, "memory(GiB)": 153.57, "step": 9305, "token_acc": 0.9475818129163046, "train_speed(iter/s)": 0.31525 }, { "epoch": 3.5480182926829267, "grad_norm": 2.4200539588928223, "learning_rate": 1.9403863876749124e-05, "loss": 0.1065626859664917, "memory(GiB)": 153.57, "step": 9310, "token_acc": 0.9588502269288957, "train_speed(iter/s)": 0.315223 }, { "epoch": 3.5499237804878048, "grad_norm": 2.049487590789795, "learning_rate": 1.935653938388125e-05, "loss": 0.14384814500808715, "memory(GiB)": 153.57, "step": 9315, "token_acc": 0.9483249397854171, "train_speed(iter/s)": 0.315235 }, { "epoch": 3.551829268292683, "grad_norm": 1.3155640363693237, "learning_rate": 1.9309258815818326e-05, "loss": 0.0929718017578125, "memory(GiB)": 153.57, "step": 9320, "token_acc": 0.9588926174496645, "train_speed(iter/s)": 0.315245 }, { "epoch": 3.553734756097561, "grad_norm": 1.4483519792556763, "learning_rate": 1.9262022240332998e-05, "loss": 0.1401067018508911, "memory(GiB)": 153.57, "step": 9325, "token_acc": 0.9439566598169251, "train_speed(iter/s)": 0.315252 }, { "epoch": 3.555640243902439, "grad_norm": 0.8245370388031006, "learning_rate": 1.9214829725134915e-05, "loss": 0.1260592222213745, "memory(GiB)": 153.57, "step": 9330, "token_acc": 0.9439465536907283, "train_speed(iter/s)": 0.315253 }, { "epoch": 3.557545731707317, "grad_norm": 2.1642181873321533, "learning_rate": 1.916768133787054e-05, "loss": 0.13385848999023436, "memory(GiB)": 153.57, "step": 9335, "token_acc": 0.9533921676778819, "train_speed(iter/s)": 0.315277 }, { "epoch": 3.5594512195121952, "grad_norm": 1.5048781633377075, "learning_rate": 1.9120577146123126e-05, "loss": 0.1483513355255127, "memory(GiB)": 153.57, "step": 9340, "token_acc": 0.9387931034482758, "train_speed(iter/s)": 0.315289 }, { "epoch": 3.5613567073170733, "grad_norm": 1.7871955633163452, "learning_rate": 1.9073517217412496e-05, "loss": 0.11920788288116455, "memory(GiB)": 153.57, "step": 9345, "token_acc": 0.9449132303773284, "train_speed(iter/s)": 0.315293 }, { "epoch": 3.5632621951219514, "grad_norm": 1.622061014175415, "learning_rate": 1.902650161919511e-05, "loss": 0.11272703409194947, "memory(GiB)": 153.57, "step": 9350, "token_acc": 0.9506776271939569, "train_speed(iter/s)": 0.315291 }, { "epoch": 3.5651676829268295, "grad_norm": 1.153550624847412, "learning_rate": 1.8979530418863805e-05, "loss": 0.13754942417144775, "memory(GiB)": 153.57, "step": 9355, "token_acc": 0.9345657276995305, "train_speed(iter/s)": 0.315317 }, { "epoch": 3.567073170731707, "grad_norm": 1.257258415222168, "learning_rate": 1.893260368374786e-05, "loss": 0.14358898401260375, "memory(GiB)": 153.57, "step": 9360, "token_acc": 0.9493044822256569, "train_speed(iter/s)": 0.315294 }, { "epoch": 3.5689786585365852, "grad_norm": 0.9448966383934021, "learning_rate": 1.8885721481112734e-05, "loss": 0.13097114562988282, "memory(GiB)": 153.57, "step": 9365, "token_acc": 0.9499930157843274, "train_speed(iter/s)": 0.315295 }, { "epoch": 3.5708841463414633, "grad_norm": 2.2280893325805664, "learning_rate": 1.8838883878160114e-05, "loss": 0.14424471855163573, "memory(GiB)": 153.57, "step": 9370, "token_acc": 0.9362549800796812, "train_speed(iter/s)": 0.315316 }, { "epoch": 3.5727896341463414, "grad_norm": 1.4215748310089111, "learning_rate": 1.8792090942027735e-05, "loss": 0.1584327220916748, "memory(GiB)": 153.57, "step": 9375, "token_acc": 0.9403728704596592, "train_speed(iter/s)": 0.315326 }, { "epoch": 3.5746951219512195, "grad_norm": 0.5627345442771912, "learning_rate": 1.8745342739789323e-05, "loss": 0.08233664631843567, "memory(GiB)": 153.57, "step": 9380, "token_acc": 0.9526420737786641, "train_speed(iter/s)": 0.315358 }, { "epoch": 3.5766006097560976, "grad_norm": 1.8843233585357666, "learning_rate": 1.869863933845443e-05, "loss": 0.16048389673233032, "memory(GiB)": 153.57, "step": 9385, "token_acc": 0.9302042060347455, "train_speed(iter/s)": 0.315377 }, { "epoch": 3.5785060975609757, "grad_norm": 1.215389609336853, "learning_rate": 1.8651980804968467e-05, "loss": 0.09636815786361694, "memory(GiB)": 153.57, "step": 9390, "token_acc": 0.9526076606795066, "train_speed(iter/s)": 0.315375 }, { "epoch": 3.580411585365854, "grad_norm": 1.3257660865783691, "learning_rate": 1.8605367206212447e-05, "loss": 0.1311952590942383, "memory(GiB)": 153.57, "step": 9395, "token_acc": 0.9535389482008699, "train_speed(iter/s)": 0.315333 }, { "epoch": 3.5823170731707314, "grad_norm": 1.3370816707611084, "learning_rate": 1.8558798609003052e-05, "loss": 0.12346526384353637, "memory(GiB)": 153.57, "step": 9400, "token_acc": 0.9459561602418746, "train_speed(iter/s)": 0.315337 }, { "epoch": 3.5842225609756095, "grad_norm": 1.699508547782898, "learning_rate": 1.851227508009239e-05, "loss": 0.1538402557373047, "memory(GiB)": 153.57, "step": 9405, "token_acc": 0.9391952309985097, "train_speed(iter/s)": 0.315356 }, { "epoch": 3.5861280487804876, "grad_norm": 1.1466703414916992, "learning_rate": 1.8465796686168012e-05, "loss": 0.11803864240646363, "memory(GiB)": 153.57, "step": 9410, "token_acc": 0.949159408547701, "train_speed(iter/s)": 0.31536 }, { "epoch": 3.5880335365853657, "grad_norm": 1.4377665519714355, "learning_rate": 1.841936349385276e-05, "loss": 0.15624843835830687, "memory(GiB)": 153.57, "step": 9415, "token_acc": 0.9402020871293689, "train_speed(iter/s)": 0.315353 }, { "epoch": 3.589939024390244, "grad_norm": 1.3724440336227417, "learning_rate": 1.8372975569704697e-05, "loss": 0.11087644100189209, "memory(GiB)": 153.57, "step": 9420, "token_acc": 0.9527428949107734, "train_speed(iter/s)": 0.315323 }, { "epoch": 3.591844512195122, "grad_norm": 0.9930050373077393, "learning_rate": 1.832663298021695e-05, "loss": 0.14536055326461791, "memory(GiB)": 153.57, "step": 9425, "token_acc": 0.9537117903930131, "train_speed(iter/s)": 0.315338 }, { "epoch": 3.59375, "grad_norm": 2.3027918338775635, "learning_rate": 1.8280335791817733e-05, "loss": 0.13776137828826904, "memory(GiB)": 153.57, "step": 9430, "token_acc": 0.9436544137375905, "train_speed(iter/s)": 0.315311 }, { "epoch": 3.595655487804878, "grad_norm": 2.566782236099243, "learning_rate": 1.8234084070870107e-05, "loss": 0.16008563041687013, "memory(GiB)": 153.57, "step": 9435, "token_acc": 0.933602965958881, "train_speed(iter/s)": 0.315323 }, { "epoch": 3.597560975609756, "grad_norm": 1.7107137441635132, "learning_rate": 1.818787788367202e-05, "loss": 0.13388592004776, "memory(GiB)": 153.57, "step": 9440, "token_acc": 0.94384, "train_speed(iter/s)": 0.315322 }, { "epoch": 3.5994664634146343, "grad_norm": 1.360084891319275, "learning_rate": 1.8141717296456106e-05, "loss": 0.11229424476623535, "memory(GiB)": 153.57, "step": 9445, "token_acc": 0.9554159186116098, "train_speed(iter/s)": 0.315325 }, { "epoch": 3.6013719512195124, "grad_norm": 1.1216161251068115, "learning_rate": 1.8095602375389687e-05, "loss": 0.09846707582473754, "memory(GiB)": 153.57, "step": 9450, "token_acc": 0.9494857142857143, "train_speed(iter/s)": 0.315335 }, { "epoch": 3.6032774390243905, "grad_norm": 1.4330477714538574, "learning_rate": 1.8049533186574545e-05, "loss": 0.10834307670593261, "memory(GiB)": 153.57, "step": 9455, "token_acc": 0.9519943522767385, "train_speed(iter/s)": 0.315334 }, { "epoch": 3.6051829268292686, "grad_norm": 1.4567190408706665, "learning_rate": 1.8003509796047025e-05, "loss": 0.12891201972961425, "memory(GiB)": 153.57, "step": 9460, "token_acc": 0.9451338994121489, "train_speed(iter/s)": 0.315348 }, { "epoch": 3.607088414634146, "grad_norm": 1.2764747142791748, "learning_rate": 1.795753226977771e-05, "loss": 0.12083613872528076, "memory(GiB)": 153.57, "step": 9465, "token_acc": 0.95556640625, "train_speed(iter/s)": 0.315358 }, { "epoch": 3.6089939024390243, "grad_norm": 0.7442162036895752, "learning_rate": 1.791160067367152e-05, "loss": 0.09272769689559937, "memory(GiB)": 153.57, "step": 9470, "token_acc": 0.9535318659185729, "train_speed(iter/s)": 0.315355 }, { "epoch": 3.6108993902439024, "grad_norm": 0.9971497058868408, "learning_rate": 1.786571507356748e-05, "loss": 0.09642525315284729, "memory(GiB)": 153.57, "step": 9475, "token_acc": 0.9602791878172589, "train_speed(iter/s)": 0.315355 }, { "epoch": 3.6128048780487805, "grad_norm": 1.2065091133117676, "learning_rate": 1.781987553523874e-05, "loss": 0.1392520546913147, "memory(GiB)": 153.57, "step": 9480, "token_acc": 0.9393643031784841, "train_speed(iter/s)": 0.315355 }, { "epoch": 3.6147103658536586, "grad_norm": 1.5555506944656372, "learning_rate": 1.7774082124392356e-05, "loss": 0.10903143882751465, "memory(GiB)": 153.57, "step": 9485, "token_acc": 0.9580061454421305, "train_speed(iter/s)": 0.315359 }, { "epoch": 3.6166158536585367, "grad_norm": 1.1519625186920166, "learning_rate": 1.7728334906669343e-05, "loss": 0.1405436635017395, "memory(GiB)": 153.57, "step": 9490, "token_acc": 0.9436730416557506, "train_speed(iter/s)": 0.315384 }, { "epoch": 3.6185213414634148, "grad_norm": 1.2298482656478882, "learning_rate": 1.768263394764441e-05, "loss": 0.12252728939056397, "memory(GiB)": 153.57, "step": 9495, "token_acc": 0.9531172069825437, "train_speed(iter/s)": 0.315401 }, { "epoch": 3.620426829268293, "grad_norm": 1.8704886436462402, "learning_rate": 1.7636979312826086e-05, "loss": 0.083682781457901, "memory(GiB)": 153.57, "step": 9500, "token_acc": 0.9599465954606141, "train_speed(iter/s)": 0.315388 }, { "epoch": 3.6223323170731705, "grad_norm": 2.093130111694336, "learning_rate": 1.759137106765636e-05, "loss": 0.13385186195373536, "memory(GiB)": 153.57, "step": 9505, "token_acc": 0.9486917416189697, "train_speed(iter/s)": 0.315374 }, { "epoch": 3.6242378048780486, "grad_norm": 0.9764480590820312, "learning_rate": 1.7545809277510828e-05, "loss": 0.12133400440216065, "memory(GiB)": 153.57, "step": 9510, "token_acc": 0.9425144103442904, "train_speed(iter/s)": 0.315373 }, { "epoch": 3.6261432926829267, "grad_norm": 1.0537539720535278, "learning_rate": 1.7500294007698414e-05, "loss": 0.11140575408935546, "memory(GiB)": 153.57, "step": 9515, "token_acc": 0.9521531100478469, "train_speed(iter/s)": 0.315377 }, { "epoch": 3.6280487804878048, "grad_norm": 1.0301567316055298, "learning_rate": 1.7454825323461448e-05, "loss": 0.10202914476394653, "memory(GiB)": 153.57, "step": 9520, "token_acc": 0.9561237373737373, "train_speed(iter/s)": 0.315397 }, { "epoch": 3.629954268292683, "grad_norm": 0.9479950666427612, "learning_rate": 1.7409403289975395e-05, "loss": 0.10846024751663208, "memory(GiB)": 153.57, "step": 9525, "token_acc": 0.9569487983281086, "train_speed(iter/s)": 0.315414 }, { "epoch": 3.631859756097561, "grad_norm": 1.726353645324707, "learning_rate": 1.7364027972348933e-05, "loss": 0.13396856784820557, "memory(GiB)": 153.57, "step": 9530, "token_acc": 0.9361917623227549, "train_speed(iter/s)": 0.315443 }, { "epoch": 3.633765243902439, "grad_norm": 1.4730727672576904, "learning_rate": 1.731869943562368e-05, "loss": 0.12761666774749755, "memory(GiB)": 153.57, "step": 9535, "token_acc": 0.9435061153174141, "train_speed(iter/s)": 0.315425 }, { "epoch": 3.635670731707317, "grad_norm": 2.5029475688934326, "learning_rate": 1.727341774477432e-05, "loss": 0.11769596338272095, "memory(GiB)": 153.57, "step": 9540, "token_acc": 0.9513018322082931, "train_speed(iter/s)": 0.315439 }, { "epoch": 3.6375762195121952, "grad_norm": 0.9344792366027832, "learning_rate": 1.7228182964708273e-05, "loss": 0.12266175746917725, "memory(GiB)": 153.57, "step": 9545, "token_acc": 0.940234375, "train_speed(iter/s)": 0.31547 }, { "epoch": 3.6394817073170733, "grad_norm": 1.2143491506576538, "learning_rate": 1.7182995160265797e-05, "loss": 0.1280925989151001, "memory(GiB)": 153.57, "step": 9550, "token_acc": 0.9466929911154985, "train_speed(iter/s)": 0.31548 }, { "epoch": 3.6413871951219514, "grad_norm": 0.9653764367103577, "learning_rate": 1.7137854396219747e-05, "loss": 0.14458831548690795, "memory(GiB)": 153.57, "step": 9555, "token_acc": 0.9425287356321839, "train_speed(iter/s)": 0.315468 }, { "epoch": 3.6432926829268295, "grad_norm": 1.3371074199676514, "learning_rate": 1.7092760737275625e-05, "loss": 0.11643474102020264, "memory(GiB)": 153.57, "step": 9560, "token_acc": 0.953344058837049, "train_speed(iter/s)": 0.31548 }, { "epoch": 3.645198170731707, "grad_norm": 1.2127481698989868, "learning_rate": 1.704771424807133e-05, "loss": 0.10764638185501099, "memory(GiB)": 153.57, "step": 9565, "token_acc": 0.9408121128699243, "train_speed(iter/s)": 0.315506 }, { "epoch": 3.6471036585365852, "grad_norm": 1.330531358718872, "learning_rate": 1.700271499317722e-05, "loss": 0.11383228302001953, "memory(GiB)": 153.57, "step": 9570, "token_acc": 0.9560924915562484, "train_speed(iter/s)": 0.315524 }, { "epoch": 3.6490091463414633, "grad_norm": 1.349421501159668, "learning_rate": 1.695776303709587e-05, "loss": 0.11509509086608886, "memory(GiB)": 153.57, "step": 9575, "token_acc": 0.9511295527893038, "train_speed(iter/s)": 0.315549 }, { "epoch": 3.6509146341463414, "grad_norm": 1.1268173456192017, "learning_rate": 1.6912858444262164e-05, "loss": 0.11879854202270508, "memory(GiB)": 153.57, "step": 9580, "token_acc": 0.9470504619758351, "train_speed(iter/s)": 0.315578 }, { "epoch": 3.6528201219512195, "grad_norm": 1.4520211219787598, "learning_rate": 1.6868001279042988e-05, "loss": 0.13754982948303224, "memory(GiB)": 153.57, "step": 9585, "token_acc": 0.947659672584736, "train_speed(iter/s)": 0.315546 }, { "epoch": 3.6547256097560976, "grad_norm": 1.6700470447540283, "learning_rate": 1.6823191605737313e-05, "loss": 0.11810154914855957, "memory(GiB)": 153.57, "step": 9590, "token_acc": 0.953355155482815, "train_speed(iter/s)": 0.315562 }, { "epoch": 3.6566310975609757, "grad_norm": 1.7535673379898071, "learning_rate": 1.677842948857599e-05, "loss": 0.13871003389358522, "memory(GiB)": 153.57, "step": 9595, "token_acc": 0.9358872040436286, "train_speed(iter/s)": 0.315584 }, { "epoch": 3.658536585365854, "grad_norm": 0.9647493362426758, "learning_rate": 1.673371499172174e-05, "loss": 0.10893909931182862, "memory(GiB)": 153.57, "step": 9600, "token_acc": 0.9477503628447025, "train_speed(iter/s)": 0.315561 }, { "epoch": 3.658536585365854, "eval_loss": 0.1623094379901886, "eval_runtime": 32.5652, "eval_samples_per_second": 3.255, "eval_steps_per_second": 3.255, "eval_token_acc": 0.9226049786271059, "step": 9600 }, { "epoch": 3.6604420731707314, "grad_norm": 1.7719767093658447, "learning_rate": 1.6689048179268984e-05, "loss": 0.08365650773048401, "memory(GiB)": 153.57, "step": 9605, "token_acc": 0.9298834934943447, "train_speed(iter/s)": 0.315246 }, { "epoch": 3.6623475609756095, "grad_norm": 0.7952829599380493, "learning_rate": 1.6644429115243835e-05, "loss": 0.10205848217010498, "memory(GiB)": 153.57, "step": 9610, "token_acc": 0.9564727954971858, "train_speed(iter/s)": 0.315255 }, { "epoch": 3.6642530487804876, "grad_norm": 0.9800346493721008, "learning_rate": 1.6599857863603906e-05, "loss": 0.1236146330833435, "memory(GiB)": 153.57, "step": 9615, "token_acc": 0.9554176072234764, "train_speed(iter/s)": 0.315225 }, { "epoch": 3.6661585365853657, "grad_norm": 1.4909855127334595, "learning_rate": 1.6555334488238373e-05, "loss": 0.11997066736221314, "memory(GiB)": 153.57, "step": 9620, "token_acc": 0.9530362603757099, "train_speed(iter/s)": 0.315221 }, { "epoch": 3.668064024390244, "grad_norm": 1.5883636474609375, "learning_rate": 1.6510859052967663e-05, "loss": 0.11715039014816284, "memory(GiB)": 153.57, "step": 9625, "token_acc": 0.9472753007784855, "train_speed(iter/s)": 0.315203 }, { "epoch": 3.669969512195122, "grad_norm": 1.5669958591461182, "learning_rate": 1.6466431621543592e-05, "loss": 0.15151697397232056, "memory(GiB)": 153.57, "step": 9630, "token_acc": 0.9336710082006754, "train_speed(iter/s)": 0.315216 }, { "epoch": 3.671875, "grad_norm": 1.568556547164917, "learning_rate": 1.6422052257649078e-05, "loss": 0.1131399393081665, "memory(GiB)": 153.57, "step": 9635, "token_acc": 0.9534106107891217, "train_speed(iter/s)": 0.315229 }, { "epoch": 3.673780487804878, "grad_norm": 1.975461483001709, "learning_rate": 1.6377721024898213e-05, "loss": 0.13431090116500854, "memory(GiB)": 153.57, "step": 9640, "token_acc": 0.9393599101628298, "train_speed(iter/s)": 0.315248 }, { "epoch": 3.675685975609756, "grad_norm": 3.136540412902832, "learning_rate": 1.6333437986836036e-05, "loss": 0.13715286254882814, "memory(GiB)": 153.57, "step": 9645, "token_acc": 0.9492083076290355, "train_speed(iter/s)": 0.315251 }, { "epoch": 3.6775914634146343, "grad_norm": 1.0393781661987305, "learning_rate": 1.628920320693856e-05, "loss": 0.12597635984420777, "memory(GiB)": 153.57, "step": 9650, "token_acc": 0.9494406508789772, "train_speed(iter/s)": 0.315245 }, { "epoch": 3.6794969512195124, "grad_norm": 2.2098283767700195, "learning_rate": 1.624501674861256e-05, "loss": 0.11644941568374634, "memory(GiB)": 153.57, "step": 9655, "token_acc": 0.9546578730420445, "train_speed(iter/s)": 0.315276 }, { "epoch": 3.6814024390243905, "grad_norm": 1.5266799926757812, "learning_rate": 1.6200878675195596e-05, "loss": 0.14197858572006225, "memory(GiB)": 153.57, "step": 9660, "token_acc": 0.9379645511720983, "train_speed(iter/s)": 0.315295 }, { "epoch": 3.6833079268292686, "grad_norm": 3.426311731338501, "learning_rate": 1.6156789049955863e-05, "loss": 0.14448261260986328, "memory(GiB)": 153.57, "step": 9665, "token_acc": 0.9380378657487092, "train_speed(iter/s)": 0.315308 }, { "epoch": 3.685213414634146, "grad_norm": 1.4490491151809692, "learning_rate": 1.611274793609212e-05, "loss": 0.0959943950176239, "memory(GiB)": 153.57, "step": 9670, "token_acc": 0.965661252900232, "train_speed(iter/s)": 0.315276 }, { "epoch": 3.6871189024390243, "grad_norm": 1.2894536256790161, "learning_rate": 1.6068755396733537e-05, "loss": 0.11232166290283203, "memory(GiB)": 153.57, "step": 9675, "token_acc": 0.9482426576793452, "train_speed(iter/s)": 0.315295 }, { "epoch": 3.6890243902439024, "grad_norm": 1.9272608757019043, "learning_rate": 1.6024811494939724e-05, "loss": 0.1333998203277588, "memory(GiB)": 153.57, "step": 9680, "token_acc": 0.9439914163090128, "train_speed(iter/s)": 0.315283 }, { "epoch": 3.6909298780487805, "grad_norm": 1.908034086227417, "learning_rate": 1.598091629370051e-05, "loss": 0.10511620044708252, "memory(GiB)": 153.57, "step": 9685, "token_acc": 0.9627236580516899, "train_speed(iter/s)": 0.31532 }, { "epoch": 3.6928353658536586, "grad_norm": 0.8904842734336853, "learning_rate": 1.5937069855935987e-05, "loss": 0.13103630542755126, "memory(GiB)": 153.57, "step": 9690, "token_acc": 0.9525132609244759, "train_speed(iter/s)": 0.315342 }, { "epoch": 3.6947408536585367, "grad_norm": 1.6369924545288086, "learning_rate": 1.5893272244496266e-05, "loss": 0.10622515678405761, "memory(GiB)": 153.57, "step": 9695, "token_acc": 0.9558464610605725, "train_speed(iter/s)": 0.315354 }, { "epoch": 3.6966463414634148, "grad_norm": 1.806870937347412, "learning_rate": 1.5849523522161535e-05, "loss": 0.15740132331848145, "memory(GiB)": 153.57, "step": 9700, "token_acc": 0.9326710816777042, "train_speed(iter/s)": 0.315365 }, { "epoch": 3.698551829268293, "grad_norm": 1.2490324974060059, "learning_rate": 1.5805823751641886e-05, "loss": 0.1392240047454834, "memory(GiB)": 153.57, "step": 9705, "token_acc": 0.9396382654173022, "train_speed(iter/s)": 0.315333 }, { "epoch": 3.7004573170731705, "grad_norm": 1.291001319885254, "learning_rate": 1.576217299557724e-05, "loss": 0.13183499574661256, "memory(GiB)": 153.57, "step": 9710, "token_acc": 0.9406201891356939, "train_speed(iter/s)": 0.315348 }, { "epoch": 3.7023628048780486, "grad_norm": 2.4410390853881836, "learning_rate": 1.571857131653724e-05, "loss": 0.08946994543075562, "memory(GiB)": 153.57, "step": 9715, "token_acc": 0.9632025879498585, "train_speed(iter/s)": 0.315383 }, { "epoch": 3.7042682926829267, "grad_norm": 1.1723476648330688, "learning_rate": 1.5675018777021223e-05, "loss": 0.13752089738845824, "memory(GiB)": 153.57, "step": 9720, "token_acc": 0.9350789872322008, "train_speed(iter/s)": 0.315394 }, { "epoch": 3.7061737804878048, "grad_norm": 1.469864010810852, "learning_rate": 1.563151543945804e-05, "loss": 0.1708155393600464, "memory(GiB)": 153.57, "step": 9725, "token_acc": 0.9228962818003914, "train_speed(iter/s)": 0.315424 }, { "epoch": 3.708079268292683, "grad_norm": 1.3281887769699097, "learning_rate": 1.5588061366206062e-05, "loss": 0.09345080256462097, "memory(GiB)": 153.57, "step": 9730, "token_acc": 0.9556030389363722, "train_speed(iter/s)": 0.315435 }, { "epoch": 3.709984756097561, "grad_norm": 1.623186469078064, "learning_rate": 1.5544656619553007e-05, "loss": 0.13215622901916504, "memory(GiB)": 153.57, "step": 9735, "token_acc": 0.9407461594732992, "train_speed(iter/s)": 0.315416 }, { "epoch": 3.711890243902439, "grad_norm": 1.701106309890747, "learning_rate": 1.5501301261715894e-05, "loss": 0.08919808864593506, "memory(GiB)": 153.57, "step": 9740, "token_acc": 0.9585889570552147, "train_speed(iter/s)": 0.315442 }, { "epoch": 3.713795731707317, "grad_norm": 1.7255492210388184, "learning_rate": 1.5457995354840994e-05, "loss": 0.12670376300811767, "memory(GiB)": 153.57, "step": 9745, "token_acc": 0.952544704264099, "train_speed(iter/s)": 0.315437 }, { "epoch": 3.7157012195121952, "grad_norm": 1.2782412767410278, "learning_rate": 1.541473896100361e-05, "loss": 0.1051875114440918, "memory(GiB)": 153.57, "step": 9750, "token_acc": 0.9643614764531183, "train_speed(iter/s)": 0.315442 }, { "epoch": 3.7176067073170733, "grad_norm": 1.3452297449111938, "learning_rate": 1.5371532142208146e-05, "loss": 0.11557042598724365, "memory(GiB)": 153.57, "step": 9755, "token_acc": 0.9406366504158302, "train_speed(iter/s)": 0.315466 }, { "epoch": 3.7195121951219514, "grad_norm": 1.7345609664916992, "learning_rate": 1.532837496038792e-05, "loss": 0.1567040801048279, "memory(GiB)": 153.57, "step": 9760, "token_acc": 0.935046891281695, "train_speed(iter/s)": 0.315473 }, { "epoch": 3.7214176829268295, "grad_norm": 2.546192169189453, "learning_rate": 1.528526747740507e-05, "loss": 0.1895283579826355, "memory(GiB)": 153.57, "step": 9765, "token_acc": 0.9272271016311167, "train_speed(iter/s)": 0.315479 }, { "epoch": 3.723323170731707, "grad_norm": 1.955855131149292, "learning_rate": 1.5242209755050557e-05, "loss": 0.12201153039932251, "memory(GiB)": 153.57, "step": 9770, "token_acc": 0.9488262910798122, "train_speed(iter/s)": 0.315488 }, { "epoch": 3.7252286585365852, "grad_norm": 2.5881080627441406, "learning_rate": 1.5199201855043944e-05, "loss": 0.18086713552474976, "memory(GiB)": 153.57, "step": 9775, "token_acc": 0.935794289320082, "train_speed(iter/s)": 0.315494 }, { "epoch": 3.7271341463414633, "grad_norm": 1.7658121585845947, "learning_rate": 1.5156243839033424e-05, "loss": 0.107296884059906, "memory(GiB)": 153.57, "step": 9780, "token_acc": 0.9547888161808448, "train_speed(iter/s)": 0.315458 }, { "epoch": 3.7290396341463414, "grad_norm": 1.909732699394226, "learning_rate": 1.5113335768595693e-05, "loss": 0.12953538894653321, "memory(GiB)": 153.57, "step": 9785, "token_acc": 0.9526743899433079, "train_speed(iter/s)": 0.315458 }, { "epoch": 3.7309451219512195, "grad_norm": 1.3787052631378174, "learning_rate": 1.5070477705235803e-05, "loss": 0.10690945386886597, "memory(GiB)": 153.57, "step": 9790, "token_acc": 0.9525706697021438, "train_speed(iter/s)": 0.31543 }, { "epoch": 3.7328506097560976, "grad_norm": 1.2305569648742676, "learning_rate": 1.5027669710387172e-05, "loss": 0.14233248233795165, "memory(GiB)": 153.57, "step": 9795, "token_acc": 0.9472346786248131, "train_speed(iter/s)": 0.315372 }, { "epoch": 3.7347560975609757, "grad_norm": 1.593862533569336, "learning_rate": 1.4984911845411453e-05, "loss": 0.10687276124954223, "memory(GiB)": 153.57, "step": 9800, "token_acc": 0.9600336558687421, "train_speed(iter/s)": 0.31541 }, { "epoch": 3.736661585365854, "grad_norm": 1.1348072290420532, "learning_rate": 1.4942204171598396e-05, "loss": 0.138498055934906, "memory(GiB)": 153.57, "step": 9805, "token_acc": 0.9442197505898213, "train_speed(iter/s)": 0.31541 }, { "epoch": 3.7385670731707314, "grad_norm": 1.8067467212677002, "learning_rate": 1.4899546750165849e-05, "loss": 0.12027257680892944, "memory(GiB)": 153.57, "step": 9810, "token_acc": 0.9556145675265554, "train_speed(iter/s)": 0.315407 }, { "epoch": 3.7404725609756095, "grad_norm": 0.9521879553794861, "learning_rate": 1.4856939642259598e-05, "loss": 0.13121397495269777, "memory(GiB)": 153.57, "step": 9815, "token_acc": 0.9474161378059837, "train_speed(iter/s)": 0.315426 }, { "epoch": 3.7423780487804876, "grad_norm": 1.5402554273605347, "learning_rate": 1.4814382908953333e-05, "loss": 0.12653278112411498, "memory(GiB)": 153.57, "step": 9820, "token_acc": 0.9390862944162437, "train_speed(iter/s)": 0.315405 }, { "epoch": 3.7442835365853657, "grad_norm": 2.3915371894836426, "learning_rate": 1.4771876611248535e-05, "loss": 0.13180941343307495, "memory(GiB)": 153.57, "step": 9825, "token_acc": 0.9434646049770476, "train_speed(iter/s)": 0.315409 }, { "epoch": 3.746189024390244, "grad_norm": 1.033556580543518, "learning_rate": 1.4729420810074357e-05, "loss": 0.10652037858963012, "memory(GiB)": 153.57, "step": 9830, "token_acc": 0.9633730834752982, "train_speed(iter/s)": 0.315405 }, { "epoch": 3.748094512195122, "grad_norm": 2.093681573867798, "learning_rate": 1.4687015566287627e-05, "loss": 0.17582614421844484, "memory(GiB)": 153.57, "step": 9835, "token_acc": 0.9275500476644424, "train_speed(iter/s)": 0.315378 }, { "epoch": 3.75, "grad_norm": 1.727195382118225, "learning_rate": 1.4644660940672627e-05, "loss": 0.13334352970123292, "memory(GiB)": 153.57, "step": 9840, "token_acc": 0.9498548855207998, "train_speed(iter/s)": 0.315374 }, { "epoch": 3.751905487804878, "grad_norm": 1.8910799026489258, "learning_rate": 1.4602356993941151e-05, "loss": 0.1357877492904663, "memory(GiB)": 153.57, "step": 9845, "token_acc": 0.9557667702431, "train_speed(iter/s)": 0.315372 }, { "epoch": 3.753810975609756, "grad_norm": 1.0398377180099487, "learning_rate": 1.4560103786732338e-05, "loss": 0.1184606671333313, "memory(GiB)": 153.57, "step": 9850, "token_acc": 0.9506578947368421, "train_speed(iter/s)": 0.315354 }, { "epoch": 3.7557164634146343, "grad_norm": 1.0920261144638062, "learning_rate": 1.4517901379612542e-05, "loss": 0.12969909906387328, "memory(GiB)": 153.57, "step": 9855, "token_acc": 0.9518118004173781, "train_speed(iter/s)": 0.315368 }, { "epoch": 3.7576219512195124, "grad_norm": 1.3970835208892822, "learning_rate": 1.447574983307538e-05, "loss": 0.17229862213134767, "memory(GiB)": 153.57, "step": 9860, "token_acc": 0.9401937046004842, "train_speed(iter/s)": 0.315384 }, { "epoch": 3.7595274390243905, "grad_norm": 1.8157577514648438, "learning_rate": 1.4433649207541482e-05, "loss": 0.11085953712463378, "memory(GiB)": 153.57, "step": 9865, "token_acc": 0.9552703941338222, "train_speed(iter/s)": 0.315388 }, { "epoch": 3.7614329268292686, "grad_norm": 1.321682095527649, "learning_rate": 1.4391599563358554e-05, "loss": 0.13754005432128907, "memory(GiB)": 153.57, "step": 9870, "token_acc": 0.943502824858757, "train_speed(iter/s)": 0.315369 }, { "epoch": 3.763338414634146, "grad_norm": 1.1724010705947876, "learning_rate": 1.4349600960801208e-05, "loss": 0.11645550727844238, "memory(GiB)": 153.57, "step": 9875, "token_acc": 0.9500933627567476, "train_speed(iter/s)": 0.315371 }, { "epoch": 3.7652439024390243, "grad_norm": 1.9847787618637085, "learning_rate": 1.4307653460070846e-05, "loss": 0.09575778245925903, "memory(GiB)": 153.57, "step": 9880, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.315404 }, { "epoch": 3.7671493902439024, "grad_norm": 1.7861918210983276, "learning_rate": 1.4265757121295703e-05, "loss": 0.12480499744415283, "memory(GiB)": 153.57, "step": 9885, "token_acc": 0.94180407371484, "train_speed(iter/s)": 0.315422 }, { "epoch": 3.7690548780487805, "grad_norm": 1.3600523471832275, "learning_rate": 1.4223912004530594e-05, "loss": 0.11408032178878784, "memory(GiB)": 153.57, "step": 9890, "token_acc": 0.953948883260419, "train_speed(iter/s)": 0.315391 }, { "epoch": 3.7709603658536586, "grad_norm": 1.2590572834014893, "learning_rate": 1.4182118169756958e-05, "loss": 0.11752780675888061, "memory(GiB)": 153.57, "step": 9895, "token_acc": 0.9482463644140291, "train_speed(iter/s)": 0.315385 }, { "epoch": 3.7728658536585367, "grad_norm": 2.199924945831299, "learning_rate": 1.4140375676882738e-05, "loss": 0.10768822431564332, "memory(GiB)": 153.57, "step": 9900, "token_acc": 0.9618218352310783, "train_speed(iter/s)": 0.315367 }, { "epoch": 3.7747713414634148, "grad_norm": 2.268294334411621, "learning_rate": 1.4098684585742233e-05, "loss": 0.10798047780990601, "memory(GiB)": 153.57, "step": 9905, "token_acc": 0.9585301837270341, "train_speed(iter/s)": 0.315349 }, { "epoch": 3.776676829268293, "grad_norm": 1.5494683980941772, "learning_rate": 1.40570449560961e-05, "loss": 0.13501830101013185, "memory(GiB)": 153.57, "step": 9910, "token_acc": 0.9450031426775612, "train_speed(iter/s)": 0.315366 }, { "epoch": 3.7785823170731705, "grad_norm": 1.8206753730773926, "learning_rate": 1.4015456847631242e-05, "loss": 0.12684489488601686, "memory(GiB)": 153.57, "step": 9915, "token_acc": 0.9546581650726178, "train_speed(iter/s)": 0.315389 }, { "epoch": 3.7804878048780486, "grad_norm": 0.004334409721195698, "learning_rate": 1.3973920319960655e-05, "loss": 0.08523440957069398, "memory(GiB)": 153.57, "step": 9920, "token_acc": 0.9545999201171615, "train_speed(iter/s)": 0.315394 }, { "epoch": 3.7823932926829267, "grad_norm": 2.052057981491089, "learning_rate": 1.3932435432623458e-05, "loss": 0.16506922245025635, "memory(GiB)": 153.57, "step": 9925, "token_acc": 0.9149043303121853, "train_speed(iter/s)": 0.315427 }, { "epoch": 3.7842987804878048, "grad_norm": 1.6798710823059082, "learning_rate": 1.3891002245084706e-05, "loss": 0.12916908264160157, "memory(GiB)": 153.57, "step": 9930, "token_acc": 0.957790192427064, "train_speed(iter/s)": 0.315437 }, { "epoch": 3.786204268292683, "grad_norm": 1.2674846649169922, "learning_rate": 1.3849620816735365e-05, "loss": 0.10861196517944335, "memory(GiB)": 153.57, "step": 9935, "token_acc": 0.9534287545396283, "train_speed(iter/s)": 0.315413 }, { "epoch": 3.788109756097561, "grad_norm": 1.1109390258789062, "learning_rate": 1.3808291206892232e-05, "loss": 0.10950745344161987, "memory(GiB)": 153.57, "step": 9940, "token_acc": 0.9529533986346097, "train_speed(iter/s)": 0.315411 }, { "epoch": 3.790015243902439, "grad_norm": 1.1236052513122559, "learning_rate": 1.3767013474797764e-05, "loss": 0.10576488971710205, "memory(GiB)": 153.57, "step": 9945, "token_acc": 0.9563849563849564, "train_speed(iter/s)": 0.315413 }, { "epoch": 3.791920731707317, "grad_norm": 1.5001875162124634, "learning_rate": 1.3725787679620117e-05, "loss": 0.09208563566207886, "memory(GiB)": 153.57, "step": 9950, "token_acc": 0.9525208027410671, "train_speed(iter/s)": 0.315434 }, { "epoch": 3.7938262195121952, "grad_norm": 2.52644681930542, "learning_rate": 1.368461388045298e-05, "loss": 0.11790552139282226, "memory(GiB)": 153.57, "step": 9955, "token_acc": 0.9523141654978962, "train_speed(iter/s)": 0.315457 }, { "epoch": 3.7957317073170733, "grad_norm": 1.3922626972198486, "learning_rate": 1.3643492136315483e-05, "loss": 0.12132012844085693, "memory(GiB)": 153.57, "step": 9960, "token_acc": 0.9541984732824428, "train_speed(iter/s)": 0.315479 }, { "epoch": 3.7976371951219514, "grad_norm": 1.4085506200790405, "learning_rate": 1.3602422506152195e-05, "loss": 0.14438790082931519, "memory(GiB)": 153.57, "step": 9965, "token_acc": 0.9342544068604097, "train_speed(iter/s)": 0.31549 }, { "epoch": 3.7995426829268295, "grad_norm": 1.3057026863098145, "learning_rate": 1.356140504883292e-05, "loss": 0.15778045654296874, "memory(GiB)": 153.57, "step": 9970, "token_acc": 0.9344086021505377, "train_speed(iter/s)": 0.315514 }, { "epoch": 3.801448170731707, "grad_norm": 1.604524850845337, "learning_rate": 1.352043982315273e-05, "loss": 0.1383068799972534, "memory(GiB)": 153.57, "step": 9975, "token_acc": 0.9408075264602117, "train_speed(iter/s)": 0.315498 }, { "epoch": 3.8033536585365852, "grad_norm": 1.864388346672058, "learning_rate": 1.347952688783179e-05, "loss": 0.1272861361503601, "memory(GiB)": 153.57, "step": 9980, "token_acc": 0.9432928282106267, "train_speed(iter/s)": 0.315504 }, { "epoch": 3.8052591463414633, "grad_norm": 2.028414011001587, "learning_rate": 1.3438666301515329e-05, "loss": 0.12247673273086548, "memory(GiB)": 153.57, "step": 9985, "token_acc": 0.9506052323311207, "train_speed(iter/s)": 0.315524 }, { "epoch": 3.8071646341463414, "grad_norm": 1.1839983463287354, "learning_rate": 1.3397858122773544e-05, "loss": 0.1068060040473938, "memory(GiB)": 153.57, "step": 9990, "token_acc": 0.9567506581421588, "train_speed(iter/s)": 0.315547 }, { "epoch": 3.8090701219512195, "grad_norm": 1.5879080295562744, "learning_rate": 1.3357102410101518e-05, "loss": 0.13497183322906495, "memory(GiB)": 153.57, "step": 9995, "token_acc": 0.9482439926062847, "train_speed(iter/s)": 0.315566 }, { "epoch": 3.8109756097560976, "grad_norm": 2.2318127155303955, "learning_rate": 1.3316399221919074e-05, "loss": 0.10893306732177735, "memory(GiB)": 153.57, "step": 10000, "token_acc": 0.9558823529411765, "train_speed(iter/s)": 0.315586 }, { "epoch": 3.8128810975609757, "grad_norm": 1.1962727308273315, "learning_rate": 1.327574861657082e-05, "loss": 0.14052540063858032, "memory(GiB)": 153.57, "step": 10005, "token_acc": 0.9422418012726382, "train_speed(iter/s)": 0.315563 }, { "epoch": 3.814786585365854, "grad_norm": 1.8456696271896362, "learning_rate": 1.3235150652325929e-05, "loss": 0.16324206590652465, "memory(GiB)": 153.57, "step": 10010, "token_acc": 0.9392026578073089, "train_speed(iter/s)": 0.315586 }, { "epoch": 3.8166920731707314, "grad_norm": 1.2257410287857056, "learning_rate": 1.3194605387378168e-05, "loss": 0.10969864130020142, "memory(GiB)": 153.57, "step": 10015, "token_acc": 0.9496915959907479, "train_speed(iter/s)": 0.315557 }, { "epoch": 3.8185975609756095, "grad_norm": 1.5104618072509766, "learning_rate": 1.3154112879845714e-05, "loss": 0.11160964965820312, "memory(GiB)": 153.57, "step": 10020, "token_acc": 0.9578481602235678, "train_speed(iter/s)": 0.315528 }, { "epoch": 3.8205030487804876, "grad_norm": 1.1373611688613892, "learning_rate": 1.3113673187771152e-05, "loss": 0.10886105298995971, "memory(GiB)": 153.57, "step": 10025, "token_acc": 0.955786350148368, "train_speed(iter/s)": 0.315543 }, { "epoch": 3.8224085365853657, "grad_norm": 1.1310313940048218, "learning_rate": 1.3073286369121369e-05, "loss": 0.08976523280143738, "memory(GiB)": 153.57, "step": 10030, "token_acc": 0.9520692073883563, "train_speed(iter/s)": 0.315564 }, { "epoch": 3.824314024390244, "grad_norm": 1.5947405099868774, "learning_rate": 1.3032952481787452e-05, "loss": 0.15509529113769532, "memory(GiB)": 153.57, "step": 10035, "token_acc": 0.9345675071816151, "train_speed(iter/s)": 0.315533 }, { "epoch": 3.826219512195122, "grad_norm": 1.627636432647705, "learning_rate": 1.2992671583584587e-05, "loss": 0.11668051481246948, "memory(GiB)": 153.57, "step": 10040, "token_acc": 0.9520116335433835, "train_speed(iter/s)": 0.315566 }, { "epoch": 3.828125, "grad_norm": 1.262283205986023, "learning_rate": 1.2952443732252057e-05, "loss": 0.15596174001693724, "memory(GiB)": 153.57, "step": 10045, "token_acc": 0.9394036550176339, "train_speed(iter/s)": 0.315519 }, { "epoch": 3.830030487804878, "grad_norm": 1.3204190731048584, "learning_rate": 1.2912268985453046e-05, "loss": 0.1445538282394409, "memory(GiB)": 153.57, "step": 10050, "token_acc": 0.9394122993032414, "train_speed(iter/s)": 0.315538 }, { "epoch": 3.831935975609756, "grad_norm": 0.8811204433441162, "learning_rate": 1.2872147400774686e-05, "loss": 0.1432849645614624, "memory(GiB)": 153.57, "step": 10055, "token_acc": 0.9318479685452162, "train_speed(iter/s)": 0.31555 }, { "epoch": 3.8338414634146343, "grad_norm": 1.1289805173873901, "learning_rate": 1.2832079035727839e-05, "loss": 0.11592762470245362, "memory(GiB)": 153.57, "step": 10060, "token_acc": 0.9538961038961039, "train_speed(iter/s)": 0.315572 }, { "epoch": 3.8357469512195124, "grad_norm": 1.1095726490020752, "learning_rate": 1.2792063947747146e-05, "loss": 0.128584885597229, "memory(GiB)": 153.57, "step": 10065, "token_acc": 0.9562545191612437, "train_speed(iter/s)": 0.315592 }, { "epoch": 3.8376524390243905, "grad_norm": 1.2144289016723633, "learning_rate": 1.275210219419079e-05, "loss": 0.12740299701690674, "memory(GiB)": 153.57, "step": 10070, "token_acc": 0.9522258414766558, "train_speed(iter/s)": 0.315559 }, { "epoch": 3.8395579268292686, "grad_norm": 1.8560009002685547, "learning_rate": 1.271219383234063e-05, "loss": 0.13309918642044066, "memory(GiB)": 153.57, "step": 10075, "token_acc": 0.9430091185410334, "train_speed(iter/s)": 0.315543 }, { "epoch": 3.841463414634146, "grad_norm": 2.2546963691711426, "learning_rate": 1.2672338919401866e-05, "loss": 0.14546821117401124, "memory(GiB)": 153.57, "step": 10080, "token_acc": 0.9379615952732644, "train_speed(iter/s)": 0.315567 }, { "epoch": 3.8433689024390243, "grad_norm": 1.764148235321045, "learning_rate": 1.263253751250318e-05, "loss": 0.10618321895599366, "memory(GiB)": 153.57, "step": 10085, "token_acc": 0.9548819175185055, "train_speed(iter/s)": 0.315553 }, { "epoch": 3.8452743902439024, "grad_norm": 1.6570149660110474, "learning_rate": 1.2592789668696481e-05, "loss": 0.11951981782913208, "memory(GiB)": 153.57, "step": 10090, "token_acc": 0.9451279527559056, "train_speed(iter/s)": 0.315558 }, { "epoch": 3.8471798780487805, "grad_norm": 2.576421022415161, "learning_rate": 1.255309544495697e-05, "loss": 0.15585290193557738, "memory(GiB)": 153.57, "step": 10095, "token_acc": 0.9310128566948886, "train_speed(iter/s)": 0.315576 }, { "epoch": 3.8490853658536586, "grad_norm": 1.9248632192611694, "learning_rate": 1.251345489818292e-05, "loss": 0.11163849830627441, "memory(GiB)": 153.57, "step": 10100, "token_acc": 0.948339483394834, "train_speed(iter/s)": 0.315602 }, { "epoch": 3.8509908536585367, "grad_norm": 1.4099236726760864, "learning_rate": 1.2473868085195733e-05, "loss": 0.08761237263679504, "memory(GiB)": 153.57, "step": 10105, "token_acc": 0.9573770491803278, "train_speed(iter/s)": 0.31563 }, { "epoch": 3.8528963414634148, "grad_norm": 0.976403534412384, "learning_rate": 1.24343350627397e-05, "loss": 0.1305638313293457, "memory(GiB)": 153.57, "step": 10110, "token_acc": 0.9448132780082987, "train_speed(iter/s)": 0.315644 }, { "epoch": 3.854801829268293, "grad_norm": 0.9880330562591553, "learning_rate": 1.239485588748212e-05, "loss": 0.10859357118606568, "memory(GiB)": 153.57, "step": 10115, "token_acc": 0.9518900343642611, "train_speed(iter/s)": 0.315655 }, { "epoch": 3.8567073170731705, "grad_norm": 1.297875165939331, "learning_rate": 1.235543061601301e-05, "loss": 0.1268310546875, "memory(GiB)": 153.57, "step": 10120, "token_acc": 0.948607768992781, "train_speed(iter/s)": 0.315662 }, { "epoch": 3.8586128048780486, "grad_norm": 1.0808274745941162, "learning_rate": 1.2316059304845174e-05, "loss": 0.09744128584861755, "memory(GiB)": 153.57, "step": 10125, "token_acc": 0.952787543947765, "train_speed(iter/s)": 0.315683 }, { "epoch": 3.8605182926829267, "grad_norm": 0.8879773020744324, "learning_rate": 1.2276742010414033e-05, "loss": 0.10487064123153686, "memory(GiB)": 153.57, "step": 10130, "token_acc": 0.9603072983354674, "train_speed(iter/s)": 0.315664 }, { "epoch": 3.8624237804878048, "grad_norm": 1.4526326656341553, "learning_rate": 1.2237478789077622e-05, "loss": 0.09773914813995362, "memory(GiB)": 153.57, "step": 10135, "token_acc": 0.9609375, "train_speed(iter/s)": 0.315673 }, { "epoch": 3.864329268292683, "grad_norm": 1.0846410989761353, "learning_rate": 1.2198269697116416e-05, "loss": 0.12988425493240358, "memory(GiB)": 153.57, "step": 10140, "token_acc": 0.940254652301665, "train_speed(iter/s)": 0.315683 }, { "epoch": 3.866234756097561, "grad_norm": 2.2168076038360596, "learning_rate": 1.215911479073335e-05, "loss": 0.10020997524261474, "memory(GiB)": 153.57, "step": 10145, "token_acc": 0.9614384748700173, "train_speed(iter/s)": 0.315694 }, { "epoch": 3.868140243902439, "grad_norm": 2.8332507610321045, "learning_rate": 1.212001412605363e-05, "loss": 0.14244717359542847, "memory(GiB)": 153.57, "step": 10150, "token_acc": 0.9423716291097155, "train_speed(iter/s)": 0.315664 }, { "epoch": 3.870045731707317, "grad_norm": 1.0109800100326538, "learning_rate": 1.20809677591248e-05, "loss": 0.13998541831970215, "memory(GiB)": 153.57, "step": 10155, "token_acc": 0.9498359118612283, "train_speed(iter/s)": 0.315673 }, { "epoch": 3.8719512195121952, "grad_norm": 1.2522181272506714, "learning_rate": 1.2041975745916472e-05, "loss": 0.15703034400939941, "memory(GiB)": 153.57, "step": 10160, "token_acc": 0.9379014989293362, "train_speed(iter/s)": 0.315688 }, { "epoch": 3.8738567073170733, "grad_norm": 2.952767848968506, "learning_rate": 1.2003038142320433e-05, "loss": 0.14631829261779786, "memory(GiB)": 153.57, "step": 10165, "token_acc": 0.945025541230844, "train_speed(iter/s)": 0.315703 }, { "epoch": 3.8757621951219514, "grad_norm": 1.5243122577667236, "learning_rate": 1.1964155004150402e-05, "loss": 0.11510473489761353, "memory(GiB)": 153.57, "step": 10170, "token_acc": 0.9521864124678469, "train_speed(iter/s)": 0.315693 }, { "epoch": 3.8776676829268295, "grad_norm": 1.5356203317642212, "learning_rate": 1.1925326387142094e-05, "loss": 0.11454654932022094, "memory(GiB)": 153.57, "step": 10175, "token_acc": 0.955948298601952, "train_speed(iter/s)": 0.315637 }, { "epoch": 3.879573170731707, "grad_norm": 2.0147159099578857, "learning_rate": 1.1886552346953017e-05, "loss": 0.13240494728088378, "memory(GiB)": 153.57, "step": 10180, "token_acc": 0.9499136442141624, "train_speed(iter/s)": 0.315658 }, { "epoch": 3.8814786585365852, "grad_norm": 1.4629461765289307, "learning_rate": 1.1847832939162496e-05, "loss": 0.1202610969543457, "memory(GiB)": 153.57, "step": 10185, "token_acc": 0.9486040609137056, "train_speed(iter/s)": 0.315665 }, { "epoch": 3.8833841463414633, "grad_norm": 2.076403856277466, "learning_rate": 1.1809168219271488e-05, "loss": 0.1271574854850769, "memory(GiB)": 153.57, "step": 10190, "token_acc": 0.9515134529147982, "train_speed(iter/s)": 0.315643 }, { "epoch": 3.8852896341463414, "grad_norm": 1.6850332021713257, "learning_rate": 1.177055824270264e-05, "loss": 0.11804933547973633, "memory(GiB)": 153.57, "step": 10195, "token_acc": 0.9527431623240007, "train_speed(iter/s)": 0.315642 }, { "epoch": 3.8871951219512195, "grad_norm": 1.394483208656311, "learning_rate": 1.1732003064800045e-05, "loss": 0.10643041133880615, "memory(GiB)": 153.57, "step": 10200, "token_acc": 0.9446476469153865, "train_speed(iter/s)": 0.315655 }, { "epoch": 3.8891006097560976, "grad_norm": 1.749753713607788, "learning_rate": 1.1693502740829315e-05, "loss": 0.11508380174636841, "memory(GiB)": 153.57, "step": 10205, "token_acc": 0.9479530960707674, "train_speed(iter/s)": 0.315665 }, { "epoch": 3.8910060975609757, "grad_norm": 1.257240653038025, "learning_rate": 1.1655057325977376e-05, "loss": 0.10545927286148071, "memory(GiB)": 153.57, "step": 10210, "token_acc": 0.9572099262240107, "train_speed(iter/s)": 0.315648 }, { "epoch": 3.892911585365854, "grad_norm": 2.0494651794433594, "learning_rate": 1.1616666875352495e-05, "loss": 0.10836185216903686, "memory(GiB)": 153.57, "step": 10215, "token_acc": 0.9550629119233074, "train_speed(iter/s)": 0.315625 }, { "epoch": 3.8948170731707314, "grad_norm": 1.5668283700942993, "learning_rate": 1.1578331443984104e-05, "loss": 0.1619330406188965, "memory(GiB)": 153.57, "step": 10220, "token_acc": 0.9250836120401338, "train_speed(iter/s)": 0.315612 }, { "epoch": 3.8967225609756095, "grad_norm": 1.1836718320846558, "learning_rate": 1.1540051086822828e-05, "loss": 0.1352340340614319, "memory(GiB)": 153.57, "step": 10225, "token_acc": 0.9547511312217195, "train_speed(iter/s)": 0.315532 }, { "epoch": 3.8986280487804876, "grad_norm": 1.3724385499954224, "learning_rate": 1.1501825858740273e-05, "loss": 0.08165214657783508, "memory(GiB)": 153.57, "step": 10230, "token_acc": 0.9641105662059741, "train_speed(iter/s)": 0.315534 }, { "epoch": 3.9005335365853657, "grad_norm": 1.5411126613616943, "learning_rate": 1.1463655814529117e-05, "loss": 0.09922332763671875, "memory(GiB)": 153.57, "step": 10235, "token_acc": 0.9625763747454175, "train_speed(iter/s)": 0.315554 }, { "epoch": 3.902439024390244, "grad_norm": 0.7563130855560303, "learning_rate": 1.1425541008902851e-05, "loss": 0.14009546041488646, "memory(GiB)": 153.57, "step": 10240, "token_acc": 0.9399070385126163, "train_speed(iter/s)": 0.315574 }, { "epoch": 3.904344512195122, "grad_norm": 1.1794179677963257, "learning_rate": 1.1387481496495845e-05, "loss": 0.12127641439437867, "memory(GiB)": 153.57, "step": 10245, "token_acc": 0.9529122963624191, "train_speed(iter/s)": 0.315566 }, { "epoch": 3.90625, "grad_norm": 1.0935447216033936, "learning_rate": 1.134947733186315e-05, "loss": 0.1649762511253357, "memory(GiB)": 153.57, "step": 10250, "token_acc": 0.9422180801491147, "train_speed(iter/s)": 0.315517 }, { "epoch": 3.908155487804878, "grad_norm": 1.2318986654281616, "learning_rate": 1.1311528569480555e-05, "loss": 0.13256968259811402, "memory(GiB)": 153.57, "step": 10255, "token_acc": 0.9441839696360795, "train_speed(iter/s)": 0.315529 }, { "epoch": 3.910060975609756, "grad_norm": 1.394962191581726, "learning_rate": 1.127363526374437e-05, "loss": 0.14239201545715333, "memory(GiB)": 153.57, "step": 10260, "token_acc": 0.9408215363193316, "train_speed(iter/s)": 0.315541 }, { "epoch": 3.9119664634146343, "grad_norm": 1.9015406370162964, "learning_rate": 1.1235797468971453e-05, "loss": 0.16214717626571656, "memory(GiB)": 153.57, "step": 10265, "token_acc": 0.9223691776883266, "train_speed(iter/s)": 0.315563 }, { "epoch": 3.9138719512195124, "grad_norm": 1.830666422843933, "learning_rate": 1.1198015239399063e-05, "loss": 0.14174840450286866, "memory(GiB)": 153.57, "step": 10270, "token_acc": 0.9532038834951456, "train_speed(iter/s)": 0.315576 }, { "epoch": 3.9157774390243905, "grad_norm": 1.4826629161834717, "learning_rate": 1.1160288629184834e-05, "loss": 0.11346104145050048, "memory(GiB)": 153.57, "step": 10275, "token_acc": 0.9529633113828786, "train_speed(iter/s)": 0.315543 }, { "epoch": 3.9176829268292686, "grad_norm": 1.5285218954086304, "learning_rate": 1.1122617692406667e-05, "loss": 0.10069547891616822, "memory(GiB)": 153.57, "step": 10280, "token_acc": 0.9542944201104552, "train_speed(iter/s)": 0.315548 }, { "epoch": 3.919588414634146, "grad_norm": 1.649736762046814, "learning_rate": 1.1085002483062668e-05, "loss": 0.11486964225769043, "memory(GiB)": 153.57, "step": 10285, "token_acc": 0.9528006267136702, "train_speed(iter/s)": 0.315552 }, { "epoch": 3.9214939024390243, "grad_norm": 1.2502200603485107, "learning_rate": 1.1047443055071033e-05, "loss": 0.10881927013397216, "memory(GiB)": 153.57, "step": 10290, "token_acc": 0.9536777806057521, "train_speed(iter/s)": 0.315565 }, { "epoch": 3.9233993902439024, "grad_norm": 0.7391106486320496, "learning_rate": 1.1009939462270041e-05, "loss": 0.11849139928817749, "memory(GiB)": 153.57, "step": 10295, "token_acc": 0.9550883898709985, "train_speed(iter/s)": 0.31556 }, { "epoch": 3.9253048780487805, "grad_norm": 1.1624078750610352, "learning_rate": 1.097249175841789e-05, "loss": 0.09486567378044128, "memory(GiB)": 153.57, "step": 10300, "token_acc": 0.9638617318435754, "train_speed(iter/s)": 0.31552 }, { "epoch": 3.9272103658536586, "grad_norm": 1.5506725311279297, "learning_rate": 1.093509999719272e-05, "loss": 0.11022601127624512, "memory(GiB)": 153.57, "step": 10305, "token_acc": 0.9508804448563485, "train_speed(iter/s)": 0.315542 }, { "epoch": 3.9291158536585367, "grad_norm": 1.2734568119049072, "learning_rate": 1.0897764232192431e-05, "loss": 0.17542943954467774, "memory(GiB)": 153.57, "step": 10310, "token_acc": 0.9394873771439584, "train_speed(iter/s)": 0.315545 }, { "epoch": 3.9310213414634148, "grad_norm": 1.1836843490600586, "learning_rate": 1.0860484516934682e-05, "loss": 0.16687018871307374, "memory(GiB)": 153.57, "step": 10315, "token_acc": 0.936804771884572, "train_speed(iter/s)": 0.315539 }, { "epoch": 3.932926829268293, "grad_norm": 2.830190420150757, "learning_rate": 1.082326090485679e-05, "loss": 0.09959838390350342, "memory(GiB)": 153.57, "step": 10320, "token_acc": 0.9593055814821728, "train_speed(iter/s)": 0.315536 }, { "epoch": 3.9348323170731705, "grad_norm": 1.0339384078979492, "learning_rate": 1.078609344931567e-05, "loss": 0.1402353048324585, "memory(GiB)": 153.57, "step": 10325, "token_acc": 0.9443250634889627, "train_speed(iter/s)": 0.315542 }, { "epoch": 3.9367378048780486, "grad_norm": 0.8112776279449463, "learning_rate": 1.0748982203587688e-05, "loss": 0.11530166864395142, "memory(GiB)": 153.57, "step": 10330, "token_acc": 0.9464158132888782, "train_speed(iter/s)": 0.315565 }, { "epoch": 3.9386432926829267, "grad_norm": 1.2682150602340698, "learning_rate": 1.0711927220868695e-05, "loss": 0.113700270652771, "memory(GiB)": 153.57, "step": 10335, "token_acc": 0.9424727441428903, "train_speed(iter/s)": 0.315538 }, { "epoch": 3.9405487804878048, "grad_norm": 1.1441675424575806, "learning_rate": 1.067492855427385e-05, "loss": 0.11194170713424682, "memory(GiB)": 153.57, "step": 10340, "token_acc": 0.9513496040084047, "train_speed(iter/s)": 0.315526 }, { "epoch": 3.942454268292683, "grad_norm": 1.6850155591964722, "learning_rate": 1.0637986256837634e-05, "loss": 0.08850868940353393, "memory(GiB)": 153.57, "step": 10345, "token_acc": 0.9578015108101068, "train_speed(iter/s)": 0.315538 }, { "epoch": 3.944359756097561, "grad_norm": 1.153041124343872, "learning_rate": 1.0601100381513667e-05, "loss": 0.10808827877044677, "memory(GiB)": 153.57, "step": 10350, "token_acc": 0.9577308120133482, "train_speed(iter/s)": 0.315498 }, { "epoch": 3.946265243902439, "grad_norm": 0.8345314264297485, "learning_rate": 1.0564270981174746e-05, "loss": 0.0792979657649994, "memory(GiB)": 153.57, "step": 10355, "token_acc": 0.9593872461702886, "train_speed(iter/s)": 0.315503 }, { "epoch": 3.948170731707317, "grad_norm": 1.2244489192962646, "learning_rate": 1.0527498108612694e-05, "loss": 0.09929026365280151, "memory(GiB)": 153.57, "step": 10360, "token_acc": 0.9574638844301766, "train_speed(iter/s)": 0.315502 }, { "epoch": 3.9500762195121952, "grad_norm": 1.5677422285079956, "learning_rate": 1.0490781816538326e-05, "loss": 0.11203696727752685, "memory(GiB)": 153.57, "step": 10365, "token_acc": 0.9471639471639471, "train_speed(iter/s)": 0.315514 }, { "epoch": 3.9519817073170733, "grad_norm": 1.9769266843795776, "learning_rate": 1.0454122157581304e-05, "loss": 0.12433789968490601, "memory(GiB)": 153.57, "step": 10370, "token_acc": 0.9627272727272728, "train_speed(iter/s)": 0.315503 }, { "epoch": 3.9538871951219514, "grad_norm": 1.6246246099472046, "learning_rate": 1.041751918429017e-05, "loss": 0.12678325176239014, "memory(GiB)": 153.57, "step": 10375, "token_acc": 0.942575684766939, "train_speed(iter/s)": 0.315466 }, { "epoch": 3.9557926829268295, "grad_norm": 1.0176761150360107, "learning_rate": 1.0380972949132167e-05, "loss": 0.0999096155166626, "memory(GiB)": 153.57, "step": 10380, "token_acc": 0.9612676056338029, "train_speed(iter/s)": 0.315494 }, { "epoch": 3.957698170731707, "grad_norm": 0.9890904426574707, "learning_rate": 1.0344483504493246e-05, "loss": 0.09484732151031494, "memory(GiB)": 153.57, "step": 10385, "token_acc": 0.9627599548605513, "train_speed(iter/s)": 0.315494 }, { "epoch": 3.9596036585365852, "grad_norm": 1.7178586721420288, "learning_rate": 1.0308050902677906e-05, "loss": 0.14851861000061034, "memory(GiB)": 153.57, "step": 10390, "token_acc": 0.9398023360287511, "train_speed(iter/s)": 0.315508 }, { "epoch": 3.9615091463414633, "grad_norm": 1.7955299615859985, "learning_rate": 1.0271675195909219e-05, "loss": 0.1414152979850769, "memory(GiB)": 153.57, "step": 10395, "token_acc": 0.932578486875965, "train_speed(iter/s)": 0.315524 }, { "epoch": 3.9634146341463414, "grad_norm": 1.5836018323898315, "learning_rate": 1.0235356436328675e-05, "loss": 0.08856061100959778, "memory(GiB)": 153.57, "step": 10400, "token_acc": 0.960602310231023, "train_speed(iter/s)": 0.315487 }, { "epoch": 3.9634146341463414, "eval_loss": 0.1593502163887024, "eval_runtime": 33.1362, "eval_samples_per_second": 3.199, "eval_steps_per_second": 3.199, "eval_token_acc": 0.9267789791299975, "step": 10400 }, { "epoch": 3.9653201219512195, "grad_norm": 1.6715800762176514, "learning_rate": 1.0199094675996145e-05, "loss": 0.15254191160202027, "memory(GiB)": 153.57, "step": 10405, "token_acc": 0.9284648505776527, "train_speed(iter/s)": 0.315194 }, { "epoch": 3.9672256097560976, "grad_norm": 0.9192164540290833, "learning_rate": 1.0162889966889772e-05, "loss": 0.13202561140060426, "memory(GiB)": 153.57, "step": 10410, "token_acc": 0.9522120905099628, "train_speed(iter/s)": 0.315195 }, { "epoch": 3.9691310975609757, "grad_norm": 1.4465471506118774, "learning_rate": 1.0126742360905966e-05, "loss": 0.09850500822067261, "memory(GiB)": 153.57, "step": 10415, "token_acc": 0.9598878963040812, "train_speed(iter/s)": 0.315183 }, { "epoch": 3.971036585365854, "grad_norm": 1.6577435731887817, "learning_rate": 1.0090651909859228e-05, "loss": 0.11531217098236084, "memory(GiB)": 153.57, "step": 10420, "token_acc": 0.9471282877079978, "train_speed(iter/s)": 0.315193 }, { "epoch": 3.9729420731707314, "grad_norm": 2.0910165309906006, "learning_rate": 1.0054618665482197e-05, "loss": 0.13827275037765502, "memory(GiB)": 153.57, "step": 10425, "token_acc": 0.9522552255225523, "train_speed(iter/s)": 0.315125 }, { "epoch": 3.9748475609756095, "grad_norm": 1.614781141281128, "learning_rate": 1.0018642679425433e-05, "loss": 0.12148494720458984, "memory(GiB)": 153.57, "step": 10430, "token_acc": 0.952274630198158, "train_speed(iter/s)": 0.315144 }, { "epoch": 3.9767530487804876, "grad_norm": 1.3443634510040283, "learning_rate": 9.982724003257493e-06, "loss": 0.12529168128967286, "memory(GiB)": 153.57, "step": 10435, "token_acc": 0.9461678832116789, "train_speed(iter/s)": 0.315171 }, { "epoch": 3.9786585365853657, "grad_norm": 1.917319655418396, "learning_rate": 9.946862688464753e-06, "loss": 0.10258030891418457, "memory(GiB)": 153.57, "step": 10440, "token_acc": 0.9591335417597141, "train_speed(iter/s)": 0.315185 }, { "epoch": 3.980564024390244, "grad_norm": 1.7500046491622925, "learning_rate": 9.911058786451383e-06, "loss": 0.11640030145645142, "memory(GiB)": 153.57, "step": 10445, "token_acc": 0.9496021220159151, "train_speed(iter/s)": 0.315121 }, { "epoch": 3.982469512195122, "grad_norm": 1.5394132137298584, "learning_rate": 9.875312348539212e-06, "loss": 0.10729349851608276, "memory(GiB)": 153.57, "step": 10450, "token_acc": 0.9552864462040056, "train_speed(iter/s)": 0.315072 }, { "epoch": 3.984375, "grad_norm": 1.7001394033432007, "learning_rate": 9.83962342596776e-06, "loss": 0.11789286136627197, "memory(GiB)": 153.57, "step": 10455, "token_acc": 0.9580376427028695, "train_speed(iter/s)": 0.315101 }, { "epoch": 3.986280487804878, "grad_norm": 1.0840353965759277, "learning_rate": 9.803992069894053e-06, "loss": 0.14647552967071534, "memory(GiB)": 153.57, "step": 10460, "token_acc": 0.9472075010191602, "train_speed(iter/s)": 0.315108 }, { "epoch": 3.988185975609756, "grad_norm": 0.9740680456161499, "learning_rate": 9.768418331392637e-06, "loss": 0.11112321615219116, "memory(GiB)": 153.57, "step": 10465, "token_acc": 0.9563265306122449, "train_speed(iter/s)": 0.31509 }, { "epoch": 3.9900914634146343, "grad_norm": 0.8366380333900452, "learning_rate": 9.732902261455434e-06, "loss": 0.10510437488555908, "memory(GiB)": 153.57, "step": 10470, "token_acc": 0.9533855052739026, "train_speed(iter/s)": 0.315098 }, { "epoch": 3.9919969512195124, "grad_norm": 3.3244292736053467, "learning_rate": 9.697443910991755e-06, "loss": 0.11702111959457398, "memory(GiB)": 153.57, "step": 10475, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.315118 }, { "epoch": 3.9939024390243905, "grad_norm": 1.1119053363800049, "learning_rate": 9.662043330828085e-06, "loss": 0.15201900005340577, "memory(GiB)": 153.57, "step": 10480, "token_acc": 0.9482429855625171, "train_speed(iter/s)": 0.31514 }, { "epoch": 3.9958079268292686, "grad_norm": 0.6949253678321838, "learning_rate": 9.626700571708219e-06, "loss": 0.11317740678787232, "memory(GiB)": 153.57, "step": 10485, "token_acc": 0.9515620521639439, "train_speed(iter/s)": 0.315156 }, { "epoch": 3.997713414634146, "grad_norm": 1.4169353246688843, "learning_rate": 9.591415684292975e-06, "loss": 0.12291057109832763, "memory(GiB)": 153.57, "step": 10490, "token_acc": 0.9487284659557014, "train_speed(iter/s)": 0.315141 }, { "epoch": 3.9996189024390243, "grad_norm": 1.182724952697754, "learning_rate": 9.556188719160263e-06, "loss": 0.09388527870178223, "memory(GiB)": 153.57, "step": 10495, "token_acc": 0.9605475040257649, "train_speed(iter/s)": 0.315163 }, { "epoch": 4.001524390243903, "grad_norm": 0.9365359544754028, "learning_rate": 9.521019726804936e-06, "loss": 0.11818516254425049, "memory(GiB)": 153.57, "step": 10500, "token_acc": 0.9527777777777777, "train_speed(iter/s)": 0.315189 }, { "epoch": 4.003429878048781, "grad_norm": 1.3493191003799438, "learning_rate": 9.48590875763879e-06, "loss": 0.08004541397094726, "memory(GiB)": 153.57, "step": 10505, "token_acc": 0.968596395412343, "train_speed(iter/s)": 0.315194 }, { "epoch": 4.005335365853658, "grad_norm": 1.3283686637878418, "learning_rate": 9.450855861990394e-06, "loss": 0.13557007312774658, "memory(GiB)": 153.57, "step": 10510, "token_acc": 0.9502412545235223, "train_speed(iter/s)": 0.315215 }, { "epoch": 4.007240853658536, "grad_norm": 1.0819870233535767, "learning_rate": 9.415861090105138e-06, "loss": 0.10911916494369507, "memory(GiB)": 153.57, "step": 10515, "token_acc": 0.9574765422239968, "train_speed(iter/s)": 0.315181 }, { "epoch": 4.009146341463414, "grad_norm": 1.0804154872894287, "learning_rate": 9.380924492145032e-06, "loss": 0.07676995396614075, "memory(GiB)": 153.57, "step": 10520, "token_acc": 0.970107163000564, "train_speed(iter/s)": 0.315154 }, { "epoch": 4.011051829268292, "grad_norm": 2.7015035152435303, "learning_rate": 9.346046118188744e-06, "loss": 0.1331552743911743, "memory(GiB)": 153.57, "step": 10525, "token_acc": 0.9464978030498837, "train_speed(iter/s)": 0.31517 }, { "epoch": 4.0129573170731705, "grad_norm": 1.0405958890914917, "learning_rate": 9.311226018231467e-06, "loss": 0.08280447721481324, "memory(GiB)": 153.57, "step": 10530, "token_acc": 0.9653943264518046, "train_speed(iter/s)": 0.315127 }, { "epoch": 4.014862804878049, "grad_norm": 0.8672987818717957, "learning_rate": 9.276464242184885e-06, "loss": 0.08994803428649903, "memory(GiB)": 153.57, "step": 10535, "token_acc": 0.9666851749028318, "train_speed(iter/s)": 0.315149 }, { "epoch": 4.016768292682927, "grad_norm": 1.5098129510879517, "learning_rate": 9.241760839877022e-06, "loss": 0.08241230845451356, "memory(GiB)": 153.57, "step": 10540, "token_acc": 0.9704968944099379, "train_speed(iter/s)": 0.315175 }, { "epoch": 4.018673780487805, "grad_norm": 1.0130987167358398, "learning_rate": 9.207115861052301e-06, "loss": 0.06117960214614868, "memory(GiB)": 153.57, "step": 10545, "token_acc": 0.974644640799078, "train_speed(iter/s)": 0.315187 }, { "epoch": 4.020579268292683, "grad_norm": 0.9799622297286987, "learning_rate": 9.172529355371328e-06, "loss": 0.11690658330917358, "memory(GiB)": 153.57, "step": 10550, "token_acc": 0.9529810043257476, "train_speed(iter/s)": 0.315198 }, { "epoch": 4.022484756097561, "grad_norm": 1.7227221727371216, "learning_rate": 9.138001372410965e-06, "loss": 0.11891278028488159, "memory(GiB)": 153.57, "step": 10555, "token_acc": 0.9494207864251917, "train_speed(iter/s)": 0.315202 }, { "epoch": 4.024390243902439, "grad_norm": 1.6567440032958984, "learning_rate": 9.103531961664118e-06, "loss": 0.07341548800468445, "memory(GiB)": 153.57, "step": 10560, "token_acc": 0.9632082413539367, "train_speed(iter/s)": 0.315228 }, { "epoch": 4.026295731707317, "grad_norm": 1.2259862422943115, "learning_rate": 9.069121172539791e-06, "loss": 0.11724125146865845, "memory(GiB)": 153.57, "step": 10565, "token_acc": 0.9522781774580336, "train_speed(iter/s)": 0.315209 }, { "epoch": 4.028201219512195, "grad_norm": 2.6820926666259766, "learning_rate": 9.034769054362919e-06, "loss": 0.10814363956451416, "memory(GiB)": 153.57, "step": 10570, "token_acc": 0.9598330725091289, "train_speed(iter/s)": 0.315217 }, { "epoch": 4.030106707317073, "grad_norm": 2.375070571899414, "learning_rate": 9.00047565637438e-06, "loss": 0.11838709115982056, "memory(GiB)": 153.57, "step": 10575, "token_acc": 0.9520749665327979, "train_speed(iter/s)": 0.315236 }, { "epoch": 4.032012195121951, "grad_norm": 1.2127223014831543, "learning_rate": 8.966241027730821e-06, "loss": 0.08515911102294922, "memory(GiB)": 153.57, "step": 10580, "token_acc": 0.9639230358097274, "train_speed(iter/s)": 0.315253 }, { "epoch": 4.0339176829268295, "grad_norm": 1.3575879335403442, "learning_rate": 8.93206521750471e-06, "loss": 0.0862622618675232, "memory(GiB)": 153.57, "step": 10585, "token_acc": 0.9679330065359477, "train_speed(iter/s)": 0.315253 }, { "epoch": 4.035823170731708, "grad_norm": 1.0465242862701416, "learning_rate": 8.897948274684153e-06, "loss": 0.093155437707901, "memory(GiB)": 153.57, "step": 10590, "token_acc": 0.9639922667955534, "train_speed(iter/s)": 0.315263 }, { "epoch": 4.037728658536586, "grad_norm": 1.3334078788757324, "learning_rate": 8.863890248172918e-06, "loss": 0.10436415672302246, "memory(GiB)": 153.57, "step": 10595, "token_acc": 0.9554891710231516, "train_speed(iter/s)": 0.315231 }, { "epoch": 4.039634146341464, "grad_norm": 1.4503023624420166, "learning_rate": 8.829891186790279e-06, "loss": 0.10477622747421264, "memory(GiB)": 153.57, "step": 10600, "token_acc": 0.9647908516400843, "train_speed(iter/s)": 0.315247 }, { "epoch": 4.041539634146342, "grad_norm": 1.800877332687378, "learning_rate": 8.795951139271025e-06, "loss": 0.0464599609375, "memory(GiB)": 153.57, "step": 10605, "token_acc": 0.978890023833844, "train_speed(iter/s)": 0.315263 }, { "epoch": 4.043445121951219, "grad_norm": 1.2676774263381958, "learning_rate": 8.762070154265345e-06, "loss": 0.08200929164886475, "memory(GiB)": 153.57, "step": 10610, "token_acc": 0.9670257488176563, "train_speed(iter/s)": 0.315253 }, { "epoch": 4.045350609756097, "grad_norm": 2.6687850952148438, "learning_rate": 8.72824828033874e-06, "loss": 0.10514997243881226, "memory(GiB)": 153.57, "step": 10615, "token_acc": 0.9618400418191323, "train_speed(iter/s)": 0.315267 }, { "epoch": 4.047256097560975, "grad_norm": 1.4682490825653076, "learning_rate": 8.694485565972016e-06, "loss": 0.06241785287857056, "memory(GiB)": 153.57, "step": 10620, "token_acc": 0.9711576451995259, "train_speed(iter/s)": 0.315255 }, { "epoch": 4.049161585365853, "grad_norm": 1.698265552520752, "learning_rate": 8.660782059561167e-06, "loss": 0.10221883058547973, "memory(GiB)": 153.57, "step": 10625, "token_acc": 0.9646670055922725, "train_speed(iter/s)": 0.315272 }, { "epoch": 4.0510670731707314, "grad_norm": 1.6267180442810059, "learning_rate": 8.627137809417302e-06, "loss": 0.1070255994796753, "memory(GiB)": 153.57, "step": 10630, "token_acc": 0.9609113245476882, "train_speed(iter/s)": 0.315247 }, { "epoch": 4.0529725609756095, "grad_norm": 1.0320191383361816, "learning_rate": 8.593552863766624e-06, "loss": 0.1211356520652771, "memory(GiB)": 153.57, "step": 10635, "token_acc": 0.9577867473932055, "train_speed(iter/s)": 0.315246 }, { "epoch": 4.054878048780488, "grad_norm": 1.720259428024292, "learning_rate": 8.560027270750277e-06, "loss": 0.10955535173416138, "memory(GiB)": 153.57, "step": 10640, "token_acc": 0.951487414187643, "train_speed(iter/s)": 0.315275 }, { "epoch": 4.056783536585366, "grad_norm": 1.4549100399017334, "learning_rate": 8.526561078424377e-06, "loss": 0.10669156312942504, "memory(GiB)": 153.57, "step": 10645, "token_acc": 0.9590361445783132, "train_speed(iter/s)": 0.315219 }, { "epoch": 4.058689024390244, "grad_norm": 2.258918046951294, "learning_rate": 8.493154334759883e-06, "loss": 0.12934715747833253, "memory(GiB)": 153.57, "step": 10650, "token_acc": 0.9433088034886891, "train_speed(iter/s)": 0.31523 }, { "epoch": 4.060594512195122, "grad_norm": 0.8288207650184631, "learning_rate": 8.459807087642518e-06, "loss": 0.06750277280807496, "memory(GiB)": 153.57, "step": 10655, "token_acc": 0.977325312355391, "train_speed(iter/s)": 0.315258 }, { "epoch": 4.0625, "grad_norm": 1.1402863264083862, "learning_rate": 8.426519384872733e-06, "loss": 0.10719895362854004, "memory(GiB)": 153.57, "step": 10660, "token_acc": 0.9584588893671976, "train_speed(iter/s)": 0.315235 }, { "epoch": 4.064405487804878, "grad_norm": 1.3577643632888794, "learning_rate": 8.393291274165654e-06, "loss": 0.11247930526733399, "memory(GiB)": 153.57, "step": 10665, "token_acc": 0.9504158669225847, "train_speed(iter/s)": 0.315256 }, { "epoch": 4.066310975609756, "grad_norm": 0.9213225841522217, "learning_rate": 8.360122803150938e-06, "loss": 0.09559518098831177, "memory(GiB)": 153.57, "step": 10670, "token_acc": 0.963850444123115, "train_speed(iter/s)": 0.315272 }, { "epoch": 4.068216463414634, "grad_norm": 1.3207772970199585, "learning_rate": 8.327014019372792e-06, "loss": 0.09787349700927735, "memory(GiB)": 153.57, "step": 10675, "token_acc": 0.9581212021678437, "train_speed(iter/s)": 0.31527 }, { "epoch": 4.070121951219512, "grad_norm": 1.5345057249069214, "learning_rate": 8.293964970289841e-06, "loss": 0.11561622619628906, "memory(GiB)": 153.57, "step": 10680, "token_acc": 0.9542637946296842, "train_speed(iter/s)": 0.315288 }, { "epoch": 4.0720274390243905, "grad_norm": 1.143996000289917, "learning_rate": 8.260975703275103e-06, "loss": 0.08596771955490112, "memory(GiB)": 153.57, "step": 10685, "token_acc": 0.9616578014184397, "train_speed(iter/s)": 0.315261 }, { "epoch": 4.0739329268292686, "grad_norm": 2.2221944332122803, "learning_rate": 8.228046265615908e-06, "loss": 0.10367131233215332, "memory(GiB)": 153.57, "step": 10690, "token_acc": 0.9560439560439561, "train_speed(iter/s)": 0.315283 }, { "epoch": 4.075838414634147, "grad_norm": 1.0494986772537231, "learning_rate": 8.195176704513796e-06, "loss": 0.07588052749633789, "memory(GiB)": 153.57, "step": 10695, "token_acc": 0.970771850170261, "train_speed(iter/s)": 0.315301 }, { "epoch": 4.077743902439025, "grad_norm": 1.724025845527649, "learning_rate": 8.162367067084525e-06, "loss": 0.10157213211059571, "memory(GiB)": 153.57, "step": 10700, "token_acc": 0.9696241258741258, "train_speed(iter/s)": 0.315304 }, { "epoch": 4.079649390243903, "grad_norm": 1.1636123657226562, "learning_rate": 8.129617400357897e-06, "loss": 0.0785775899887085, "memory(GiB)": 153.57, "step": 10705, "token_acc": 0.9707832573559884, "train_speed(iter/s)": 0.315266 }, { "epoch": 4.081554878048781, "grad_norm": 1.3439143896102905, "learning_rate": 8.096927751277805e-06, "loss": 0.08933408260345459, "memory(GiB)": 153.57, "step": 10710, "token_acc": 0.9586343930635838, "train_speed(iter/s)": 0.315272 }, { "epoch": 4.083460365853658, "grad_norm": 1.2433946132659912, "learning_rate": 8.0642981667021e-06, "loss": 0.07502334117889405, "memory(GiB)": 153.57, "step": 10715, "token_acc": 0.9722846441947566, "train_speed(iter/s)": 0.315218 }, { "epoch": 4.085365853658536, "grad_norm": 2.1483325958251953, "learning_rate": 8.031728693402502e-06, "loss": 0.08831688165664672, "memory(GiB)": 153.57, "step": 10720, "token_acc": 0.9627059843885516, "train_speed(iter/s)": 0.31524 }, { "epoch": 4.087271341463414, "grad_norm": 1.2522408962249756, "learning_rate": 7.99921937806461e-06, "loss": 0.0796471118927002, "memory(GiB)": 153.57, "step": 10725, "token_acc": 0.968142422112907, "train_speed(iter/s)": 0.315252 }, { "epoch": 4.089176829268292, "grad_norm": 1.2630137205123901, "learning_rate": 7.966770267287781e-06, "loss": 0.0868294358253479, "memory(GiB)": 153.57, "step": 10730, "token_acc": 0.9596495885319883, "train_speed(iter/s)": 0.315272 }, { "epoch": 4.0910823170731705, "grad_norm": 2.0246028900146484, "learning_rate": 7.934381407585045e-06, "loss": 0.08449790477752686, "memory(GiB)": 153.57, "step": 10735, "token_acc": 0.9592608147837043, "train_speed(iter/s)": 0.315274 }, { "epoch": 4.092987804878049, "grad_norm": 1.5401612520217896, "learning_rate": 7.902052845383112e-06, "loss": 0.08534547090530395, "memory(GiB)": 153.57, "step": 10740, "token_acc": 0.9667607400439009, "train_speed(iter/s)": 0.315286 }, { "epoch": 4.094893292682927, "grad_norm": 1.338455080986023, "learning_rate": 7.869784627022214e-06, "loss": 0.08437874317169189, "memory(GiB)": 153.57, "step": 10745, "token_acc": 0.9648105181747874, "train_speed(iter/s)": 0.315288 }, { "epoch": 4.096798780487805, "grad_norm": 1.9307080507278442, "learning_rate": 7.837576798756125e-06, "loss": 0.08517765998840332, "memory(GiB)": 153.57, "step": 10750, "token_acc": 0.9682824655894674, "train_speed(iter/s)": 0.315302 }, { "epoch": 4.098704268292683, "grad_norm": 1.845447063446045, "learning_rate": 7.805429406752024e-06, "loss": 0.07496689558029175, "memory(GiB)": 153.57, "step": 10755, "token_acc": 0.9647588102974256, "train_speed(iter/s)": 0.315313 }, { "epoch": 4.100609756097561, "grad_norm": 1.6665014028549194, "learning_rate": 7.773342497090486e-06, "loss": 0.07084813117980956, "memory(GiB)": 153.57, "step": 10760, "token_acc": 0.9700923323966278, "train_speed(iter/s)": 0.315278 }, { "epoch": 4.102515243902439, "grad_norm": 2.7782135009765625, "learning_rate": 7.741316115765367e-06, "loss": 0.107193922996521, "memory(GiB)": 153.57, "step": 10765, "token_acc": 0.95750609543713, "train_speed(iter/s)": 0.315296 }, { "epoch": 4.104420731707317, "grad_norm": 1.8996915817260742, "learning_rate": 7.7093503086838e-06, "loss": 0.09660684466361999, "memory(GiB)": 153.57, "step": 10770, "token_acc": 0.9622689938398358, "train_speed(iter/s)": 0.315309 }, { "epoch": 4.106326219512195, "grad_norm": 0.9840500354766846, "learning_rate": 7.677445121666022e-06, "loss": 0.07227116823196411, "memory(GiB)": 153.57, "step": 10775, "token_acc": 0.9559579153413261, "train_speed(iter/s)": 0.315331 }, { "epoch": 4.108231707317073, "grad_norm": 2.3252828121185303, "learning_rate": 7.645600600445451e-06, "loss": 0.09470783472061158, "memory(GiB)": 153.57, "step": 10780, "token_acc": 0.9662477558348295, "train_speed(iter/s)": 0.315341 }, { "epoch": 4.110137195121951, "grad_norm": 1.562300443649292, "learning_rate": 7.613816790668477e-06, "loss": 0.12733618021011353, "memory(GiB)": 153.57, "step": 10785, "token_acc": 0.9524163568773234, "train_speed(iter/s)": 0.315329 }, { "epoch": 4.1120426829268295, "grad_norm": 1.8150243759155273, "learning_rate": 7.58209373789453e-06, "loss": 0.09805129766464234, "memory(GiB)": 153.57, "step": 10790, "token_acc": 0.9585616438356165, "train_speed(iter/s)": 0.315356 }, { "epoch": 4.113948170731708, "grad_norm": 1.725007176399231, "learning_rate": 7.550431487595894e-06, "loss": 0.11369028091430664, "memory(GiB)": 153.57, "step": 10795, "token_acc": 0.9521800281293952, "train_speed(iter/s)": 0.315356 }, { "epoch": 4.115853658536586, "grad_norm": 1.277803897857666, "learning_rate": 7.518830085157735e-06, "loss": 0.0805463194847107, "memory(GiB)": 153.57, "step": 10800, "token_acc": 0.9651240778001341, "train_speed(iter/s)": 0.315367 }, { "epoch": 4.117759146341464, "grad_norm": 1.057202696800232, "learning_rate": 7.487289575877981e-06, "loss": 0.09631685018539429, "memory(GiB)": 153.57, "step": 10805, "token_acc": 0.9601063829787234, "train_speed(iter/s)": 0.315403 }, { "epoch": 4.119664634146342, "grad_norm": 1.274109125137329, "learning_rate": 7.455810004967301e-06, "loss": 0.0692741870880127, "memory(GiB)": 153.57, "step": 10810, "token_acc": 0.9743223965763196, "train_speed(iter/s)": 0.315401 }, { "epoch": 4.121570121951219, "grad_norm": 1.6566963195800781, "learning_rate": 7.42439141754896e-06, "loss": 0.08956168293952942, "memory(GiB)": 153.57, "step": 10815, "token_acc": 0.9612903225806452, "train_speed(iter/s)": 0.315372 }, { "epoch": 4.123475609756097, "grad_norm": 2.6689743995666504, "learning_rate": 7.393033858658871e-06, "loss": 0.15076417922973634, "memory(GiB)": 153.57, "step": 10820, "token_acc": 0.948051948051948, "train_speed(iter/s)": 0.315389 }, { "epoch": 4.125381097560975, "grad_norm": 1.9779152870178223, "learning_rate": 7.361737373245414e-06, "loss": 0.09615955352783204, "memory(GiB)": 153.57, "step": 10825, "token_acc": 0.9597058823529412, "train_speed(iter/s)": 0.315411 }, { "epoch": 4.127286585365853, "grad_norm": 1.3585253953933716, "learning_rate": 7.33050200616947e-06, "loss": 0.11969186067581176, "memory(GiB)": 153.57, "step": 10830, "token_acc": 0.9500739644970414, "train_speed(iter/s)": 0.315433 }, { "epoch": 4.1291920731707314, "grad_norm": 1.5228625535964966, "learning_rate": 7.299327802204275e-06, "loss": 0.06458817720413208, "memory(GiB)": 153.57, "step": 10835, "token_acc": 0.9746835443037974, "train_speed(iter/s)": 0.315464 }, { "epoch": 4.1310975609756095, "grad_norm": 1.4344878196716309, "learning_rate": 7.268214806035423e-06, "loss": 0.1165360689163208, "memory(GiB)": 153.57, "step": 10840, "token_acc": 0.9516160626836435, "train_speed(iter/s)": 0.315447 }, { "epoch": 4.133003048780488, "grad_norm": 1.3986989259719849, "learning_rate": 7.237163062260732e-06, "loss": 0.09414119720458984, "memory(GiB)": 153.57, "step": 10845, "token_acc": 0.9582926829268292, "train_speed(iter/s)": 0.315453 }, { "epoch": 4.134908536585366, "grad_norm": 1.902741551399231, "learning_rate": 7.206172615390294e-06, "loss": 0.09143221974372864, "memory(GiB)": 153.57, "step": 10850, "token_acc": 0.9598853868194842, "train_speed(iter/s)": 0.315452 }, { "epoch": 4.136814024390244, "grad_norm": 1.1338527202606201, "learning_rate": 7.175243509846247e-06, "loss": 0.12482341527938842, "memory(GiB)": 153.57, "step": 10855, "token_acc": 0.9497041420118343, "train_speed(iter/s)": 0.31544 }, { "epoch": 4.138719512195122, "grad_norm": 1.0454462766647339, "learning_rate": 7.144375789962865e-06, "loss": 0.09484012722969055, "memory(GiB)": 153.57, "step": 10860, "token_acc": 0.9638669538167025, "train_speed(iter/s)": 0.315451 }, { "epoch": 4.140625, "grad_norm": 1.3755942583084106, "learning_rate": 7.1135694999864e-06, "loss": 0.0945687711238861, "memory(GiB)": 153.57, "step": 10865, "token_acc": 0.9638637943015983, "train_speed(iter/s)": 0.315473 }, { "epoch": 4.142530487804878, "grad_norm": 1.2689404487609863, "learning_rate": 7.082824684075068e-06, "loss": 0.12032040357589721, "memory(GiB)": 153.57, "step": 10870, "token_acc": 0.9557751117384145, "train_speed(iter/s)": 0.315481 }, { "epoch": 4.144435975609756, "grad_norm": 2.1881251335144043, "learning_rate": 7.052141386298944e-06, "loss": 0.06482705473899841, "memory(GiB)": 153.57, "step": 10875, "token_acc": 0.9705315340126687, "train_speed(iter/s)": 0.315492 }, { "epoch": 4.146341463414634, "grad_norm": 1.480611801147461, "learning_rate": 7.0215196506399515e-06, "loss": 0.10238081216812134, "memory(GiB)": 153.57, "step": 10880, "token_acc": 0.9569520039584364, "train_speed(iter/s)": 0.315469 }, { "epoch": 4.148246951219512, "grad_norm": 1.7641735076904297, "learning_rate": 6.990959520991741e-06, "loss": 0.12017501592636108, "memory(GiB)": 153.57, "step": 10885, "token_acc": 0.9595231494316606, "train_speed(iter/s)": 0.315473 }, { "epoch": 4.1501524390243905, "grad_norm": 1.5983552932739258, "learning_rate": 6.960461041159677e-06, "loss": 0.10784424543380737, "memory(GiB)": 153.57, "step": 10890, "token_acc": 0.9553444180522566, "train_speed(iter/s)": 0.315437 }, { "epoch": 4.1520579268292686, "grad_norm": 1.7701127529144287, "learning_rate": 6.9300242548607395e-06, "loss": 0.10369609594345093, "memory(GiB)": 153.57, "step": 10895, "token_acc": 0.96090719188302, "train_speed(iter/s)": 0.315411 }, { "epoch": 4.153963414634147, "grad_norm": 0.8944533467292786, "learning_rate": 6.899649205723507e-06, "loss": 0.08880216479301453, "memory(GiB)": 153.57, "step": 10900, "token_acc": 0.9648183556405354, "train_speed(iter/s)": 0.315423 }, { "epoch": 4.155868902439025, "grad_norm": 1.5162708759307861, "learning_rate": 6.869335937288007e-06, "loss": 0.08160734176635742, "memory(GiB)": 153.57, "step": 10905, "token_acc": 0.9689129981222616, "train_speed(iter/s)": 0.315436 }, { "epoch": 4.157774390243903, "grad_norm": 1.0665161609649658, "learning_rate": 6.839084493005771e-06, "loss": 0.09529926776885986, "memory(GiB)": 153.57, "step": 10910, "token_acc": 0.9663424124513619, "train_speed(iter/s)": 0.315447 }, { "epoch": 4.159679878048781, "grad_norm": 1.506656527519226, "learning_rate": 6.808894916239655e-06, "loss": 0.09687199592590331, "memory(GiB)": 153.57, "step": 10915, "token_acc": 0.9636631475570833, "train_speed(iter/s)": 0.315457 }, { "epoch": 4.161585365853658, "grad_norm": 1.1064687967300415, "learning_rate": 6.7787672502638844e-06, "loss": 0.09500679969787598, "memory(GiB)": 153.57, "step": 10920, "token_acc": 0.9595015576323987, "train_speed(iter/s)": 0.315461 }, { "epoch": 4.163490853658536, "grad_norm": 1.6410852670669556, "learning_rate": 6.748701538263891e-06, "loss": 0.0864219069480896, "memory(GiB)": 153.57, "step": 10925, "token_acc": 0.964851086664878, "train_speed(iter/s)": 0.31547 }, { "epoch": 4.165396341463414, "grad_norm": 1.4071049690246582, "learning_rate": 6.71869782333634e-06, "loss": 0.1215739369392395, "memory(GiB)": 153.57, "step": 10930, "token_acc": 0.9684896776530243, "train_speed(iter/s)": 0.31549 }, { "epoch": 4.167301829268292, "grad_norm": 2.401376724243164, "learning_rate": 6.6887561484890175e-06, "loss": 0.0943516194820404, "memory(GiB)": 153.57, "step": 10935, "token_acc": 0.9628990509059534, "train_speed(iter/s)": 0.315503 }, { "epoch": 4.1692073170731705, "grad_norm": 1.43844735622406, "learning_rate": 6.65887655664078e-06, "loss": 0.08427236080169678, "memory(GiB)": 153.57, "step": 10940, "token_acc": 0.9603125, "train_speed(iter/s)": 0.315486 }, { "epoch": 4.171112804878049, "grad_norm": 1.318272590637207, "learning_rate": 6.629059090621481e-06, "loss": 0.09028308391571045, "memory(GiB)": 153.57, "step": 10945, "token_acc": 0.9702380952380952, "train_speed(iter/s)": 0.315486 }, { "epoch": 4.173018292682927, "grad_norm": 1.197618007659912, "learning_rate": 6.599303793171946e-06, "loss": 0.11356860399246216, "memory(GiB)": 153.57, "step": 10950, "token_acc": 0.9511222991399203, "train_speed(iter/s)": 0.315491 }, { "epoch": 4.174923780487805, "grad_norm": 1.5447841882705688, "learning_rate": 6.569610706943852e-06, "loss": 0.07365354299545288, "memory(GiB)": 153.57, "step": 10955, "token_acc": 0.9674054758800521, "train_speed(iter/s)": 0.315507 }, { "epoch": 4.176829268292683, "grad_norm": 1.97136652469635, "learning_rate": 6.539979874499747e-06, "loss": 0.11800072193145753, "memory(GiB)": 153.57, "step": 10960, "token_acc": 0.9581690757774858, "train_speed(iter/s)": 0.31552 }, { "epoch": 4.178734756097561, "grad_norm": 1.4230834245681763, "learning_rate": 6.510411338312894e-06, "loss": 0.09368343949317932, "memory(GiB)": 153.57, "step": 10965, "token_acc": 0.9587079587079587, "train_speed(iter/s)": 0.315543 }, { "epoch": 4.180640243902439, "grad_norm": 1.2982308864593506, "learning_rate": 6.480905140767301e-06, "loss": 0.08461216688156128, "memory(GiB)": 153.57, "step": 10970, "token_acc": 0.9581932773109244, "train_speed(iter/s)": 0.315559 }, { "epoch": 4.182545731707317, "grad_norm": 1.228634238243103, "learning_rate": 6.451461324157604e-06, "loss": 0.10939806699752808, "memory(GiB)": 153.57, "step": 10975, "token_acc": 0.953581969412396, "train_speed(iter/s)": 0.315578 }, { "epoch": 4.184451219512195, "grad_norm": 2.0599472522735596, "learning_rate": 6.422079930689024e-06, "loss": 0.12104350328445435, "memory(GiB)": 153.57, "step": 10980, "token_acc": 0.9516700404858299, "train_speed(iter/s)": 0.315591 }, { "epoch": 4.186356707317073, "grad_norm": 1.3813929557800293, "learning_rate": 6.392761002477277e-06, "loss": 0.07927249073982238, "memory(GiB)": 153.57, "step": 10985, "token_acc": 0.9669764354463989, "train_speed(iter/s)": 0.315593 }, { "epoch": 4.188262195121951, "grad_norm": 1.4582067728042603, "learning_rate": 6.363504581548591e-06, "loss": 0.10623682737350464, "memory(GiB)": 153.57, "step": 10990, "token_acc": 0.9587449157466589, "train_speed(iter/s)": 0.315595 }, { "epoch": 4.1901676829268295, "grad_norm": 1.0264461040496826, "learning_rate": 6.334310709839541e-06, "loss": 0.08122773170471191, "memory(GiB)": 153.57, "step": 10995, "token_acc": 0.9692647308541348, "train_speed(iter/s)": 0.315592 }, { "epoch": 4.192073170731708, "grad_norm": 1.2786307334899902, "learning_rate": 6.3051794291970944e-06, "loss": 0.11238372325897217, "memory(GiB)": 153.57, "step": 11000, "token_acc": 0.9539588975625298, "train_speed(iter/s)": 0.315593 }, { "epoch": 4.193978658536586, "grad_norm": 2.4050345420837402, "learning_rate": 6.276110781378447e-06, "loss": 0.08874131441116333, "memory(GiB)": 153.57, "step": 11005, "token_acc": 0.9761010583817002, "train_speed(iter/s)": 0.315606 }, { "epoch": 4.195884146341464, "grad_norm": 1.9520503282546997, "learning_rate": 6.247104808051058e-06, "loss": 0.08208704590797425, "memory(GiB)": 153.57, "step": 11010, "token_acc": 0.9652622142536978, "train_speed(iter/s)": 0.315621 }, { "epoch": 4.197789634146342, "grad_norm": 3.3606626987457275, "learning_rate": 6.218161550792534e-06, "loss": 0.11756823062896729, "memory(GiB)": 153.57, "step": 11015, "token_acc": 0.9570658036677454, "train_speed(iter/s)": 0.315627 }, { "epoch": 4.199695121951219, "grad_norm": 1.3416368961334229, "learning_rate": 6.1892810510905895e-06, "loss": 0.07744035720825196, "memory(GiB)": 153.57, "step": 11020, "token_acc": 0.9635053463505346, "train_speed(iter/s)": 0.315638 }, { "epoch": 4.201600609756097, "grad_norm": 2.039689064025879, "learning_rate": 6.16046335034296e-06, "loss": 0.10947566032409668, "memory(GiB)": 153.57, "step": 11025, "token_acc": 0.964206805125939, "train_speed(iter/s)": 0.315665 }, { "epoch": 4.203506097560975, "grad_norm": 2.0639781951904297, "learning_rate": 6.13170848985739e-06, "loss": 0.09627519845962525, "memory(GiB)": 153.57, "step": 11030, "token_acc": 0.9592412201999487, "train_speed(iter/s)": 0.315643 }, { "epoch": 4.205411585365853, "grad_norm": 1.3410687446594238, "learning_rate": 6.103016510851517e-06, "loss": 0.12872873544692992, "memory(GiB)": 153.57, "step": 11035, "token_acc": 0.9487645348837209, "train_speed(iter/s)": 0.315661 }, { "epoch": 4.2073170731707314, "grad_norm": 1.1931157112121582, "learning_rate": 6.07438745445289e-06, "loss": 0.10955967903137206, "memory(GiB)": 153.57, "step": 11040, "token_acc": 0.9569513055751588, "train_speed(iter/s)": 0.315509 }, { "epoch": 4.2092225609756095, "grad_norm": 1.3818087577819824, "learning_rate": 6.045821361698811e-06, "loss": 0.08947989344596863, "memory(GiB)": 153.57, "step": 11045, "token_acc": 0.9610141766630316, "train_speed(iter/s)": 0.315521 }, { "epoch": 4.211128048780488, "grad_norm": 1.774148941040039, "learning_rate": 6.0173182735363596e-06, "loss": 0.07843393087387085, "memory(GiB)": 153.57, "step": 11050, "token_acc": 0.9648375066595631, "train_speed(iter/s)": 0.315536 }, { "epoch": 4.213033536585366, "grad_norm": 0.9829747676849365, "learning_rate": 5.988878230822309e-06, "loss": 0.08877206444740296, "memory(GiB)": 153.57, "step": 11055, "token_acc": 0.9631690947781961, "train_speed(iter/s)": 0.315541 }, { "epoch": 4.214939024390244, "grad_norm": 1.773871898651123, "learning_rate": 5.960501274323049e-06, "loss": 0.10888124704360962, "memory(GiB)": 153.57, "step": 11060, "token_acc": 0.9570599613152805, "train_speed(iter/s)": 0.315563 }, { "epoch": 4.216844512195122, "grad_norm": 1.0329890251159668, "learning_rate": 5.932187444714532e-06, "loss": 0.08455199003219604, "memory(GiB)": 153.57, "step": 11065, "token_acc": 0.9692229212718926, "train_speed(iter/s)": 0.315566 }, { "epoch": 4.21875, "grad_norm": 2.8096909523010254, "learning_rate": 5.903936782582253e-06, "loss": 0.14798057079315186, "memory(GiB)": 153.57, "step": 11070, "token_acc": 0.937120660354455, "train_speed(iter/s)": 0.315579 }, { "epoch": 4.220655487804878, "grad_norm": 1.707064151763916, "learning_rate": 5.875749328421121e-06, "loss": 0.08490533828735351, "memory(GiB)": 153.57, "step": 11075, "token_acc": 0.9673961589995533, "train_speed(iter/s)": 0.315594 }, { "epoch": 4.222560975609756, "grad_norm": 1.5915848016738892, "learning_rate": 5.8476251226354805e-06, "loss": 0.117938232421875, "memory(GiB)": 153.57, "step": 11080, "token_acc": 0.9582800102380343, "train_speed(iter/s)": 0.31557 }, { "epoch": 4.224466463414634, "grad_norm": 2.0593507289886475, "learning_rate": 5.8195642055389855e-06, "loss": 0.10842788219451904, "memory(GiB)": 153.57, "step": 11085, "token_acc": 0.9585464333781964, "train_speed(iter/s)": 0.315587 }, { "epoch": 4.226371951219512, "grad_norm": 1.1590673923492432, "learning_rate": 5.791566617354599e-06, "loss": 0.1109940767288208, "memory(GiB)": 153.57, "step": 11090, "token_acc": 0.9594190516873131, "train_speed(iter/s)": 0.315595 }, { "epoch": 4.2282774390243905, "grad_norm": 2.171658754348755, "learning_rate": 5.763632398214464e-06, "loss": 0.11757392883300781, "memory(GiB)": 153.57, "step": 11095, "token_acc": 0.947769148774174, "train_speed(iter/s)": 0.315601 }, { "epoch": 4.2301829268292686, "grad_norm": 1.8307409286499023, "learning_rate": 5.735761588159949e-06, "loss": 0.08276964426040649, "memory(GiB)": 153.57, "step": 11100, "token_acc": 0.9704754601226994, "train_speed(iter/s)": 0.31562 }, { "epoch": 4.232088414634147, "grad_norm": 1.7849177122116089, "learning_rate": 5.707954227141482e-06, "loss": 0.11370728015899659, "memory(GiB)": 153.57, "step": 11105, "token_acc": 0.9552492046659598, "train_speed(iter/s)": 0.315591 }, { "epoch": 4.233993902439025, "grad_norm": 2.1754467487335205, "learning_rate": 5.680210355018573e-06, "loss": 0.11714692115783691, "memory(GiB)": 153.57, "step": 11110, "token_acc": 0.9537732767069586, "train_speed(iter/s)": 0.315586 }, { "epoch": 4.235899390243903, "grad_norm": 1.3922533988952637, "learning_rate": 5.6525300115596925e-06, "loss": 0.14044620990753173, "memory(GiB)": 153.57, "step": 11115, "token_acc": 0.9446248196248196, "train_speed(iter/s)": 0.315587 }, { "epoch": 4.237804878048781, "grad_norm": 1.9667530059814453, "learning_rate": 5.624913236442286e-06, "loss": 0.12016603946685792, "memory(GiB)": 153.57, "step": 11120, "token_acc": 0.9466709760827408, "train_speed(iter/s)": 0.315572 }, { "epoch": 4.239710365853658, "grad_norm": 2.064809799194336, "learning_rate": 5.597360069252644e-06, "loss": 0.07246490716934204, "memory(GiB)": 153.57, "step": 11125, "token_acc": 0.9674382110631621, "train_speed(iter/s)": 0.315561 }, { "epoch": 4.241615853658536, "grad_norm": 1.6288541555404663, "learning_rate": 5.569870549485917e-06, "loss": 0.10115618705749511, "memory(GiB)": 153.57, "step": 11130, "token_acc": 0.9583463827121829, "train_speed(iter/s)": 0.315548 }, { "epoch": 4.243521341463414, "grad_norm": 1.2607572078704834, "learning_rate": 5.542444716545975e-06, "loss": 0.132284677028656, "memory(GiB)": 153.57, "step": 11135, "token_acc": 0.9465618068158903, "train_speed(iter/s)": 0.315505 }, { "epoch": 4.245426829268292, "grad_norm": 1.383402705192566, "learning_rate": 5.515082609745464e-06, "loss": 0.12236417531967163, "memory(GiB)": 153.57, "step": 11140, "token_acc": 0.9532962907343445, "train_speed(iter/s)": 0.315509 }, { "epoch": 4.2473323170731705, "grad_norm": 1.3970201015472412, "learning_rate": 5.487784268305612e-06, "loss": 0.07692657709121704, "memory(GiB)": 153.57, "step": 11145, "token_acc": 0.9672940450478248, "train_speed(iter/s)": 0.315489 }, { "epoch": 4.249237804878049, "grad_norm": 1.683255910873413, "learning_rate": 5.460549731356313e-06, "loss": 0.08858453631401061, "memory(GiB)": 153.57, "step": 11150, "token_acc": 0.9656111678583589, "train_speed(iter/s)": 0.315516 }, { "epoch": 4.251143292682927, "grad_norm": 1.2498304843902588, "learning_rate": 5.433379037935932e-06, "loss": 0.07717564702033997, "memory(GiB)": 153.57, "step": 11155, "token_acc": 0.9665201925088931, "train_speed(iter/s)": 0.31552 }, { "epoch": 4.253048780487805, "grad_norm": 1.58143949508667, "learning_rate": 5.406272226991383e-06, "loss": 0.09872873425483704, "memory(GiB)": 153.57, "step": 11160, "token_acc": 0.9611720728325094, "train_speed(iter/s)": 0.315502 }, { "epoch": 4.254954268292683, "grad_norm": 2.013761520385742, "learning_rate": 5.3792293373779665e-06, "loss": 0.12995802164077758, "memory(GiB)": 153.57, "step": 11165, "token_acc": 0.9467440686521958, "train_speed(iter/s)": 0.315511 }, { "epoch": 4.256859756097561, "grad_norm": 1.2351192235946655, "learning_rate": 5.352250407859388e-06, "loss": 0.1540639281272888, "memory(GiB)": 153.57, "step": 11170, "token_acc": 0.946458763854969, "train_speed(iter/s)": 0.315479 }, { "epoch": 4.258765243902439, "grad_norm": 1.1345454454421997, "learning_rate": 5.325335477107629e-06, "loss": 0.10254055261611938, "memory(GiB)": 153.57, "step": 11175, "token_acc": 0.9594662638469285, "train_speed(iter/s)": 0.31549 }, { "epoch": 4.260670731707317, "grad_norm": 2.0704526901245117, "learning_rate": 5.298484583703006e-06, "loss": 0.0925794005393982, "memory(GiB)": 153.57, "step": 11180, "token_acc": 0.9581229581229581, "train_speed(iter/s)": 0.315503 }, { "epoch": 4.262576219512195, "grad_norm": 1.5397295951843262, "learning_rate": 5.2716977661339706e-06, "loss": 0.09100185632705689, "memory(GiB)": 153.57, "step": 11185, "token_acc": 0.9456572224802601, "train_speed(iter/s)": 0.315522 }, { "epoch": 4.264481707317073, "grad_norm": 0.7259523868560791, "learning_rate": 5.244975062797175e-06, "loss": 0.052532291412353514, "memory(GiB)": 153.57, "step": 11190, "token_acc": 0.9749038656142481, "train_speed(iter/s)": 0.315526 }, { "epoch": 4.266387195121951, "grad_norm": 2.096083879470825, "learning_rate": 5.218316511997329e-06, "loss": 0.16748921871185302, "memory(GiB)": 153.57, "step": 11195, "token_acc": 0.9424635332252836, "train_speed(iter/s)": 0.315529 }, { "epoch": 4.2682926829268295, "grad_norm": 1.5728875398635864, "learning_rate": 5.191722151947226e-06, "loss": 0.09891394376754761, "memory(GiB)": 153.57, "step": 11200, "token_acc": 0.9575062034739454, "train_speed(iter/s)": 0.315503 }, { "epoch": 4.2682926829268295, "eval_loss": 0.16833104193210602, "eval_runtime": 33.0869, "eval_samples_per_second": 3.204, "eval_steps_per_second": 3.204, "eval_token_acc": 0.9265778224792557, "step": 11200 }, { "epoch": 4.270198170731708, "grad_norm": 1.5655537843704224, "learning_rate": 5.165192020767612e-06, "loss": 0.0861035406589508, "memory(GiB)": 153.57, "step": 11205, "token_acc": 0.9325482689747004, "train_speed(iter/s)": 0.315227 }, { "epoch": 4.272103658536586, "grad_norm": 0.9425920844078064, "learning_rate": 5.1387261564872e-06, "loss": 0.07834097146987914, "memory(GiB)": 153.57, "step": 11210, "token_acc": 0.9681103210514975, "train_speed(iter/s)": 0.315218 }, { "epoch": 4.274009146341464, "grad_norm": 1.7313716411590576, "learning_rate": 5.112324597042534e-06, "loss": 0.08889236450195312, "memory(GiB)": 153.57, "step": 11215, "token_acc": 0.9620527647271413, "train_speed(iter/s)": 0.315217 }, { "epoch": 4.275914634146342, "grad_norm": 1.1982163190841675, "learning_rate": 5.085987380278057e-06, "loss": 0.07984168529510498, "memory(GiB)": 153.57, "step": 11220, "token_acc": 0.9663265306122449, "train_speed(iter/s)": 0.315222 }, { "epoch": 4.277820121951219, "grad_norm": 1.166109561920166, "learning_rate": 5.059714543945904e-06, "loss": 0.09516621828079223, "memory(GiB)": 153.57, "step": 11225, "token_acc": 0.956803824483524, "train_speed(iter/s)": 0.31523 }, { "epoch": 4.279725609756097, "grad_norm": 2.1627302169799805, "learning_rate": 5.033506125705978e-06, "loss": 0.1182819128036499, "memory(GiB)": 153.57, "step": 11230, "token_acc": 0.94794231445656, "train_speed(iter/s)": 0.315206 }, { "epoch": 4.281631097560975, "grad_norm": 1.2326204776763916, "learning_rate": 5.007362163125811e-06, "loss": 0.12970235347747802, "memory(GiB)": 153.57, "step": 11235, "token_acc": 0.9534127843986999, "train_speed(iter/s)": 0.315212 }, { "epoch": 4.283536585365853, "grad_norm": 1.0565546751022339, "learning_rate": 4.981282693680583e-06, "loss": 0.07691951394081116, "memory(GiB)": 153.57, "step": 11240, "token_acc": 0.9715458276333789, "train_speed(iter/s)": 0.315226 }, { "epoch": 4.2854420731707314, "grad_norm": 3.1970715522766113, "learning_rate": 4.955267754752974e-06, "loss": 0.13576606512069703, "memory(GiB)": 153.57, "step": 11245, "token_acc": 0.9469434832756632, "train_speed(iter/s)": 0.315241 }, { "epoch": 4.2873475609756095, "grad_norm": 1.003299355506897, "learning_rate": 4.929317383633225e-06, "loss": 0.08763373494148255, "memory(GiB)": 153.57, "step": 11250, "token_acc": 0.9692491060786651, "train_speed(iter/s)": 0.315254 }, { "epoch": 4.289253048780488, "grad_norm": 2.0112717151641846, "learning_rate": 4.903431617518955e-06, "loss": 0.13104579448699952, "memory(GiB)": 153.57, "step": 11255, "token_acc": 0.9611111111111111, "train_speed(iter/s)": 0.315139 }, { "epoch": 4.291158536585366, "grad_norm": 1.9463622570037842, "learning_rate": 4.8776104935152615e-06, "loss": 0.11544160842895508, "memory(GiB)": 153.57, "step": 11260, "token_acc": 0.9514066496163683, "train_speed(iter/s)": 0.315082 }, { "epoch": 4.293064024390244, "grad_norm": 1.1160790920257568, "learning_rate": 4.8518540486345135e-06, "loss": 0.07656705379486084, "memory(GiB)": 153.57, "step": 11265, "token_acc": 0.9713381723431974, "train_speed(iter/s)": 0.315092 }, { "epoch": 4.294969512195122, "grad_norm": 1.1199818849563599, "learning_rate": 4.826162319796401e-06, "loss": 0.11272690296173096, "memory(GiB)": 153.57, "step": 11270, "token_acc": 0.955862977602108, "train_speed(iter/s)": 0.315123 }, { "epoch": 4.296875, "grad_norm": 1.4558731317520142, "learning_rate": 4.800535343827833e-06, "loss": 0.09737681746482849, "memory(GiB)": 153.57, "step": 11275, "token_acc": 0.9622886866059818, "train_speed(iter/s)": 0.315127 }, { "epoch": 4.298780487804878, "grad_norm": 1.235554575920105, "learning_rate": 4.7749731574629196e-06, "loss": 0.08396328687667846, "memory(GiB)": 153.57, "step": 11280, "token_acc": 0.9664556962025317, "train_speed(iter/s)": 0.315148 }, { "epoch": 4.300685975609756, "grad_norm": 1.6709692478179932, "learning_rate": 4.749475797342878e-06, "loss": 0.11301828622817993, "memory(GiB)": 153.57, "step": 11285, "token_acc": 0.9488214058186452, "train_speed(iter/s)": 0.315153 }, { "epoch": 4.302591463414634, "grad_norm": 2.4005722999572754, "learning_rate": 4.724043300016023e-06, "loss": 0.14984216690063476, "memory(GiB)": 153.57, "step": 11290, "token_acc": 0.9340839784189537, "train_speed(iter/s)": 0.315169 }, { "epoch": 4.304496951219512, "grad_norm": 1.5255788564682007, "learning_rate": 4.698675701937677e-06, "loss": 0.07243918180465699, "memory(GiB)": 153.57, "step": 11295, "token_acc": 0.9766364551863042, "train_speed(iter/s)": 0.315176 }, { "epoch": 4.3064024390243905, "grad_norm": 1.6310667991638184, "learning_rate": 4.673373039470147e-06, "loss": 0.09789004921913147, "memory(GiB)": 153.57, "step": 11300, "token_acc": 0.9599388379204893, "train_speed(iter/s)": 0.315193 }, { "epoch": 4.3083079268292686, "grad_norm": 1.1954562664031982, "learning_rate": 4.648135348882654e-06, "loss": 0.08014847040176391, "memory(GiB)": 153.57, "step": 11305, "token_acc": 0.9693229261208931, "train_speed(iter/s)": 0.315195 }, { "epoch": 4.310213414634147, "grad_norm": 2.0469627380371094, "learning_rate": 4.6229626663513e-06, "loss": 0.10127834081649781, "memory(GiB)": 153.57, "step": 11310, "token_acc": 0.964873765093304, "train_speed(iter/s)": 0.315162 }, { "epoch": 4.312118902439025, "grad_norm": 1.7250279188156128, "learning_rate": 4.59785502795898e-06, "loss": 0.0876181423664093, "memory(GiB)": 153.57, "step": 11315, "token_acc": 0.9661691542288557, "train_speed(iter/s)": 0.315178 }, { "epoch": 4.314024390243903, "grad_norm": 0.8684383034706116, "learning_rate": 4.572812469695381e-06, "loss": 0.08131890892982482, "memory(GiB)": 153.57, "step": 11320, "token_acc": 0.9665279536819251, "train_speed(iter/s)": 0.315186 }, { "epoch": 4.315929878048781, "grad_norm": 2.2139551639556885, "learning_rate": 4.547835027456865e-06, "loss": 0.09691190123558044, "memory(GiB)": 153.57, "step": 11325, "token_acc": 0.962078305836001, "train_speed(iter/s)": 0.3152 }, { "epoch": 4.317835365853658, "grad_norm": 2.4840986728668213, "learning_rate": 4.522922737046509e-06, "loss": 0.09278237819671631, "memory(GiB)": 153.57, "step": 11330, "token_acc": 0.9583104772353264, "train_speed(iter/s)": 0.31521 }, { "epoch": 4.319740853658536, "grad_norm": 2.3241026401519775, "learning_rate": 4.498075634173943e-06, "loss": 0.12022689580917359, "memory(GiB)": 153.57, "step": 11335, "token_acc": 0.954772673173054, "train_speed(iter/s)": 0.315222 }, { "epoch": 4.321646341463414, "grad_norm": 1.2160918712615967, "learning_rate": 4.473293754455399e-06, "loss": 0.14611363410949707, "memory(GiB)": 153.57, "step": 11340, "token_acc": 0.945037245822428, "train_speed(iter/s)": 0.315233 }, { "epoch": 4.323551829268292, "grad_norm": 1.5325695276260376, "learning_rate": 4.4485771334136e-06, "loss": 0.08542682528495789, "memory(GiB)": 153.57, "step": 11345, "token_acc": 0.9684637068357999, "train_speed(iter/s)": 0.315154 }, { "epoch": 4.3254573170731705, "grad_norm": 1.782580852508545, "learning_rate": 4.423925806477741e-06, "loss": 0.1250987768173218, "memory(GiB)": 153.57, "step": 11350, "token_acc": 0.9559597893729057, "train_speed(iter/s)": 0.315177 }, { "epoch": 4.327362804878049, "grad_norm": 1.9457018375396729, "learning_rate": 4.399339808983388e-06, "loss": 0.13724640607833863, "memory(GiB)": 153.57, "step": 11355, "token_acc": 0.9428754813863928, "train_speed(iter/s)": 0.315201 }, { "epoch": 4.329268292682927, "grad_norm": 1.1358567476272583, "learning_rate": 4.374819176172501e-06, "loss": 0.07211953401565552, "memory(GiB)": 153.57, "step": 11360, "token_acc": 0.9634825205967491, "train_speed(iter/s)": 0.315212 }, { "epoch": 4.331173780487805, "grad_norm": 1.1153289079666138, "learning_rate": 4.350363943193319e-06, "loss": 0.11402373313903809, "memory(GiB)": 153.57, "step": 11365, "token_acc": 0.9539333805811481, "train_speed(iter/s)": 0.315217 }, { "epoch": 4.333079268292683, "grad_norm": 2.3827121257781982, "learning_rate": 4.325974145100364e-06, "loss": 0.1181564211845398, "memory(GiB)": 153.57, "step": 11370, "token_acc": 0.95, "train_speed(iter/s)": 0.315241 }, { "epoch": 4.334984756097561, "grad_norm": 1.8058412075042725, "learning_rate": 4.301649816854331e-06, "loss": 0.12192789316177369, "memory(GiB)": 153.57, "step": 11375, "token_acc": 0.9491848401439763, "train_speed(iter/s)": 0.315209 }, { "epoch": 4.336890243902439, "grad_norm": 1.45885169506073, "learning_rate": 4.277390993322078e-06, "loss": 0.12566529512405394, "memory(GiB)": 153.57, "step": 11380, "token_acc": 0.945449485109936, "train_speed(iter/s)": 0.315195 }, { "epoch": 4.338795731707317, "grad_norm": 2.361931562423706, "learning_rate": 4.253197709276596e-06, "loss": 0.1268508791923523, "memory(GiB)": 153.57, "step": 11385, "token_acc": 0.9439193446754883, "train_speed(iter/s)": 0.31518 }, { "epoch": 4.340701219512195, "grad_norm": 1.2216675281524658, "learning_rate": 4.22906999939689e-06, "loss": 0.08904828429222107, "memory(GiB)": 153.57, "step": 11390, "token_acc": 0.9606166997404977, "train_speed(iter/s)": 0.315188 }, { "epoch": 4.342606707317073, "grad_norm": 2.2435336112976074, "learning_rate": 4.205007898267988e-06, "loss": 0.11552734375, "memory(GiB)": 153.57, "step": 11395, "token_acc": 0.9572794899043571, "train_speed(iter/s)": 0.315197 }, { "epoch": 4.344512195121951, "grad_norm": 1.504151701927185, "learning_rate": 4.181011440380889e-06, "loss": 0.09209004640579224, "memory(GiB)": 153.57, "step": 11400, "token_acc": 0.955805439330544, "train_speed(iter/s)": 0.31521 }, { "epoch": 4.3464176829268295, "grad_norm": 2.2961456775665283, "learning_rate": 4.157080660132468e-06, "loss": 0.09475575685501099, "memory(GiB)": 153.57, "step": 11405, "token_acc": 0.9582689335394127, "train_speed(iter/s)": 0.315235 }, { "epoch": 4.348323170731708, "grad_norm": 1.4106173515319824, "learning_rate": 4.1332155918254844e-06, "loss": 0.09934595227241516, "memory(GiB)": 153.57, "step": 11410, "token_acc": 0.9597590361445784, "train_speed(iter/s)": 0.315241 }, { "epoch": 4.350228658536586, "grad_norm": 0.9141373634338379, "learning_rate": 4.10941626966847e-06, "loss": 0.08406330943107605, "memory(GiB)": 153.57, "step": 11415, "token_acc": 0.9676442460949952, "train_speed(iter/s)": 0.315238 }, { "epoch": 4.352134146341464, "grad_norm": 2.079514980316162, "learning_rate": 4.085682727775753e-06, "loss": 0.10579637289047242, "memory(GiB)": 153.57, "step": 11420, "token_acc": 0.9683083511777302, "train_speed(iter/s)": 0.315258 }, { "epoch": 4.354039634146342, "grad_norm": 1.4681564569473267, "learning_rate": 4.062015000167363e-06, "loss": 0.11008552312850953, "memory(GiB)": 153.57, "step": 11425, "token_acc": 0.9565530917613124, "train_speed(iter/s)": 0.31526 }, { "epoch": 4.355945121951219, "grad_norm": 2.472832202911377, "learning_rate": 4.038413120768963e-06, "loss": 0.14742614030838014, "memory(GiB)": 153.57, "step": 11430, "token_acc": 0.9471533642059384, "train_speed(iter/s)": 0.315267 }, { "epoch": 4.357850609756097, "grad_norm": 1.8257263898849487, "learning_rate": 4.014877123411858e-06, "loss": 0.09739953279495239, "memory(GiB)": 153.57, "step": 11435, "token_acc": 0.9652509652509652, "train_speed(iter/s)": 0.315294 }, { "epoch": 4.359756097560975, "grad_norm": 1.2778884172439575, "learning_rate": 3.991407041832912e-06, "loss": 0.13109521865844725, "memory(GiB)": 153.57, "step": 11440, "token_acc": 0.9481969613765331, "train_speed(iter/s)": 0.3153 }, { "epoch": 4.361661585365853, "grad_norm": 1.6904910802841187, "learning_rate": 3.96800290967449e-06, "loss": 0.10159043073654175, "memory(GiB)": 153.57, "step": 11445, "token_acc": 0.967685419505548, "train_speed(iter/s)": 0.315298 }, { "epoch": 4.3635670731707314, "grad_norm": 1.6320775747299194, "learning_rate": 3.94466476048444e-06, "loss": 0.11021224260330201, "memory(GiB)": 153.57, "step": 11450, "token_acc": 0.9435673496808868, "train_speed(iter/s)": 0.31532 }, { "epoch": 4.3654725609756095, "grad_norm": 1.2088350057601929, "learning_rate": 3.921392627716009e-06, "loss": 0.08257567882537842, "memory(GiB)": 153.57, "step": 11455, "token_acc": 0.9654796511627907, "train_speed(iter/s)": 0.315334 }, { "epoch": 4.367378048780488, "grad_norm": 1.9846564531326294, "learning_rate": 3.898186544727833e-06, "loss": 0.10808141231536865, "memory(GiB)": 153.57, "step": 11460, "token_acc": 0.9624016321772079, "train_speed(iter/s)": 0.315347 }, { "epoch": 4.369283536585366, "grad_norm": 0.9752321243286133, "learning_rate": 3.875046544783878e-06, "loss": 0.08033028244972229, "memory(GiB)": 153.57, "step": 11465, "token_acc": 0.9655112349764849, "train_speed(iter/s)": 0.315356 }, { "epoch": 4.371189024390244, "grad_norm": 1.4768760204315186, "learning_rate": 3.85197266105336e-06, "loss": 0.10988793373107911, "memory(GiB)": 153.57, "step": 11470, "token_acc": 0.9575831553249924, "train_speed(iter/s)": 0.31533 }, { "epoch": 4.373094512195122, "grad_norm": 2.699922561645508, "learning_rate": 3.828964926610746e-06, "loss": 0.11942515373229981, "memory(GiB)": 153.57, "step": 11475, "token_acc": 0.95575, "train_speed(iter/s)": 0.315341 }, { "epoch": 4.375, "grad_norm": 2.047456741333008, "learning_rate": 3.8060233744356633e-06, "loss": 0.07574573755264283, "memory(GiB)": 153.57, "step": 11480, "token_acc": 0.9732410611303345, "train_speed(iter/s)": 0.315361 }, { "epoch": 4.376905487804878, "grad_norm": 1.5607692003250122, "learning_rate": 3.7831480374128925e-06, "loss": 0.1121975302696228, "memory(GiB)": 153.57, "step": 11485, "token_acc": 0.952437173216394, "train_speed(iter/s)": 0.315368 }, { "epoch": 4.378810975609756, "grad_norm": 2.965771198272705, "learning_rate": 3.7603389483322928e-06, "loss": 0.08829761147499085, "memory(GiB)": 153.57, "step": 11490, "token_acc": 0.9565639407910701, "train_speed(iter/s)": 0.315388 }, { "epoch": 4.380716463414634, "grad_norm": 2.6270203590393066, "learning_rate": 3.737596139888755e-06, "loss": 0.10404912233352662, "memory(GiB)": 153.57, "step": 11495, "token_acc": 0.9603880355699272, "train_speed(iter/s)": 0.31541 }, { "epoch": 4.382621951219512, "grad_norm": 1.8841359615325928, "learning_rate": 3.7149196446821744e-06, "loss": 0.09856808781623841, "memory(GiB)": 153.57, "step": 11500, "token_acc": 0.9609101516919487, "train_speed(iter/s)": 0.315419 }, { "epoch": 4.3845274390243905, "grad_norm": 2.1960415840148926, "learning_rate": 3.692309495217372e-06, "loss": 0.0929335594177246, "memory(GiB)": 153.57, "step": 11505, "token_acc": 0.9641197248559211, "train_speed(iter/s)": 0.315413 }, { "epoch": 4.3864329268292686, "grad_norm": 1.5058540105819702, "learning_rate": 3.669765723904095e-06, "loss": 0.09239311814308167, "memory(GiB)": 153.57, "step": 11510, "token_acc": 0.9587133941430629, "train_speed(iter/s)": 0.315423 }, { "epoch": 4.388338414634147, "grad_norm": 2.0651280879974365, "learning_rate": 3.6472883630569334e-06, "loss": 0.10004969835281372, "memory(GiB)": 153.57, "step": 11515, "token_acc": 0.9612263300270514, "train_speed(iter/s)": 0.315439 }, { "epoch": 4.390243902439025, "grad_norm": 0.729622483253479, "learning_rate": 3.6248774448952695e-06, "loss": 0.10728403329849243, "memory(GiB)": 153.57, "step": 11520, "token_acc": 0.9518222666000998, "train_speed(iter/s)": 0.315376 }, { "epoch": 4.392149390243903, "grad_norm": 1.9619925022125244, "learning_rate": 3.6025330015432624e-06, "loss": 0.05757721066474915, "memory(GiB)": 153.57, "step": 11525, "token_acc": 0.9747814391392065, "train_speed(iter/s)": 0.315397 }, { "epoch": 4.394054878048781, "grad_norm": 1.2542332410812378, "learning_rate": 3.5802550650297696e-06, "loss": 0.1015866756439209, "memory(GiB)": 153.57, "step": 11530, "token_acc": 0.9568449682683591, "train_speed(iter/s)": 0.315411 }, { "epoch": 4.395960365853658, "grad_norm": 2.4002914428710938, "learning_rate": 3.5580436672883357e-06, "loss": 0.11484479904174805, "memory(GiB)": 153.57, "step": 11535, "token_acc": 0.9588799192734612, "train_speed(iter/s)": 0.315394 }, { "epoch": 4.397865853658536, "grad_norm": 0.9319559931755066, "learning_rate": 3.5358988401571203e-06, "loss": 0.09355655908584595, "memory(GiB)": 153.57, "step": 11540, "token_acc": 0.9563139931740614, "train_speed(iter/s)": 0.315413 }, { "epoch": 4.399771341463414, "grad_norm": 2.7637648582458496, "learning_rate": 3.513820615378843e-06, "loss": 0.09541640281677247, "memory(GiB)": 153.57, "step": 11545, "token_acc": 0.9680245291283399, "train_speed(iter/s)": 0.315417 }, { "epoch": 4.401676829268292, "grad_norm": 1.0564733743667603, "learning_rate": 3.4918090246007783e-06, "loss": 0.0913490355014801, "memory(GiB)": 153.57, "step": 11550, "token_acc": 0.9644166157605376, "train_speed(iter/s)": 0.315413 }, { "epoch": 4.4035823170731705, "grad_norm": 1.729576826095581, "learning_rate": 3.469864099374681e-06, "loss": 0.10231225490570069, "memory(GiB)": 153.57, "step": 11555, "token_acc": 0.9526539278131635, "train_speed(iter/s)": 0.315415 }, { "epoch": 4.405487804878049, "grad_norm": 2.2978718280792236, "learning_rate": 3.4479858711567337e-06, "loss": 0.13759269714355468, "memory(GiB)": 153.57, "step": 11560, "token_acc": 0.957688338493292, "train_speed(iter/s)": 0.315426 }, { "epoch": 4.407393292682927, "grad_norm": 1.531053066253662, "learning_rate": 3.4261743713075333e-06, "loss": 0.07079092860221863, "memory(GiB)": 153.57, "step": 11565, "token_acc": 0.9633099141295862, "train_speed(iter/s)": 0.315448 }, { "epoch": 4.409298780487805, "grad_norm": 2.2789950370788574, "learning_rate": 3.4044296310919988e-06, "loss": 0.0825621485710144, "memory(GiB)": 153.57, "step": 11570, "token_acc": 0.9666274047899489, "train_speed(iter/s)": 0.315457 }, { "epoch": 4.411204268292683, "grad_norm": 1.928680181503296, "learning_rate": 3.3827516816793916e-06, "loss": 0.0770064651966095, "memory(GiB)": 153.57, "step": 11575, "token_acc": 0.9656756103456611, "train_speed(iter/s)": 0.315462 }, { "epoch": 4.413109756097561, "grad_norm": 2.05332612991333, "learning_rate": 3.3611405541432107e-06, "loss": 0.11589511632919311, "memory(GiB)": 153.57, "step": 11580, "token_acc": 0.9487870619946092, "train_speed(iter/s)": 0.315478 }, { "epoch": 4.415015243902439, "grad_norm": 1.8572301864624023, "learning_rate": 3.3395962794611712e-06, "loss": 0.08447416424751282, "memory(GiB)": 153.57, "step": 11585, "token_acc": 0.9676279069767442, "train_speed(iter/s)": 0.315478 }, { "epoch": 4.416920731707317, "grad_norm": 1.716090202331543, "learning_rate": 3.3181188885151705e-06, "loss": 0.08244004249572753, "memory(GiB)": 153.57, "step": 11590, "token_acc": 0.9618686868686869, "train_speed(iter/s)": 0.315439 }, { "epoch": 4.418826219512195, "grad_norm": 1.3429962396621704, "learning_rate": 3.2967084120912315e-06, "loss": 0.07596399784088134, "memory(GiB)": 153.57, "step": 11595, "token_acc": 0.9664073550212164, "train_speed(iter/s)": 0.315446 }, { "epoch": 4.420731707317073, "grad_norm": 1.1160967350006104, "learning_rate": 3.2753648808794503e-06, "loss": 0.10423895120620727, "memory(GiB)": 153.57, "step": 11600, "token_acc": 0.9598970329223683, "train_speed(iter/s)": 0.31544 }, { "epoch": 4.422637195121951, "grad_norm": 1.8272711038589478, "learning_rate": 3.2540883254739863e-06, "loss": 0.09047324061393738, "memory(GiB)": 153.57, "step": 11605, "token_acc": 0.9634675849678096, "train_speed(iter/s)": 0.315423 }, { "epoch": 4.4245426829268295, "grad_norm": 1.4035717248916626, "learning_rate": 3.2328787763729606e-06, "loss": 0.09922884702682495, "memory(GiB)": 153.57, "step": 11610, "token_acc": 0.9615474248426941, "train_speed(iter/s)": 0.315429 }, { "epoch": 4.426448170731708, "grad_norm": 1.387790560722351, "learning_rate": 3.2117362639784764e-06, "loss": 0.09474560022354125, "memory(GiB)": 153.57, "step": 11615, "token_acc": 0.9584476675813407, "train_speed(iter/s)": 0.315449 }, { "epoch": 4.428353658536586, "grad_norm": 0.9820630550384521, "learning_rate": 3.190660818596525e-06, "loss": 0.10609523057937623, "memory(GiB)": 153.57, "step": 11620, "token_acc": 0.9623128549303046, "train_speed(iter/s)": 0.315423 }, { "epoch": 4.430259146341464, "grad_norm": 0.9764551520347595, "learning_rate": 3.1696524704369844e-06, "loss": 0.07531336545944214, "memory(GiB)": 153.57, "step": 11625, "token_acc": 0.9646874182579126, "train_speed(iter/s)": 0.315439 }, { "epoch": 4.432164634146342, "grad_norm": 1.60063898563385, "learning_rate": 3.1487112496135285e-06, "loss": 0.10207695960998535, "memory(GiB)": 153.57, "step": 11630, "token_acc": 0.9629411764705882, "train_speed(iter/s)": 0.315454 }, { "epoch": 4.434070121951219, "grad_norm": 1.9534785747528076, "learning_rate": 3.1278371861436394e-06, "loss": 0.10273884534835816, "memory(GiB)": 153.57, "step": 11635, "token_acc": 0.9559952242879072, "train_speed(iter/s)": 0.315417 }, { "epoch": 4.435975609756097, "grad_norm": 1.2523199319839478, "learning_rate": 3.1070303099485056e-06, "loss": 0.07558158040046692, "memory(GiB)": 153.57, "step": 11640, "token_acc": 0.9717160728855044, "train_speed(iter/s)": 0.315433 }, { "epoch": 4.437881097560975, "grad_norm": 1.0347689390182495, "learning_rate": 3.086290650853035e-06, "loss": 0.09405055046081542, "memory(GiB)": 153.57, "step": 11645, "token_acc": 0.9610468654899574, "train_speed(iter/s)": 0.31543 }, { "epoch": 4.439786585365853, "grad_norm": 1.1049449443817139, "learning_rate": 3.065618238585749e-06, "loss": 0.08586655259132385, "memory(GiB)": 153.57, "step": 11650, "token_acc": 0.9622520793346129, "train_speed(iter/s)": 0.315423 }, { "epoch": 4.4416920731707314, "grad_norm": 1.2127302885055542, "learning_rate": 3.0450131027788264e-06, "loss": 0.12303690910339356, "memory(GiB)": 153.57, "step": 11655, "token_acc": 0.9529805233130041, "train_speed(iter/s)": 0.315436 }, { "epoch": 4.4435975609756095, "grad_norm": 1.2596060037612915, "learning_rate": 3.024475272967964e-06, "loss": 0.09550251960754394, "memory(GiB)": 153.57, "step": 11660, "token_acc": 0.9631129104813315, "train_speed(iter/s)": 0.315444 }, { "epoch": 4.445503048780488, "grad_norm": 2.3452913761138916, "learning_rate": 3.0040047785924065e-06, "loss": 0.07995930314064026, "memory(GiB)": 153.57, "step": 11665, "token_acc": 0.9687674288901282, "train_speed(iter/s)": 0.315452 }, { "epoch": 4.447408536585366, "grad_norm": 1.8520549535751343, "learning_rate": 2.9836016489948717e-06, "loss": 0.0993906557559967, "memory(GiB)": 153.57, "step": 11670, "token_acc": 0.9594594594594594, "train_speed(iter/s)": 0.315468 }, { "epoch": 4.449314024390244, "grad_norm": 1.6466741561889648, "learning_rate": 2.9632659134215314e-06, "loss": 0.09301068186759949, "memory(GiB)": 153.57, "step": 11675, "token_acc": 0.9616225889838934, "train_speed(iter/s)": 0.315482 }, { "epoch": 4.451219512195122, "grad_norm": 1.1216379404067993, "learning_rate": 2.942997601021924e-06, "loss": 0.07941349148750305, "memory(GiB)": 153.57, "step": 11680, "token_acc": 0.9644711669855152, "train_speed(iter/s)": 0.315495 }, { "epoch": 4.453125, "grad_norm": 1.3256313800811768, "learning_rate": 2.9227967408489653e-06, "loss": 0.08209016919136047, "memory(GiB)": 153.57, "step": 11685, "token_acc": 0.9670474516695958, "train_speed(iter/s)": 0.315493 }, { "epoch": 4.455030487804878, "grad_norm": 2.99845027923584, "learning_rate": 2.9026633618588704e-06, "loss": 0.11259549856185913, "memory(GiB)": 153.57, "step": 11690, "token_acc": 0.9625090122566691, "train_speed(iter/s)": 0.315509 }, { "epoch": 4.456935975609756, "grad_norm": 1.2355519533157349, "learning_rate": 2.8825974929111486e-06, "loss": 0.10706870555877686, "memory(GiB)": 153.57, "step": 11695, "token_acc": 0.9589400811649559, "train_speed(iter/s)": 0.315477 }, { "epoch": 4.458841463414634, "grad_norm": 1.1041613817214966, "learning_rate": 2.862599162768498e-06, "loss": 0.10314202308654785, "memory(GiB)": 153.57, "step": 11700, "token_acc": 0.9638590203106332, "train_speed(iter/s)": 0.315494 }, { "epoch": 4.460746951219512, "grad_norm": 1.6761411428451538, "learning_rate": 2.8426684000968595e-06, "loss": 0.10314077138900757, "memory(GiB)": 153.57, "step": 11705, "token_acc": 0.9624643002855977, "train_speed(iter/s)": 0.315461 }, { "epoch": 4.4626524390243905, "grad_norm": 0.9481696486473083, "learning_rate": 2.822805233465259e-06, "loss": 0.061567938327789305, "memory(GiB)": 153.57, "step": 11710, "token_acc": 0.9674737099535339, "train_speed(iter/s)": 0.315439 }, { "epoch": 4.4645579268292686, "grad_norm": 1.0908455848693848, "learning_rate": 2.803009691345898e-06, "loss": 0.10454311370849609, "memory(GiB)": 153.57, "step": 11715, "token_acc": 0.9552607913669064, "train_speed(iter/s)": 0.315448 }, { "epoch": 4.466463414634147, "grad_norm": 2.114470958709717, "learning_rate": 2.783281802113985e-06, "loss": 0.08637694716453552, "memory(GiB)": 153.57, "step": 11720, "token_acc": 0.9668936370039916, "train_speed(iter/s)": 0.315428 }, { "epoch": 4.468368902439025, "grad_norm": 1.768939733505249, "learning_rate": 2.7636215940477992e-06, "loss": 0.08350073099136353, "memory(GiB)": 153.57, "step": 11725, "token_acc": 0.9656265236470014, "train_speed(iter/s)": 0.315439 }, { "epoch": 4.470274390243903, "grad_norm": 1.9058854579925537, "learning_rate": 2.7440290953285595e-06, "loss": 0.10799484252929688, "memory(GiB)": 153.57, "step": 11730, "token_acc": 0.9624644549763033, "train_speed(iter/s)": 0.315437 }, { "epoch": 4.472179878048781, "grad_norm": 1.2388858795166016, "learning_rate": 2.724504334040473e-06, "loss": 0.10833930969238281, "memory(GiB)": 153.57, "step": 11735, "token_acc": 0.9633468149646107, "train_speed(iter/s)": 0.315416 }, { "epoch": 4.474085365853658, "grad_norm": 1.8217689990997314, "learning_rate": 2.7050473381706186e-06, "loss": 0.11827579736709595, "memory(GiB)": 153.57, "step": 11740, "token_acc": 0.9586490582377137, "train_speed(iter/s)": 0.315419 }, { "epoch": 4.475990853658536, "grad_norm": 1.5702729225158691, "learning_rate": 2.6856581356089593e-06, "loss": 0.10326893329620361, "memory(GiB)": 153.57, "step": 11745, "token_acc": 0.953587396210347, "train_speed(iter/s)": 0.315433 }, { "epoch": 4.477896341463414, "grad_norm": 1.6617428064346313, "learning_rate": 2.6663367541482574e-06, "loss": 0.10247838497161865, "memory(GiB)": 153.57, "step": 11750, "token_acc": 0.959066100667071, "train_speed(iter/s)": 0.315446 }, { "epoch": 4.479801829268292, "grad_norm": 2.11826753616333, "learning_rate": 2.6470832214841035e-06, "loss": 0.11843993663787841, "memory(GiB)": 153.57, "step": 11755, "token_acc": 0.9623287671232876, "train_speed(iter/s)": 0.315418 }, { "epoch": 4.4817073170731705, "grad_norm": 1.6463037729263306, "learning_rate": 2.6278975652147875e-06, "loss": 0.10094796419143677, "memory(GiB)": 153.57, "step": 11760, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.315437 }, { "epoch": 4.483612804878049, "grad_norm": 1.5844002962112427, "learning_rate": 2.608779812841333e-06, "loss": 0.09594500660896302, "memory(GiB)": 153.57, "step": 11765, "token_acc": 0.9631131711085361, "train_speed(iter/s)": 0.315446 }, { "epoch": 4.485518292682927, "grad_norm": 1.7924340963363647, "learning_rate": 2.5897299917674034e-06, "loss": 0.09116992354393005, "memory(GiB)": 153.57, "step": 11770, "token_acc": 0.9642765024503482, "train_speed(iter/s)": 0.31544 }, { "epoch": 4.487423780487805, "grad_norm": 1.4026494026184082, "learning_rate": 2.5707481292993218e-06, "loss": 0.08246124386787415, "memory(GiB)": 153.57, "step": 11775, "token_acc": 0.9664565968692824, "train_speed(iter/s)": 0.315449 }, { "epoch": 4.489329268292683, "grad_norm": 1.8013426065444946, "learning_rate": 2.5518342526459626e-06, "loss": 0.07896456122398376, "memory(GiB)": 153.57, "step": 11780, "token_acc": 0.9672711189739054, "train_speed(iter/s)": 0.315456 }, { "epoch": 4.491234756097561, "grad_norm": 3.528515338897705, "learning_rate": 2.5329883889187834e-06, "loss": 0.09032862782478332, "memory(GiB)": 153.57, "step": 11785, "token_acc": 0.956343792633015, "train_speed(iter/s)": 0.315466 }, { "epoch": 4.493140243902439, "grad_norm": 2.6173315048217773, "learning_rate": 2.5142105651317082e-06, "loss": 0.10720926523208618, "memory(GiB)": 153.57, "step": 11790, "token_acc": 0.9590687194892978, "train_speed(iter/s)": 0.315445 }, { "epoch": 4.495045731707317, "grad_norm": 1.1434836387634277, "learning_rate": 2.49550080820119e-06, "loss": 0.09843702912330628, "memory(GiB)": 153.57, "step": 11795, "token_acc": 0.960838861545046, "train_speed(iter/s)": 0.315456 }, { "epoch": 4.496951219512195, "grad_norm": 1.8562114238739014, "learning_rate": 2.476859144946053e-06, "loss": 0.09204763770103455, "memory(GiB)": 153.57, "step": 11800, "token_acc": 0.9627450980392157, "train_speed(iter/s)": 0.31546 }, { "epoch": 4.498856707317073, "grad_norm": 2.190268039703369, "learning_rate": 2.458285602087557e-06, "loss": 0.08053414225578308, "memory(GiB)": 153.57, "step": 11805, "token_acc": 0.9696859021183345, "train_speed(iter/s)": 0.315486 }, { "epoch": 4.500762195121951, "grad_norm": 2.073289394378662, "learning_rate": 2.4397802062492923e-06, "loss": 0.11775530576705932, "memory(GiB)": 153.57, "step": 11810, "token_acc": 0.9539007092198581, "train_speed(iter/s)": 0.315462 }, { "epoch": 4.5026676829268295, "grad_norm": 1.707442045211792, "learning_rate": 2.4213429839571923e-06, "loss": 0.08084885478019714, "memory(GiB)": 153.57, "step": 11815, "token_acc": 0.9667562122229685, "train_speed(iter/s)": 0.315414 }, { "epoch": 4.504573170731708, "grad_norm": 2.486074686050415, "learning_rate": 2.402973961639432e-06, "loss": 0.1158403754234314, "memory(GiB)": 153.57, "step": 11820, "token_acc": 0.9597022171872471, "train_speed(iter/s)": 0.315422 }, { "epoch": 4.506478658536586, "grad_norm": 1.8762012720108032, "learning_rate": 2.3846731656264666e-06, "loss": 0.08910753130912781, "memory(GiB)": 153.57, "step": 11825, "token_acc": 0.9656338028169014, "train_speed(iter/s)": 0.315395 }, { "epoch": 4.508384146341464, "grad_norm": 1.5310139656066895, "learning_rate": 2.3664406221509117e-06, "loss": 0.09706774950027466, "memory(GiB)": 153.57, "step": 11830, "token_acc": 0.96875, "train_speed(iter/s)": 0.3154 }, { "epoch": 4.510289634146341, "grad_norm": 2.2540476322174072, "learning_rate": 2.348276357347612e-06, "loss": 0.1254178285598755, "memory(GiB)": 153.57, "step": 11835, "token_acc": 0.950591985428051, "train_speed(iter/s)": 0.31541 }, { "epoch": 4.512195121951219, "grad_norm": 1.7377005815505981, "learning_rate": 2.330180397253473e-06, "loss": 0.08931806087493896, "memory(GiB)": 153.57, "step": 11840, "token_acc": 0.9627434377646062, "train_speed(iter/s)": 0.315427 }, { "epoch": 4.514100609756097, "grad_norm": 2.287041187286377, "learning_rate": 2.312152767807535e-06, "loss": 0.0994877815246582, "memory(GiB)": 153.57, "step": 11845, "token_acc": 0.9605026929982047, "train_speed(iter/s)": 0.315454 }, { "epoch": 4.516006097560975, "grad_norm": 1.839990258216858, "learning_rate": 2.2941934948508657e-06, "loss": 0.09452824592590332, "memory(GiB)": 153.57, "step": 11850, "token_acc": 0.9655647382920111, "train_speed(iter/s)": 0.315461 }, { "epoch": 4.517911585365853, "grad_norm": 1.8815181255340576, "learning_rate": 2.2763026041265732e-06, "loss": 0.048113787174224855, "memory(GiB)": 153.57, "step": 11855, "token_acc": 0.9750830564784053, "train_speed(iter/s)": 0.315485 }, { "epoch": 4.5198170731707314, "grad_norm": 1.3757603168487549, "learning_rate": 2.258480121279727e-06, "loss": 0.09918582439422607, "memory(GiB)": 153.57, "step": 11860, "token_acc": 0.9572446555819477, "train_speed(iter/s)": 0.315496 }, { "epoch": 4.5217225609756095, "grad_norm": 1.9995548725128174, "learning_rate": 2.240726071857352e-06, "loss": 0.12769948244094848, "memory(GiB)": 153.57, "step": 11865, "token_acc": 0.9421821572894, "train_speed(iter/s)": 0.315512 }, { "epoch": 4.523628048780488, "grad_norm": 1.7183359861373901, "learning_rate": 2.2230404813083585e-06, "loss": 0.08501004576683044, "memory(GiB)": 153.57, "step": 11870, "token_acc": 0.9699551569506727, "train_speed(iter/s)": 0.315517 }, { "epoch": 4.525533536585366, "grad_norm": 1.9304075241088867, "learning_rate": 2.2054233749835674e-06, "loss": 0.07722177505493164, "memory(GiB)": 153.57, "step": 11875, "token_acc": 0.9651135005973716, "train_speed(iter/s)": 0.315537 }, { "epoch": 4.527439024390244, "grad_norm": 2.3752598762512207, "learning_rate": 2.1878747781355846e-06, "loss": 0.11850775480270385, "memory(GiB)": 153.57, "step": 11880, "token_acc": 0.9499904561939302, "train_speed(iter/s)": 0.315513 }, { "epoch": 4.529344512195122, "grad_norm": 0.9055870175361633, "learning_rate": 2.1703947159188554e-06, "loss": 0.0918830931186676, "memory(GiB)": 153.57, "step": 11885, "token_acc": 0.9495515695067265, "train_speed(iter/s)": 0.315532 }, { "epoch": 4.53125, "grad_norm": 1.3198498487472534, "learning_rate": 2.152983213389559e-06, "loss": 0.07481920719146729, "memory(GiB)": 153.57, "step": 11890, "token_acc": 0.9690612417885145, "train_speed(iter/s)": 0.315537 }, { "epoch": 4.533155487804878, "grad_norm": 1.1908214092254639, "learning_rate": 2.135640295505609e-06, "loss": 0.0843079149723053, "memory(GiB)": 153.57, "step": 11895, "token_acc": 0.9651993980436419, "train_speed(iter/s)": 0.315545 }, { "epoch": 4.535060975609756, "grad_norm": 1.8239598274230957, "learning_rate": 2.118365987126608e-06, "loss": 0.07904760837554932, "memory(GiB)": 153.57, "step": 11900, "token_acc": 0.9768181818181818, "train_speed(iter/s)": 0.315557 }, { "epoch": 4.536966463414634, "grad_norm": 1.0276819467544556, "learning_rate": 2.1011603130138225e-06, "loss": 0.09517342448234559, "memory(GiB)": 153.57, "step": 11905, "token_acc": 0.9590812290842714, "train_speed(iter/s)": 0.315558 }, { "epoch": 4.538871951219512, "grad_norm": 1.040474772453308, "learning_rate": 2.0840232978301177e-06, "loss": 0.11286966800689698, "memory(GiB)": 153.57, "step": 11910, "token_acc": 0.9479301423027167, "train_speed(iter/s)": 0.315577 }, { "epoch": 4.5407774390243905, "grad_norm": 1.1110634803771973, "learning_rate": 2.0669549661399554e-06, "loss": 0.06359522342681885, "memory(GiB)": 153.57, "step": 11915, "token_acc": 0.9655038004287663, "train_speed(iter/s)": 0.315584 }, { "epoch": 4.5426829268292686, "grad_norm": 1.6514774560928345, "learning_rate": 2.049955342409349e-06, "loss": 0.12485463619232177, "memory(GiB)": 153.57, "step": 11920, "token_acc": 0.9485514485514486, "train_speed(iter/s)": 0.315599 }, { "epoch": 4.544588414634147, "grad_norm": 2.151376962661743, "learning_rate": 2.033024451005822e-06, "loss": 0.12008453607559204, "memory(GiB)": 153.57, "step": 11925, "token_acc": 0.9506369426751592, "train_speed(iter/s)": 0.315625 }, { "epoch": 4.546493902439025, "grad_norm": 0.9565483927726746, "learning_rate": 2.016162316198361e-06, "loss": 0.08634468913078308, "memory(GiB)": 153.57, "step": 11930, "token_acc": 0.9634974533106961, "train_speed(iter/s)": 0.31559 }, { "epoch": 4.548399390243903, "grad_norm": 2.49566650390625, "learning_rate": 1.999368962157416e-06, "loss": 0.09931328296661376, "memory(GiB)": 153.57, "step": 11935, "token_acc": 0.9626318202659331, "train_speed(iter/s)": 0.315594 }, { "epoch": 4.550304878048781, "grad_norm": 1.5352925062179565, "learning_rate": 1.9826444129548315e-06, "loss": 0.09284768104553223, "memory(GiB)": 153.57, "step": 11940, "token_acc": 0.9620535714285714, "train_speed(iter/s)": 0.315598 }, { "epoch": 4.552210365853659, "grad_norm": 1.0938721895217896, "learning_rate": 1.9659886925638414e-06, "loss": 0.10229395627975464, "memory(GiB)": 153.57, "step": 11945, "token_acc": 0.964869775893398, "train_speed(iter/s)": 0.315574 }, { "epoch": 4.554115853658536, "grad_norm": 0.4305574893951416, "learning_rate": 1.9494018248589983e-06, "loss": 0.07107744812965393, "memory(GiB)": 153.57, "step": 11950, "token_acc": 0.9671080734728749, "train_speed(iter/s)": 0.315581 }, { "epoch": 4.556021341463414, "grad_norm": 1.7373857498168945, "learning_rate": 1.932883833616178e-06, "loss": 0.1319974899291992, "memory(GiB)": 153.57, "step": 11955, "token_acc": 0.9454863715928983, "train_speed(iter/s)": 0.315585 }, { "epoch": 4.557926829268292, "grad_norm": 2.0825612545013428, "learning_rate": 1.916434742512524e-06, "loss": 0.09115412831306458, "memory(GiB)": 153.57, "step": 11960, "token_acc": 0.9661331477615402, "train_speed(iter/s)": 0.315595 }, { "epoch": 4.5598323170731705, "grad_norm": 1.2071946859359741, "learning_rate": 1.9000545751264143e-06, "loss": 0.13476415872573852, "memory(GiB)": 153.57, "step": 11965, "token_acc": 0.9489742240925828, "train_speed(iter/s)": 0.315607 }, { "epoch": 4.561737804878049, "grad_norm": 1.8346060514450073, "learning_rate": 1.8837433549374228e-06, "loss": 0.09322500824928284, "memory(GiB)": 153.57, "step": 11970, "token_acc": 0.958820093457944, "train_speed(iter/s)": 0.315622 }, { "epoch": 4.563643292682927, "grad_norm": 1.2597503662109375, "learning_rate": 1.8675011053263192e-06, "loss": 0.10703418254852295, "memory(GiB)": 153.57, "step": 11975, "token_acc": 0.9556903842618129, "train_speed(iter/s)": 0.315624 }, { "epoch": 4.565548780487805, "grad_norm": 2.2413346767425537, "learning_rate": 1.851327849574974e-06, "loss": 0.1398315191268921, "memory(GiB)": 153.57, "step": 11980, "token_acc": 0.9372093023255814, "train_speed(iter/s)": 0.315601 }, { "epoch": 4.567454268292683, "grad_norm": 1.2930039167404175, "learning_rate": 1.8352236108663935e-06, "loss": 0.08489471673965454, "memory(GiB)": 153.57, "step": 11985, "token_acc": 0.9655073296924404, "train_speed(iter/s)": 0.315617 }, { "epoch": 4.569359756097561, "grad_norm": 1.395209789276123, "learning_rate": 1.8191884122846225e-06, "loss": 0.08589162230491638, "memory(GiB)": 153.57, "step": 11990, "token_acc": 0.9720325203252033, "train_speed(iter/s)": 0.315632 }, { "epoch": 4.571265243902439, "grad_norm": 1.7310959100723267, "learning_rate": 1.8032222768147755e-06, "loss": 0.10275592803955078, "memory(GiB)": 153.57, "step": 11995, "token_acc": 0.9617563739376771, "train_speed(iter/s)": 0.315644 }, { "epoch": 4.573170731707317, "grad_norm": 1.3834775686264038, "learning_rate": 1.7873252273429509e-06, "loss": 0.11970033645629882, "memory(GiB)": 153.57, "step": 12000, "token_acc": 0.9545739348370927, "train_speed(iter/s)": 0.315639 }, { "epoch": 4.573170731707317, "eval_loss": 0.16845007240772247, "eval_runtime": 32.7793, "eval_samples_per_second": 3.234, "eval_steps_per_second": 3.234, "eval_token_acc": 0.9275836057329645, "step": 12000 }, { "epoch": 4.575076219512195, "grad_norm": 1.3846936225891113, "learning_rate": 1.7714972866562208e-06, "loss": 0.09804007411003113, "memory(GiB)": 153.57, "step": 12005, "token_acc": 0.9330632157770703, "train_speed(iter/s)": 0.315376 }, { "epoch": 4.576981707317073, "grad_norm": 1.7934681177139282, "learning_rate": 1.7557384774425977e-06, "loss": 0.11483783721923828, "memory(GiB)": 153.57, "step": 12010, "token_acc": 0.9534763948497854, "train_speed(iter/s)": 0.31539 }, { "epoch": 4.578887195121951, "grad_norm": 2.2372188568115234, "learning_rate": 1.7400488222910072e-06, "loss": 0.1248164176940918, "memory(GiB)": 153.57, "step": 12015, "token_acc": 0.9565550728341427, "train_speed(iter/s)": 0.315405 }, { "epoch": 4.5807926829268295, "grad_norm": 2.0462944507598877, "learning_rate": 1.7244283436912256e-06, "loss": 0.08604941368103028, "memory(GiB)": 153.57, "step": 12020, "token_acc": 0.9632011432654519, "train_speed(iter/s)": 0.315426 }, { "epoch": 4.582698170731708, "grad_norm": 1.0216985940933228, "learning_rate": 1.708877064033898e-06, "loss": 0.07491138577461243, "memory(GiB)": 153.57, "step": 12025, "token_acc": 0.9707621550591328, "train_speed(iter/s)": 0.315437 }, { "epoch": 4.584603658536586, "grad_norm": 1.5628917217254639, "learning_rate": 1.6933950056104598e-06, "loss": 0.09694406390190125, "memory(GiB)": 153.57, "step": 12030, "token_acc": 0.9612556561085973, "train_speed(iter/s)": 0.315446 }, { "epoch": 4.586509146341464, "grad_norm": 1.191809058189392, "learning_rate": 1.677982190613131e-06, "loss": 0.09949909448623658, "memory(GiB)": 153.57, "step": 12035, "token_acc": 0.9596698113207547, "train_speed(iter/s)": 0.315429 }, { "epoch": 4.588414634146341, "grad_norm": 1.761447787284851, "learning_rate": 1.6626386411348782e-06, "loss": 0.10120022296905518, "memory(GiB)": 153.57, "step": 12040, "token_acc": 0.9532334672999635, "train_speed(iter/s)": 0.315412 }, { "epoch": 4.590320121951219, "grad_norm": 1.7036997079849243, "learning_rate": 1.647364379169386e-06, "loss": 0.11549882888793946, "memory(GiB)": 153.57, "step": 12045, "token_acc": 0.947032967032967, "train_speed(iter/s)": 0.315429 }, { "epoch": 4.592225609756097, "grad_norm": 1.1933104991912842, "learning_rate": 1.6321594266110074e-06, "loss": 0.09174495935440063, "memory(GiB)": 153.57, "step": 12050, "token_acc": 0.9678571428571429, "train_speed(iter/s)": 0.315444 }, { "epoch": 4.594131097560975, "grad_norm": 1.0169328451156616, "learning_rate": 1.6170238052547526e-06, "loss": 0.07566279172897339, "memory(GiB)": 153.57, "step": 12055, "token_acc": 0.9673276250563316, "train_speed(iter/s)": 0.315459 }, { "epoch": 4.596036585365853, "grad_norm": 1.2037487030029297, "learning_rate": 1.6019575367962559e-06, "loss": 0.1595145583152771, "memory(GiB)": 153.57, "step": 12060, "token_acc": 0.9364323718846261, "train_speed(iter/s)": 0.315469 }, { "epoch": 4.5979420731707314, "grad_norm": 2.2177767753601074, "learning_rate": 1.5869606428317363e-06, "loss": 0.11742309331893921, "memory(GiB)": 153.57, "step": 12065, "token_acc": 0.9525972182798751, "train_speed(iter/s)": 0.315487 }, { "epoch": 4.5998475609756095, "grad_norm": 1.3940255641937256, "learning_rate": 1.5720331448579706e-06, "loss": 0.08909921646118164, "memory(GiB)": 153.57, "step": 12070, "token_acc": 0.9557875345409886, "train_speed(iter/s)": 0.315505 }, { "epoch": 4.601753048780488, "grad_norm": 1.8842830657958984, "learning_rate": 1.5571750642722593e-06, "loss": 0.07524358034133911, "memory(GiB)": 153.57, "step": 12075, "token_acc": 0.9637681159420289, "train_speed(iter/s)": 0.315523 }, { "epoch": 4.603658536585366, "grad_norm": 1.4525853395462036, "learning_rate": 1.542386422372405e-06, "loss": 0.11698713302612304, "memory(GiB)": 153.57, "step": 12080, "token_acc": 0.9512595837897043, "train_speed(iter/s)": 0.315545 }, { "epoch": 4.605564024390244, "grad_norm": 1.2099117040634155, "learning_rate": 1.5276672403566783e-06, "loss": 0.09695632457733154, "memory(GiB)": 153.57, "step": 12085, "token_acc": 0.956975505857295, "train_speed(iter/s)": 0.315559 }, { "epoch": 4.607469512195122, "grad_norm": 1.1743661165237427, "learning_rate": 1.5130175393237745e-06, "loss": 0.08183284997940063, "memory(GiB)": 153.57, "step": 12090, "token_acc": 0.9678362573099415, "train_speed(iter/s)": 0.315542 }, { "epoch": 4.609375, "grad_norm": 1.1845287084579468, "learning_rate": 1.4984373402728014e-06, "loss": 0.0997454822063446, "memory(GiB)": 153.57, "step": 12095, "token_acc": 0.9631929046563192, "train_speed(iter/s)": 0.315566 }, { "epoch": 4.611280487804878, "grad_norm": 2.9476685523986816, "learning_rate": 1.4839266641032356e-06, "loss": 0.12173249721527099, "memory(GiB)": 153.57, "step": 12100, "token_acc": 0.9587431693989071, "train_speed(iter/s)": 0.315511 }, { "epoch": 4.613185975609756, "grad_norm": 1.1058326959609985, "learning_rate": 1.469485531614906e-06, "loss": 0.0715265691280365, "memory(GiB)": 153.57, "step": 12105, "token_acc": 0.9678010471204188, "train_speed(iter/s)": 0.315496 }, { "epoch": 4.615091463414634, "grad_norm": 2.002664566040039, "learning_rate": 1.455113963507948e-06, "loss": 0.10352752208709717, "memory(GiB)": 153.57, "step": 12110, "token_acc": 0.9567537208649256, "train_speed(iter/s)": 0.315517 }, { "epoch": 4.616996951219512, "grad_norm": 0.7982213497161865, "learning_rate": 1.4408119803827946e-06, "loss": 0.1286625862121582, "memory(GiB)": 153.57, "step": 12115, "token_acc": 0.9558618283321703, "train_speed(iter/s)": 0.315518 }, { "epoch": 4.6189024390243905, "grad_norm": 1.3165454864501953, "learning_rate": 1.426579602740108e-06, "loss": 0.10083202123641968, "memory(GiB)": 153.57, "step": 12120, "token_acc": 0.954453147664636, "train_speed(iter/s)": 0.315512 }, { "epoch": 4.6208079268292686, "grad_norm": 2.320981025695801, "learning_rate": 1.4124168509808245e-06, "loss": 0.10472342967987061, "memory(GiB)": 153.57, "step": 12125, "token_acc": 0.9572697003329633, "train_speed(iter/s)": 0.315537 }, { "epoch": 4.622713414634147, "grad_norm": 1.4983068704605103, "learning_rate": 1.3983237454060272e-06, "loss": 0.07675988078117371, "memory(GiB)": 153.57, "step": 12130, "token_acc": 0.9705615119025985, "train_speed(iter/s)": 0.315531 }, { "epoch": 4.624618902439025, "grad_norm": 1.3072235584259033, "learning_rate": 1.3843003062170012e-06, "loss": 0.12429177761077881, "memory(GiB)": 153.57, "step": 12135, "token_acc": 0.9448433420365535, "train_speed(iter/s)": 0.31555 }, { "epoch": 4.626524390243903, "grad_norm": 1.536137342453003, "learning_rate": 1.3703465535151504e-06, "loss": 0.09458630084991455, "memory(GiB)": 153.57, "step": 12140, "token_acc": 0.9631224260576563, "train_speed(iter/s)": 0.315557 }, { "epoch": 4.628429878048781, "grad_norm": 1.9445288181304932, "learning_rate": 1.3564625073020086e-06, "loss": 0.09158232808113098, "memory(GiB)": 153.57, "step": 12145, "token_acc": 0.9547625031414928, "train_speed(iter/s)": 0.315574 }, { "epoch": 4.630335365853659, "grad_norm": 2.2970845699310303, "learning_rate": 1.342648187479162e-06, "loss": 0.13230081796646118, "memory(GiB)": 153.57, "step": 12150, "token_acc": 0.9548633184996821, "train_speed(iter/s)": 0.315547 }, { "epoch": 4.632240853658536, "grad_norm": 1.5460474491119385, "learning_rate": 1.328903613848287e-06, "loss": 0.10423647165298462, "memory(GiB)": 153.57, "step": 12155, "token_acc": 0.9548670407416443, "train_speed(iter/s)": 0.315562 }, { "epoch": 4.634146341463414, "grad_norm": 1.1234873533248901, "learning_rate": 1.3152288061110518e-06, "loss": 0.0929557740688324, "memory(GiB)": 153.57, "step": 12160, "token_acc": 0.9656125268652134, "train_speed(iter/s)": 0.315536 }, { "epoch": 4.636051829268292, "grad_norm": 1.7610845565795898, "learning_rate": 1.3016237838691381e-06, "loss": 0.1232746958732605, "memory(GiB)": 153.57, "step": 12165, "token_acc": 0.9532630379025363, "train_speed(iter/s)": 0.315552 }, { "epoch": 4.6379573170731705, "grad_norm": 3.637580156326294, "learning_rate": 1.2880885666241905e-06, "loss": 0.09840003848075866, "memory(GiB)": 153.57, "step": 12170, "token_acc": 0.9619384819650365, "train_speed(iter/s)": 0.315568 }, { "epoch": 4.639862804878049, "grad_norm": 1.2687509059906006, "learning_rate": 1.2746231737778114e-06, "loss": 0.07762103080749512, "memory(GiB)": 153.57, "step": 12175, "token_acc": 0.9747670807453416, "train_speed(iter/s)": 0.315557 }, { "epoch": 4.641768292682927, "grad_norm": 1.1191585063934326, "learning_rate": 1.261227624631478e-06, "loss": 0.10541527271270752, "memory(GiB)": 153.57, "step": 12180, "token_acc": 0.9580178940123881, "train_speed(iter/s)": 0.315564 }, { "epoch": 4.643673780487805, "grad_norm": 1.2304526567459106, "learning_rate": 1.2479019383865975e-06, "loss": 0.07685758471488953, "memory(GiB)": 153.57, "step": 12185, "token_acc": 0.9629452926208651, "train_speed(iter/s)": 0.31556 }, { "epoch": 4.645579268292683, "grad_norm": 1.007375955581665, "learning_rate": 1.234646134144396e-06, "loss": 0.09075586795806885, "memory(GiB)": 153.57, "step": 12190, "token_acc": 0.9607783055002298, "train_speed(iter/s)": 0.315574 }, { "epoch": 4.647484756097561, "grad_norm": 1.4461005926132202, "learning_rate": 1.221460230905952e-06, "loss": 0.10922461748123169, "memory(GiB)": 153.57, "step": 12195, "token_acc": 0.9454271735039518, "train_speed(iter/s)": 0.315563 }, { "epoch": 4.649390243902439, "grad_norm": 1.8395615816116333, "learning_rate": 1.2083442475721352e-06, "loss": 0.08863595724105836, "memory(GiB)": 153.57, "step": 12200, "token_acc": 0.968833481745325, "train_speed(iter/s)": 0.315551 }, { "epoch": 4.651295731707317, "grad_norm": 2.1988134384155273, "learning_rate": 1.1952982029436066e-06, "loss": 0.06716342568397522, "memory(GiB)": 153.57, "step": 12205, "token_acc": 0.9681671226063169, "train_speed(iter/s)": 0.315564 }, { "epoch": 4.653201219512195, "grad_norm": 1.1138299703598022, "learning_rate": 1.1823221157207577e-06, "loss": 0.08431488871574402, "memory(GiB)": 153.57, "step": 12210, "token_acc": 0.9574635683339897, "train_speed(iter/s)": 0.315576 }, { "epoch": 4.655106707317073, "grad_norm": 1.771843671798706, "learning_rate": 1.1694160045037205e-06, "loss": 0.0955187737941742, "memory(GiB)": 153.57, "step": 12215, "token_acc": 0.9595758780649437, "train_speed(iter/s)": 0.315589 }, { "epoch": 4.657012195121951, "grad_norm": 1.2533818483352661, "learning_rate": 1.1565798877922973e-06, "loss": 0.06370720267295837, "memory(GiB)": 153.57, "step": 12220, "token_acc": 0.9775849602313811, "train_speed(iter/s)": 0.315613 }, { "epoch": 4.6589176829268295, "grad_norm": 1.3926712274551392, "learning_rate": 1.143813783985992e-06, "loss": 0.07421702146530151, "memory(GiB)": 153.57, "step": 12225, "token_acc": 0.9609713790112749, "train_speed(iter/s)": 0.315627 }, { "epoch": 4.660823170731708, "grad_norm": 1.4289295673370361, "learning_rate": 1.1311177113839167e-06, "loss": 0.11254609823226928, "memory(GiB)": 153.57, "step": 12230, "token_acc": 0.9595687331536388, "train_speed(iter/s)": 0.315641 }, { "epoch": 4.662728658536586, "grad_norm": 1.3401597738265991, "learning_rate": 1.118491688184825e-06, "loss": 0.12138086557388306, "memory(GiB)": 153.57, "step": 12235, "token_acc": 0.9509108341323106, "train_speed(iter/s)": 0.315647 }, { "epoch": 4.664634146341464, "grad_norm": 1.0057657957077026, "learning_rate": 1.1059357324870455e-06, "loss": 0.07105240225791931, "memory(GiB)": 153.57, "step": 12240, "token_acc": 0.9741676234213548, "train_speed(iter/s)": 0.315653 }, { "epoch": 4.666539634146341, "grad_norm": 1.087186336517334, "learning_rate": 1.093449862288476e-06, "loss": 0.10503919124603271, "memory(GiB)": 153.57, "step": 12245, "token_acc": 0.9587429492344883, "train_speed(iter/s)": 0.315625 }, { "epoch": 4.668445121951219, "grad_norm": 2.043968915939331, "learning_rate": 1.081034095486566e-06, "loss": 0.08738555312156678, "memory(GiB)": 153.57, "step": 12250, "token_acc": 0.9671728450200983, "train_speed(iter/s)": 0.315632 }, { "epoch": 4.670350609756097, "grad_norm": 1.4455041885375977, "learning_rate": 1.0686884498782524e-06, "loss": 0.09144260287284851, "memory(GiB)": 153.57, "step": 12255, "token_acc": 0.9599303135888502, "train_speed(iter/s)": 0.315651 }, { "epoch": 4.672256097560975, "grad_norm": 1.511194109916687, "learning_rate": 1.056412943159979e-06, "loss": 0.10552457571029664, "memory(GiB)": 153.57, "step": 12260, "token_acc": 0.9577960140679953, "train_speed(iter/s)": 0.315603 }, { "epoch": 4.674161585365853, "grad_norm": 2.150797128677368, "learning_rate": 1.0442075929276486e-06, "loss": 0.09276379346847534, "memory(GiB)": 153.57, "step": 12265, "token_acc": 0.9632602655140475, "train_speed(iter/s)": 0.315612 }, { "epoch": 4.6760670731707314, "grad_norm": 1.9270029067993164, "learning_rate": 1.0320724166765882e-06, "loss": 0.07742235064506531, "memory(GiB)": 153.57, "step": 12270, "token_acc": 0.9660363502845604, "train_speed(iter/s)": 0.315607 }, { "epoch": 4.6779725609756095, "grad_norm": 1.0726490020751953, "learning_rate": 1.020007431801545e-06, "loss": 0.12422355413436889, "memory(GiB)": 153.57, "step": 12275, "token_acc": 0.9368088467614534, "train_speed(iter/s)": 0.315622 }, { "epoch": 4.679878048780488, "grad_norm": 3.09430193901062, "learning_rate": 1.0080126555966574e-06, "loss": 0.11564691066741943, "memory(GiB)": 153.57, "step": 12280, "token_acc": 0.9535532994923858, "train_speed(iter/s)": 0.315635 }, { "epoch": 4.681783536585366, "grad_norm": 1.4459611177444458, "learning_rate": 9.960881052554215e-07, "loss": 0.11018012762069702, "memory(GiB)": 153.57, "step": 12285, "token_acc": 0.9573601303639326, "train_speed(iter/s)": 0.315643 }, { "epoch": 4.683689024390244, "grad_norm": 1.3405718803405762, "learning_rate": 9.842337978706706e-07, "loss": 0.07486891150474548, "memory(GiB)": 153.57, "step": 12290, "token_acc": 0.9706848030018762, "train_speed(iter/s)": 0.315654 }, { "epoch": 4.685594512195122, "grad_norm": 1.3623453378677368, "learning_rate": 9.724497504345398e-07, "loss": 0.0802572250366211, "memory(GiB)": 153.57, "step": 12295, "token_acc": 0.9601487778958555, "train_speed(iter/s)": 0.315673 }, { "epoch": 4.6875, "grad_norm": 2.8842337131500244, "learning_rate": 9.607359798384785e-07, "loss": 0.10089409351348877, "memory(GiB)": 153.57, "step": 12300, "token_acc": 0.9636389975373026, "train_speed(iter/s)": 0.315675 }, { "epoch": 4.689405487804878, "grad_norm": 1.9734485149383545, "learning_rate": 9.490925028731723e-07, "loss": 0.13307772874832152, "memory(GiB)": 153.57, "step": 12305, "token_acc": 0.9383494189481978, "train_speed(iter/s)": 0.315681 }, { "epoch": 4.691310975609756, "grad_norm": 2.551718235015869, "learning_rate": 9.37519336228565e-07, "loss": 0.12629077434539795, "memory(GiB)": 153.57, "step": 12310, "token_acc": 0.9446073639622027, "train_speed(iter/s)": 0.315697 }, { "epoch": 4.693216463414634, "grad_norm": 2.83219838142395, "learning_rate": 9.260164964938145e-07, "loss": 0.16304612159729004, "memory(GiB)": 153.57, "step": 12315, "token_acc": 0.9320209166410335, "train_speed(iter/s)": 0.315713 }, { "epoch": 4.695121951219512, "grad_norm": 1.5739445686340332, "learning_rate": 9.145840001572537e-07, "loss": 0.0804464042186737, "memory(GiB)": 153.57, "step": 12320, "token_acc": 0.9613726504569753, "train_speed(iter/s)": 0.315715 }, { "epoch": 4.6970274390243905, "grad_norm": 1.4919545650482178, "learning_rate": 9.032218636064072e-07, "loss": 0.1088001012802124, "memory(GiB)": 153.57, "step": 12325, "token_acc": 0.9623988576868158, "train_speed(iter/s)": 0.315725 }, { "epoch": 4.6989329268292686, "grad_norm": 1.0892140865325928, "learning_rate": 8.919301031279415e-07, "loss": 0.08367373943328857, "memory(GiB)": 153.57, "step": 12330, "token_acc": 0.9698443579766537, "train_speed(iter/s)": 0.315711 }, { "epoch": 4.700838414634147, "grad_norm": 1.0794316530227661, "learning_rate": 8.807087349076204e-07, "loss": 0.08880723714828491, "memory(GiB)": 153.57, "step": 12335, "token_acc": 0.9626111560226354, "train_speed(iter/s)": 0.315715 }, { "epoch": 4.702743902439025, "grad_norm": 1.4672832489013672, "learning_rate": 8.69557775030344e-07, "loss": 0.10711665153503418, "memory(GiB)": 153.57, "step": 12340, "token_acc": 0.9602272727272727, "train_speed(iter/s)": 0.315715 }, { "epoch": 4.704649390243903, "grad_norm": 0.9738861918449402, "learning_rate": 8.584772394800545e-07, "loss": 0.11072999238967896, "memory(GiB)": 153.57, "step": 12345, "token_acc": 0.9592128801431127, "train_speed(iter/s)": 0.315718 }, { "epoch": 4.706554878048781, "grad_norm": 1.1370614767074585, "learning_rate": 8.47467144139763e-07, "loss": 0.08112001419067383, "memory(GiB)": 153.57, "step": 12350, "token_acc": 0.9699184228416043, "train_speed(iter/s)": 0.315724 }, { "epoch": 4.708460365853659, "grad_norm": 3.3876492977142334, "learning_rate": 8.365275047915233e-07, "loss": 0.09961868524551391, "memory(GiB)": 153.57, "step": 12355, "token_acc": 0.9617286605031176, "train_speed(iter/s)": 0.31571 }, { "epoch": 4.710365853658536, "grad_norm": 1.5443370342254639, "learning_rate": 8.256583371163585e-07, "loss": 0.10417447090148926, "memory(GiB)": 153.57, "step": 12360, "token_acc": 0.9593430803124374, "train_speed(iter/s)": 0.315712 }, { "epoch": 4.712271341463414, "grad_norm": 1.5900300741195679, "learning_rate": 8.148596566943167e-07, "loss": 0.09912684559822083, "memory(GiB)": 153.57, "step": 12365, "token_acc": 0.9594350961538461, "train_speed(iter/s)": 0.315712 }, { "epoch": 4.714176829268292, "grad_norm": 1.6992629766464233, "learning_rate": 8.041314790043886e-07, "loss": 0.08645098805427551, "memory(GiB)": 153.57, "step": 12370, "token_acc": 0.9593023255813954, "train_speed(iter/s)": 0.315694 }, { "epoch": 4.7160823170731705, "grad_norm": 1.7543147802352905, "learning_rate": 7.934738194245061e-07, "loss": 0.09533692598342895, "memory(GiB)": 153.57, "step": 12375, "token_acc": 0.9593023255813954, "train_speed(iter/s)": 0.3157 }, { "epoch": 4.717987804878049, "grad_norm": 1.291985273361206, "learning_rate": 7.828866932315215e-07, "loss": 0.06921955943107605, "memory(GiB)": 153.57, "step": 12380, "token_acc": 0.9691528545119705, "train_speed(iter/s)": 0.315724 }, { "epoch": 4.719893292682927, "grad_norm": 1.9049361944198608, "learning_rate": 7.723701156011787e-07, "loss": 0.09360839724540711, "memory(GiB)": 153.57, "step": 12385, "token_acc": 0.9626490897677339, "train_speed(iter/s)": 0.315743 }, { "epoch": 4.721798780487805, "grad_norm": 1.344838261604309, "learning_rate": 7.61924101608108e-07, "loss": 0.08051227331161499, "memory(GiB)": 153.57, "step": 12390, "token_acc": 0.9683619160260201, "train_speed(iter/s)": 0.315725 }, { "epoch": 4.723704268292683, "grad_norm": 1.3414278030395508, "learning_rate": 7.515486662257765e-07, "loss": 0.07793418169021607, "memory(GiB)": 153.57, "step": 12395, "token_acc": 0.9625820180883136, "train_speed(iter/s)": 0.315729 }, { "epoch": 4.725609756097561, "grad_norm": 1.223118782043457, "learning_rate": 7.41243824326504e-07, "loss": 0.08979060649871826, "memory(GiB)": 153.57, "step": 12400, "token_acc": 0.9601930036188179, "train_speed(iter/s)": 0.315747 }, { "epoch": 4.727515243902439, "grad_norm": 1.4460698366165161, "learning_rate": 7.310095906813974e-07, "loss": 0.11026209592819214, "memory(GiB)": 153.57, "step": 12405, "token_acc": 0.9489695780176644, "train_speed(iter/s)": 0.315764 }, { "epoch": 4.729420731707317, "grad_norm": 1.7773253917694092, "learning_rate": 7.208459799603773e-07, "loss": 0.10490627288818359, "memory(GiB)": 153.57, "step": 12410, "token_acc": 0.9580877537655533, "train_speed(iter/s)": 0.315771 }, { "epoch": 4.731326219512195, "grad_norm": 2.547760486602783, "learning_rate": 7.107530067321011e-07, "loss": 0.1005556583404541, "memory(GiB)": 153.57, "step": 12415, "token_acc": 0.9537058685994856, "train_speed(iter/s)": 0.315764 }, { "epoch": 4.733231707317073, "grad_norm": 2.2227861881256104, "learning_rate": 7.007306854640072e-07, "loss": 0.12420713901519775, "memory(GiB)": 153.57, "step": 12420, "token_acc": 0.9516587677725118, "train_speed(iter/s)": 0.315776 }, { "epoch": 4.735137195121951, "grad_norm": 1.4570835828781128, "learning_rate": 6.907790305222317e-07, "loss": 0.07014179229736328, "memory(GiB)": 153.57, "step": 12425, "token_acc": 0.969913309535951, "train_speed(iter/s)": 0.315754 }, { "epoch": 4.7370426829268295, "grad_norm": 1.6058975458145142, "learning_rate": 6.808980561716415e-07, "loss": 0.07576755881309509, "memory(GiB)": 153.57, "step": 12430, "token_acc": 0.9673149207386827, "train_speed(iter/s)": 0.315756 }, { "epoch": 4.738948170731708, "grad_norm": 1.4564694166183472, "learning_rate": 6.710877765757629e-07, "loss": 0.10814146995544434, "memory(GiB)": 153.57, "step": 12435, "token_acc": 0.9596273291925466, "train_speed(iter/s)": 0.315751 }, { "epoch": 4.740853658536586, "grad_norm": 2.6422386169433594, "learning_rate": 6.613482057968024e-07, "loss": 0.11464170217514039, "memory(GiB)": 153.57, "step": 12440, "token_acc": 0.9532146091156052, "train_speed(iter/s)": 0.31576 }, { "epoch": 4.742759146341464, "grad_norm": 2.140021562576294, "learning_rate": 6.516793577956204e-07, "loss": 0.09199082255363464, "memory(GiB)": 153.57, "step": 12445, "token_acc": 0.9585889570552147, "train_speed(iter/s)": 0.315747 }, { "epoch": 4.744664634146341, "grad_norm": 1.5593761205673218, "learning_rate": 6.420812464316861e-07, "loss": 0.08068795204162597, "memory(GiB)": 153.57, "step": 12450, "token_acc": 0.9615453728661276, "train_speed(iter/s)": 0.315761 }, { "epoch": 4.746570121951219, "grad_norm": 3.3840434551239014, "learning_rate": 6.32553885463072e-07, "loss": 0.10263246297836304, "memory(GiB)": 153.57, "step": 12455, "token_acc": 0.964719717757742, "train_speed(iter/s)": 0.315734 }, { "epoch": 4.748475609756097, "grad_norm": 0.9616569876670837, "learning_rate": 6.230972885464537e-07, "loss": 0.09944815635681152, "memory(GiB)": 153.57, "step": 12460, "token_acc": 0.96327467482785, "train_speed(iter/s)": 0.315738 }, { "epoch": 4.750381097560975, "grad_norm": 3.4557268619537354, "learning_rate": 6.137114692370549e-07, "loss": 0.07758305668830871, "memory(GiB)": 153.57, "step": 12465, "token_acc": 0.9656029838375466, "train_speed(iter/s)": 0.315755 }, { "epoch": 4.752286585365853, "grad_norm": 3.347235918045044, "learning_rate": 6.043964409886582e-07, "loss": 0.09462991952896119, "memory(GiB)": 153.57, "step": 12470, "token_acc": 0.9705071664829107, "train_speed(iter/s)": 0.315775 }, { "epoch": 4.7541920731707314, "grad_norm": 1.105819582939148, "learning_rate": 5.951522171535606e-07, "loss": 0.08683058023452758, "memory(GiB)": 153.57, "step": 12475, "token_acc": 0.9653061224489796, "train_speed(iter/s)": 0.315795 }, { "epoch": 4.7560975609756095, "grad_norm": 1.4371346235275269, "learning_rate": 5.859788109825793e-07, "loss": 0.09814253449440002, "memory(GiB)": 153.57, "step": 12480, "token_acc": 0.9615786176654834, "train_speed(iter/s)": 0.315804 }, { "epoch": 4.758003048780488, "grad_norm": 2.3286550045013428, "learning_rate": 5.768762356250068e-07, "loss": 0.10367259979248047, "memory(GiB)": 153.57, "step": 12485, "token_acc": 0.953117674636662, "train_speed(iter/s)": 0.315816 }, { "epoch": 4.759908536585366, "grad_norm": 0.8985363245010376, "learning_rate": 5.678445041286229e-07, "loss": 0.06865269541740418, "memory(GiB)": 153.57, "step": 12490, "token_acc": 0.9776798825256975, "train_speed(iter/s)": 0.315832 }, { "epoch": 4.761814024390244, "grad_norm": 1.107334017753601, "learning_rate": 5.588836294396437e-07, "loss": 0.1064566731452942, "memory(GiB)": 153.57, "step": 12495, "token_acc": 0.9512769425828654, "train_speed(iter/s)": 0.315801 }, { "epoch": 4.763719512195122, "grad_norm": 1.2713805437088013, "learning_rate": 5.499936244027338e-07, "loss": 0.11147184371948242, "memory(GiB)": 153.57, "step": 12500, "token_acc": 0.9578175149014214, "train_speed(iter/s)": 0.315807 } ], "logging_steps": 5, "max_steps": 13120, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.316526694442418e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }