{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.989399293286219, "eval_steps": 500, "global_step": 2295, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021745039412883935, "grad_norm": 0.0746251568198204, "learning_rate": 1.9999990630761334e-05, "loss": 0.8001, "step": 1 }, { "epoch": 0.004349007882576787, "grad_norm": 0.07918819040060043, "learning_rate": 1.9999962523062883e-05, "loss": 0.7666, "step": 2 }, { "epoch": 0.006523511823865181, "grad_norm": 0.10933805257081985, "learning_rate": 1.999991567695732e-05, "loss": 0.8783, "step": 3 }, { "epoch": 0.008698015765153574, "grad_norm": 0.08125412464141846, "learning_rate": 1.9999850092532432e-05, "loss": 0.7695, "step": 4 }, { "epoch": 0.010872519706441968, "grad_norm": 0.07198456674814224, "learning_rate": 1.9999765769911108e-05, "loss": 0.8145, "step": 5 }, { "epoch": 0.013047023647730362, "grad_norm": 0.08905952423810959, "learning_rate": 1.9999662709251356e-05, "loss": 0.825, "step": 6 }, { "epoch": 0.015221527589018754, "grad_norm": 0.07000239193439484, "learning_rate": 1.99995409107463e-05, "loss": 0.911, "step": 7 }, { "epoch": 0.017396031530307148, "grad_norm": 0.08135883510112762, "learning_rate": 1.999940037462417e-05, "loss": 0.9274, "step": 8 }, { "epoch": 0.019570535471595544, "grad_norm": 0.09305227547883987, "learning_rate": 1.999924110114831e-05, "loss": 0.9341, "step": 9 }, { "epoch": 0.021745039412883936, "grad_norm": 0.10100866854190826, "learning_rate": 1.999906309061717e-05, "loss": 0.8917, "step": 10 }, { "epoch": 0.02391954335417233, "grad_norm": 0.07040048390626907, "learning_rate": 1.9998866343364317e-05, "loss": 0.8056, "step": 11 }, { "epoch": 0.026094047295460724, "grad_norm": 0.08670639991760254, "learning_rate": 1.999865085975843e-05, "loss": 0.8254, "step": 12 }, { "epoch": 0.028268551236749116, "grad_norm": 0.09604976326227188, "learning_rate": 1.9998416640203285e-05, "loss": 0.9074, "step": 13 }, { "epoch": 0.03044305517803751, "grad_norm": 0.07576121389865875, "learning_rate": 1.9998163685137785e-05, "loss": 0.7978, "step": 14 }, { "epoch": 0.0326175591193259, "grad_norm": 0.10403190553188324, "learning_rate": 1.9997891995035914e-05, "loss": 0.8231, "step": 15 }, { "epoch": 0.034792063060614296, "grad_norm": 0.09733453392982483, "learning_rate": 1.9997601570406792e-05, "loss": 0.9278, "step": 16 }, { "epoch": 0.03696656700190269, "grad_norm": 0.08796665072441101, "learning_rate": 1.999729241179462e-05, "loss": 0.7813, "step": 17 }, { "epoch": 0.03914107094319109, "grad_norm": 0.11550639569759369, "learning_rate": 1.999696451977872e-05, "loss": 0.9166, "step": 18 }, { "epoch": 0.041315574884479477, "grad_norm": 0.07925107330083847, "learning_rate": 1.9996617894973505e-05, "loss": 1.0912, "step": 19 }, { "epoch": 0.04349007882576787, "grad_norm": 0.08927574008703232, "learning_rate": 1.999625253802851e-05, "loss": 0.8584, "step": 20 }, { "epoch": 0.04566458276705627, "grad_norm": 0.07926242053508759, "learning_rate": 1.9995868449628346e-05, "loss": 0.8644, "step": 21 }, { "epoch": 0.04783908670834466, "grad_norm": 0.0827564224600792, "learning_rate": 1.999546563049274e-05, "loss": 0.8242, "step": 22 }, { "epoch": 0.05001359064963305, "grad_norm": 0.10352898389101028, "learning_rate": 1.9995044081376516e-05, "loss": 1.0451, "step": 23 }, { "epoch": 0.05218809459092145, "grad_norm": 0.0890275090932846, "learning_rate": 1.9994603803069594e-05, "loss": 0.8398, "step": 24 }, { "epoch": 0.05436259853220984, "grad_norm": 0.09530044347047806, "learning_rate": 1.9994144796396985e-05, "loss": 0.8129, "step": 25 }, { "epoch": 0.05653710247349823, "grad_norm": 0.10610915720462799, "learning_rate": 1.9993667062218796e-05, "loss": 0.8711, "step": 26 }, { "epoch": 0.05871160641478663, "grad_norm": 0.09329550713300705, "learning_rate": 1.9993170601430233e-05, "loss": 0.8755, "step": 27 }, { "epoch": 0.06088611035607502, "grad_norm": 0.08517653495073318, "learning_rate": 1.9992655414961583e-05, "loss": 0.7662, "step": 28 }, { "epoch": 0.06306061429736341, "grad_norm": 0.10181520879268646, "learning_rate": 1.9992121503778228e-05, "loss": 0.9588, "step": 29 }, { "epoch": 0.0652351182386518, "grad_norm": 0.09159471839666367, "learning_rate": 1.999156886888064e-05, "loss": 0.9369, "step": 30 }, { "epoch": 0.0674096221799402, "grad_norm": 0.08401357382535934, "learning_rate": 1.999099751130437e-05, "loss": 0.8405, "step": 31 }, { "epoch": 0.06958412612122859, "grad_norm": 0.10121188312768936, "learning_rate": 1.9990407432120055e-05, "loss": 1.0008, "step": 32 }, { "epoch": 0.07175863006251698, "grad_norm": 0.0957995057106018, "learning_rate": 1.9989798632433417e-05, "loss": 0.8342, "step": 33 }, { "epoch": 0.07393313400380538, "grad_norm": 0.1047530323266983, "learning_rate": 1.998917111338525e-05, "loss": 0.8884, "step": 34 }, { "epoch": 0.07610763794509377, "grad_norm": 0.10157911479473114, "learning_rate": 1.9988524876151425e-05, "loss": 0.9494, "step": 35 }, { "epoch": 0.07828214188638218, "grad_norm": 0.08752232044935226, "learning_rate": 1.99878599219429e-05, "loss": 0.7986, "step": 36 }, { "epoch": 0.08045664582767056, "grad_norm": 0.09770691394805908, "learning_rate": 1.9987176252005697e-05, "loss": 0.8944, "step": 37 }, { "epoch": 0.08263114976895895, "grad_norm": 0.09016908705234528, "learning_rate": 1.9986473867620905e-05, "loss": 0.7584, "step": 38 }, { "epoch": 0.08480565371024736, "grad_norm": 0.10123251378536224, "learning_rate": 1.9985752770104693e-05, "loss": 0.9032, "step": 39 }, { "epoch": 0.08698015765153574, "grad_norm": 0.11497762054204941, "learning_rate": 1.9985012960808275e-05, "loss": 0.8551, "step": 40 }, { "epoch": 0.08915466159282413, "grad_norm": 0.0881359800696373, "learning_rate": 1.9984254441117952e-05, "loss": 0.9751, "step": 41 }, { "epoch": 0.09132916553411254, "grad_norm": 0.10690237581729889, "learning_rate": 1.9983477212455074e-05, "loss": 0.8865, "step": 42 }, { "epoch": 0.09350366947540092, "grad_norm": 0.09461107105016708, "learning_rate": 1.9982681276276044e-05, "loss": 0.9014, "step": 43 }, { "epoch": 0.09567817341668931, "grad_norm": 0.10362497717142105, "learning_rate": 1.998186663407233e-05, "loss": 0.8372, "step": 44 }, { "epoch": 0.09785267735797772, "grad_norm": 0.09124545007944107, "learning_rate": 1.9981033287370443e-05, "loss": 0.8799, "step": 45 }, { "epoch": 0.1000271812992661, "grad_norm": 0.08993411809206009, "learning_rate": 1.998018123773195e-05, "loss": 0.8622, "step": 46 }, { "epoch": 0.1022016852405545, "grad_norm": 0.08710499852895737, "learning_rate": 1.997931048675346e-05, "loss": 0.7906, "step": 47 }, { "epoch": 0.1043761891818429, "grad_norm": 0.10102669894695282, "learning_rate": 1.997842103606663e-05, "loss": 0.8597, "step": 48 }, { "epoch": 0.10655069312313128, "grad_norm": 0.08813736587762833, "learning_rate": 1.9977512887338162e-05, "loss": 0.8763, "step": 49 }, { "epoch": 0.10872519706441967, "grad_norm": 0.10282223671674728, "learning_rate": 1.9976586042269776e-05, "loss": 0.8791, "step": 50 }, { "epoch": 0.11089970100570808, "grad_norm": 0.08153209835290909, "learning_rate": 1.9975640502598243e-05, "loss": 0.7912, "step": 51 }, { "epoch": 0.11307420494699646, "grad_norm": 0.08203904330730438, "learning_rate": 1.9974676270095362e-05, "loss": 0.8297, "step": 52 }, { "epoch": 0.11524870888828485, "grad_norm": 0.09119073301553726, "learning_rate": 1.997369334656796e-05, "loss": 0.831, "step": 53 }, { "epoch": 0.11742321282957326, "grad_norm": 0.10068037360906601, "learning_rate": 1.997269173385788e-05, "loss": 0.8284, "step": 54 }, { "epoch": 0.11959771677086165, "grad_norm": 0.10159336775541306, "learning_rate": 1.9971671433842e-05, "loss": 0.8546, "step": 55 }, { "epoch": 0.12177222071215003, "grad_norm": 0.09355925768613815, "learning_rate": 1.9970632448432198e-05, "loss": 0.9363, "step": 56 }, { "epoch": 0.12394672465343844, "grad_norm": 0.09793829172849655, "learning_rate": 1.9969574779575377e-05, "loss": 0.8599, "step": 57 }, { "epoch": 0.12612122859472683, "grad_norm": 0.12197580188512802, "learning_rate": 1.9968498429253453e-05, "loss": 0.952, "step": 58 }, { "epoch": 0.12829573253601523, "grad_norm": 0.10625457018613815, "learning_rate": 1.9967403399483336e-05, "loss": 0.7529, "step": 59 }, { "epoch": 0.1304702364773036, "grad_norm": 0.10031311213970184, "learning_rate": 1.9966289692316944e-05, "loss": 0.8112, "step": 60 }, { "epoch": 0.132644740418592, "grad_norm": 0.09758803248405457, "learning_rate": 1.9965157309841202e-05, "loss": 0.771, "step": 61 }, { "epoch": 0.1348192443598804, "grad_norm": 0.08177502453327179, "learning_rate": 1.996400625417802e-05, "loss": 0.8754, "step": 62 }, { "epoch": 0.13699374830116878, "grad_norm": 0.11524152755737305, "learning_rate": 1.9962836527484296e-05, "loss": 0.9539, "step": 63 }, { "epoch": 0.13916825224245719, "grad_norm": 0.09907615929841995, "learning_rate": 1.9961648131951924e-05, "loss": 0.7581, "step": 64 }, { "epoch": 0.1413427561837456, "grad_norm": 0.10424213111400604, "learning_rate": 1.9960441069807778e-05, "loss": 0.8877, "step": 65 }, { "epoch": 0.14351726012503396, "grad_norm": 0.08328958600759506, "learning_rate": 1.9959215343313703e-05, "loss": 0.8172, "step": 66 }, { "epoch": 0.14569176406632237, "grad_norm": 0.07888278365135193, "learning_rate": 1.9957970954766528e-05, "loss": 0.7979, "step": 67 }, { "epoch": 0.14786626800761077, "grad_norm": 0.09226617962121964, "learning_rate": 1.9956707906498046e-05, "loss": 0.9024, "step": 68 }, { "epoch": 0.15004077194889917, "grad_norm": 0.09333748370409012, "learning_rate": 1.995542620087502e-05, "loss": 0.954, "step": 69 }, { "epoch": 0.15221527589018755, "grad_norm": 0.09112944453954697, "learning_rate": 1.9954125840299165e-05, "loss": 0.8928, "step": 70 }, { "epoch": 0.15438977983147595, "grad_norm": 0.08967623114585876, "learning_rate": 1.995280682720716e-05, "loss": 0.7449, "step": 71 }, { "epoch": 0.15656428377276435, "grad_norm": 0.08987925946712494, "learning_rate": 1.9951469164070647e-05, "loss": 0.7786, "step": 72 }, { "epoch": 0.15873878771405273, "grad_norm": 0.08351942151784897, "learning_rate": 1.9950112853396188e-05, "loss": 0.7635, "step": 73 }, { "epoch": 0.16091329165534113, "grad_norm": 0.08796276152133942, "learning_rate": 1.9948737897725307e-05, "loss": 0.8201, "step": 74 }, { "epoch": 0.16308779559662953, "grad_norm": 0.08839084953069687, "learning_rate": 1.9947344299634464e-05, "loss": 0.7452, "step": 75 }, { "epoch": 0.1652622995379179, "grad_norm": 0.09375880658626556, "learning_rate": 1.9945932061735047e-05, "loss": 0.8057, "step": 76 }, { "epoch": 0.1674368034792063, "grad_norm": 0.09026333689689636, "learning_rate": 1.9944501186673378e-05, "loss": 0.725, "step": 77 }, { "epoch": 0.1696113074204947, "grad_norm": 0.09130623936653137, "learning_rate": 1.9943051677130696e-05, "loss": 0.7448, "step": 78 }, { "epoch": 0.1717858113617831, "grad_norm": 0.09838429093360901, "learning_rate": 1.9941583535823162e-05, "loss": 0.7849, "step": 79 }, { "epoch": 0.1739603153030715, "grad_norm": 0.1016785278916359, "learning_rate": 1.994009676550185e-05, "loss": 0.8326, "step": 80 }, { "epoch": 0.1761348192443599, "grad_norm": 0.09392695128917694, "learning_rate": 1.993859136895274e-05, "loss": 0.7594, "step": 81 }, { "epoch": 0.17830932318564827, "grad_norm": 0.0911640077829361, "learning_rate": 1.9937067348996716e-05, "loss": 0.814, "step": 82 }, { "epoch": 0.18048382712693667, "grad_norm": 0.08212007582187653, "learning_rate": 1.9935524708489556e-05, "loss": 0.8072, "step": 83 }, { "epoch": 0.18265833106822507, "grad_norm": 0.10156916081905365, "learning_rate": 1.9933963450321944e-05, "loss": 0.8346, "step": 84 }, { "epoch": 0.18483283500951345, "grad_norm": 0.08526447415351868, "learning_rate": 1.9932383577419432e-05, "loss": 0.8108, "step": 85 }, { "epoch": 0.18700733895080185, "grad_norm": 0.08526138961315155, "learning_rate": 1.9930785092742462e-05, "loss": 0.8107, "step": 86 }, { "epoch": 0.18918184289209025, "grad_norm": 0.09638401120901108, "learning_rate": 1.992916799928635e-05, "loss": 0.7286, "step": 87 }, { "epoch": 0.19135634683337863, "grad_norm": 0.13680808246135712, "learning_rate": 1.9927532300081286e-05, "loss": 0.9724, "step": 88 }, { "epoch": 0.19353085077466703, "grad_norm": 0.09739268571138382, "learning_rate": 1.9925877998192318e-05, "loss": 0.8019, "step": 89 }, { "epoch": 0.19570535471595543, "grad_norm": 0.10523343086242676, "learning_rate": 1.992420509671936e-05, "loss": 0.8982, "step": 90 }, { "epoch": 0.1978798586572438, "grad_norm": 0.09571706503629684, "learning_rate": 1.992251359879717e-05, "loss": 0.7892, "step": 91 }, { "epoch": 0.2000543625985322, "grad_norm": 0.09328663349151611, "learning_rate": 1.992080350759536e-05, "loss": 0.8497, "step": 92 }, { "epoch": 0.2022288665398206, "grad_norm": 0.09873535484075546, "learning_rate": 1.991907482631838e-05, "loss": 0.7951, "step": 93 }, { "epoch": 0.204403370481109, "grad_norm": 0.09133511036634445, "learning_rate": 1.9917327558205517e-05, "loss": 0.765, "step": 94 }, { "epoch": 0.2065778744223974, "grad_norm": 0.08904144167900085, "learning_rate": 1.9915561706530882e-05, "loss": 0.7245, "step": 95 }, { "epoch": 0.2087523783636858, "grad_norm": 0.08898426592350006, "learning_rate": 1.991377727460342e-05, "loss": 0.7938, "step": 96 }, { "epoch": 0.21092688230497417, "grad_norm": 0.08664948493242264, "learning_rate": 1.991197426576687e-05, "loss": 0.7062, "step": 97 }, { "epoch": 0.21310138624626257, "grad_norm": 0.09750048071146011, "learning_rate": 1.991015268339981e-05, "loss": 0.7252, "step": 98 }, { "epoch": 0.21527589018755097, "grad_norm": 0.09408871829509735, "learning_rate": 1.9908312530915603e-05, "loss": 0.9524, "step": 99 }, { "epoch": 0.21745039412883935, "grad_norm": 0.07219725102186203, "learning_rate": 1.9906453811762415e-05, "loss": 0.6776, "step": 100 }, { "epoch": 0.21962489807012775, "grad_norm": 0.10059143602848053, "learning_rate": 1.9904576529423202e-05, "loss": 0.7322, "step": 101 }, { "epoch": 0.22179940201141615, "grad_norm": 0.08263805508613586, "learning_rate": 1.9902680687415704e-05, "loss": 0.7835, "step": 102 }, { "epoch": 0.22397390595270453, "grad_norm": 0.10348347574472427, "learning_rate": 1.9900766289292442e-05, "loss": 0.7486, "step": 103 }, { "epoch": 0.22614840989399293, "grad_norm": 0.08329135924577713, "learning_rate": 1.989883333864071e-05, "loss": 0.7435, "step": 104 }, { "epoch": 0.22832291383528133, "grad_norm": 0.0918523445725441, "learning_rate": 1.9896881839082554e-05, "loss": 0.7355, "step": 105 }, { "epoch": 0.2304974177765697, "grad_norm": 0.08226175606250763, "learning_rate": 1.9894911794274797e-05, "loss": 0.7649, "step": 106 }, { "epoch": 0.2326719217178581, "grad_norm": 0.08478476107120514, "learning_rate": 1.9892923207908997e-05, "loss": 0.7044, "step": 107 }, { "epoch": 0.2348464256591465, "grad_norm": 0.0854225903749466, "learning_rate": 1.9890916083711463e-05, "loss": 0.7282, "step": 108 }, { "epoch": 0.2370209296004349, "grad_norm": 0.09152776747941971, "learning_rate": 1.9888890425443238e-05, "loss": 0.8018, "step": 109 }, { "epoch": 0.2391954335417233, "grad_norm": 0.09559736400842667, "learning_rate": 1.9886846236900102e-05, "loss": 0.8165, "step": 110 }, { "epoch": 0.2413699374830117, "grad_norm": 0.09022340923547745, "learning_rate": 1.9884783521912554e-05, "loss": 0.7237, "step": 111 }, { "epoch": 0.24354444142430007, "grad_norm": 0.08863180875778198, "learning_rate": 1.9882702284345803e-05, "loss": 0.7368, "step": 112 }, { "epoch": 0.24571894536558847, "grad_norm": 0.08581551909446716, "learning_rate": 1.988060252809977e-05, "loss": 0.6956, "step": 113 }, { "epoch": 0.24789344930687687, "grad_norm": 0.09068078547716141, "learning_rate": 1.9878484257109085e-05, "loss": 0.7093, "step": 114 }, { "epoch": 0.2500679532481653, "grad_norm": 0.09168930351734161, "learning_rate": 1.9876347475343062e-05, "loss": 0.7185, "step": 115 }, { "epoch": 0.25224245718945365, "grad_norm": 0.09128735214471817, "learning_rate": 1.98741921868057e-05, "loss": 0.7097, "step": 116 }, { "epoch": 0.254416961130742, "grad_norm": 0.08103678375482559, "learning_rate": 1.9872018395535694e-05, "loss": 0.6781, "step": 117 }, { "epoch": 0.25659146507203046, "grad_norm": 0.08674647659063339, "learning_rate": 1.9869826105606382e-05, "loss": 0.7095, "step": 118 }, { "epoch": 0.25876596901331883, "grad_norm": 0.08077181130647659, "learning_rate": 1.9867615321125796e-05, "loss": 0.7469, "step": 119 }, { "epoch": 0.2609404729546072, "grad_norm": 0.08557458966970444, "learning_rate": 1.9865386046236597e-05, "loss": 0.6935, "step": 120 }, { "epoch": 0.26311497689589564, "grad_norm": 0.08891908824443817, "learning_rate": 1.9863138285116115e-05, "loss": 0.7734, "step": 121 }, { "epoch": 0.265289480837184, "grad_norm": 0.08347958326339722, "learning_rate": 1.986087204197631e-05, "loss": 0.6439, "step": 122 }, { "epoch": 0.2674639847784724, "grad_norm": 0.09328535944223404, "learning_rate": 1.9858587321063777e-05, "loss": 0.7723, "step": 123 }, { "epoch": 0.2696384887197608, "grad_norm": 0.09118448197841644, "learning_rate": 1.985628412665973e-05, "loss": 0.7432, "step": 124 }, { "epoch": 0.2718129926610492, "grad_norm": 0.08305443078279495, "learning_rate": 1.9853962463080013e-05, "loss": 0.7282, "step": 125 }, { "epoch": 0.27398749660233757, "grad_norm": 0.09921488165855408, "learning_rate": 1.9851622334675065e-05, "loss": 0.8408, "step": 126 }, { "epoch": 0.276162000543626, "grad_norm": 0.10361591726541519, "learning_rate": 1.9849263745829934e-05, "loss": 0.675, "step": 127 }, { "epoch": 0.27833650448491437, "grad_norm": 0.08359205722808838, "learning_rate": 1.984688670096425e-05, "loss": 0.7451, "step": 128 }, { "epoch": 0.28051100842620275, "grad_norm": 0.09871591627597809, "learning_rate": 1.9844491204532238e-05, "loss": 0.8347, "step": 129 }, { "epoch": 0.2826855123674912, "grad_norm": 0.10117359459400177, "learning_rate": 1.984207726102269e-05, "loss": 0.7833, "step": 130 }, { "epoch": 0.28486001630877955, "grad_norm": 0.08958233147859573, "learning_rate": 1.9839644874958976e-05, "loss": 0.7106, "step": 131 }, { "epoch": 0.2870345202500679, "grad_norm": 0.08439264446496964, "learning_rate": 1.983719405089901e-05, "loss": 0.7232, "step": 132 }, { "epoch": 0.28920902419135636, "grad_norm": 0.09085080027580261, "learning_rate": 1.983472479343526e-05, "loss": 0.7498, "step": 133 }, { "epoch": 0.29138352813264473, "grad_norm": 0.08543995022773743, "learning_rate": 1.983223710719475e-05, "loss": 0.7632, "step": 134 }, { "epoch": 0.29355803207393316, "grad_norm": 0.08209240436553955, "learning_rate": 1.982973099683902e-05, "loss": 0.5842, "step": 135 }, { "epoch": 0.29573253601522154, "grad_norm": 0.08984076231718063, "learning_rate": 1.9827206467064133e-05, "loss": 0.6498, "step": 136 }, { "epoch": 0.2979070399565099, "grad_norm": 0.08449489623308182, "learning_rate": 1.9824663522600682e-05, "loss": 0.7184, "step": 137 }, { "epoch": 0.30008154389779834, "grad_norm": 0.08838928490877151, "learning_rate": 1.9822102168213754e-05, "loss": 0.6791, "step": 138 }, { "epoch": 0.3022560478390867, "grad_norm": 0.09869147837162018, "learning_rate": 1.9819522408702937e-05, "loss": 0.7408, "step": 139 }, { "epoch": 0.3044305517803751, "grad_norm": 0.09267155081033707, "learning_rate": 1.9816924248902304e-05, "loss": 0.7771, "step": 140 }, { "epoch": 0.3066050557216635, "grad_norm": 0.09889409691095352, "learning_rate": 1.981430769368042e-05, "loss": 0.6345, "step": 141 }, { "epoch": 0.3087795596629519, "grad_norm": 0.0928419828414917, "learning_rate": 1.9811672747940303e-05, "loss": 0.7073, "step": 142 }, { "epoch": 0.31095406360424027, "grad_norm": 0.08259497582912445, "learning_rate": 1.9809019416619445e-05, "loss": 0.7148, "step": 143 }, { "epoch": 0.3131285675455287, "grad_norm": 0.1215728223323822, "learning_rate": 1.9806347704689778e-05, "loss": 0.7034, "step": 144 }, { "epoch": 0.3153030714868171, "grad_norm": 0.08006751537322998, "learning_rate": 1.9803657617157693e-05, "loss": 0.6339, "step": 145 }, { "epoch": 0.31747757542810545, "grad_norm": 0.09259115159511566, "learning_rate": 1.9800949159063995e-05, "loss": 0.6903, "step": 146 }, { "epoch": 0.3196520793693939, "grad_norm": 0.10012371093034744, "learning_rate": 1.9798222335483933e-05, "loss": 0.7025, "step": 147 }, { "epoch": 0.32182658331068226, "grad_norm": 0.08394313603639603, "learning_rate": 1.9795477151527148e-05, "loss": 0.586, "step": 148 }, { "epoch": 0.32400108725197063, "grad_norm": 0.08780057728290558, "learning_rate": 1.9792713612337702e-05, "loss": 0.8031, "step": 149 }, { "epoch": 0.32617559119325906, "grad_norm": 0.08117102831602097, "learning_rate": 1.9789931723094046e-05, "loss": 0.6318, "step": 150 }, { "epoch": 0.32835009513454744, "grad_norm": 0.07488783448934555, "learning_rate": 1.978713148900902e-05, "loss": 0.6555, "step": 151 }, { "epoch": 0.3305245990758358, "grad_norm": 0.09364376217126846, "learning_rate": 1.978431291532983e-05, "loss": 0.6419, "step": 152 }, { "epoch": 0.33269910301712424, "grad_norm": 0.08771781623363495, "learning_rate": 1.9781476007338058e-05, "loss": 0.7657, "step": 153 }, { "epoch": 0.3348736069584126, "grad_norm": 0.08260004967451096, "learning_rate": 1.9778620770349637e-05, "loss": 0.6761, "step": 154 }, { "epoch": 0.337048110899701, "grad_norm": 0.07586755603551865, "learning_rate": 1.9775747209714847e-05, "loss": 0.6401, "step": 155 }, { "epoch": 0.3392226148409894, "grad_norm": 0.10377306491136551, "learning_rate": 1.97728553308183e-05, "loss": 0.6477, "step": 156 }, { "epoch": 0.3413971187822778, "grad_norm": 0.08181344717741013, "learning_rate": 1.9769945139078942e-05, "loss": 0.7153, "step": 157 }, { "epoch": 0.3435716227235662, "grad_norm": 0.07357674837112427, "learning_rate": 1.9767016639950027e-05, "loss": 0.5822, "step": 158 }, { "epoch": 0.3457461266648546, "grad_norm": 0.08460139483213425, "learning_rate": 1.976406983891911e-05, "loss": 0.6967, "step": 159 }, { "epoch": 0.347920630606143, "grad_norm": 0.08439616858959198, "learning_rate": 1.976110474150806e-05, "loss": 0.6993, "step": 160 }, { "epoch": 0.35009513454743135, "grad_norm": 0.08121046423912048, "learning_rate": 1.9758121353273004e-05, "loss": 0.678, "step": 161 }, { "epoch": 0.3522696384887198, "grad_norm": 0.08539711683988571, "learning_rate": 1.975511967980437e-05, "loss": 0.7481, "step": 162 }, { "epoch": 0.35444414243000816, "grad_norm": 0.0795116275548935, "learning_rate": 1.975209972672683e-05, "loss": 0.6408, "step": 163 }, { "epoch": 0.35661864637129653, "grad_norm": 0.07473909854888916, "learning_rate": 1.9749061499699315e-05, "loss": 0.6538, "step": 164 }, { "epoch": 0.35879315031258496, "grad_norm": 0.09923236817121506, "learning_rate": 1.9746005004415004e-05, "loss": 0.6614, "step": 165 }, { "epoch": 0.36096765425387334, "grad_norm": 0.09059952199459076, "learning_rate": 1.9742930246601305e-05, "loss": 0.7641, "step": 166 }, { "epoch": 0.3631421581951617, "grad_norm": 0.07747036218643188, "learning_rate": 1.973983723201984e-05, "loss": 0.6679, "step": 167 }, { "epoch": 0.36531666213645014, "grad_norm": 0.07538899034261703, "learning_rate": 1.973672596646645e-05, "loss": 0.6136, "step": 168 }, { "epoch": 0.3674911660777385, "grad_norm": 0.08175907284021378, "learning_rate": 1.9733596455771176e-05, "loss": 0.6766, "step": 169 }, { "epoch": 0.3696656700190269, "grad_norm": 0.08192688971757889, "learning_rate": 1.973044870579824e-05, "loss": 0.6843, "step": 170 }, { "epoch": 0.3718401739603153, "grad_norm": 0.09001598507165909, "learning_rate": 1.972728272244605e-05, "loss": 0.7357, "step": 171 }, { "epoch": 0.3740146779016037, "grad_norm": 0.12252175062894821, "learning_rate": 1.9724098511647173e-05, "loss": 0.7664, "step": 172 }, { "epoch": 0.3761891818428921, "grad_norm": 0.08195328712463379, "learning_rate": 1.9720896079368338e-05, "loss": 0.7209, "step": 173 }, { "epoch": 0.3783636857841805, "grad_norm": 0.07590065896511078, "learning_rate": 1.9717675431610413e-05, "loss": 0.6278, "step": 174 }, { "epoch": 0.3805381897254689, "grad_norm": 0.08293472975492477, "learning_rate": 1.9714436574408408e-05, "loss": 0.5872, "step": 175 }, { "epoch": 0.38271269366675725, "grad_norm": 0.07701645791530609, "learning_rate": 1.971117951383144e-05, "loss": 0.6297, "step": 176 }, { "epoch": 0.3848871976080457, "grad_norm": 0.07831201702356339, "learning_rate": 1.9707904255982748e-05, "loss": 0.6371, "step": 177 }, { "epoch": 0.38706170154933406, "grad_norm": 0.07456380128860474, "learning_rate": 1.9704610806999668e-05, "loss": 0.6411, "step": 178 }, { "epoch": 0.38923620549062243, "grad_norm": 0.16381080448627472, "learning_rate": 1.9701299173053616e-05, "loss": 0.5942, "step": 179 }, { "epoch": 0.39141070943191086, "grad_norm": 0.08868569135665894, "learning_rate": 1.9697969360350098e-05, "loss": 0.7498, "step": 180 }, { "epoch": 0.39358521337319924, "grad_norm": 0.08234266191720963, "learning_rate": 1.969462137512867e-05, "loss": 0.6521, "step": 181 }, { "epoch": 0.3957597173144876, "grad_norm": 0.07368024438619614, "learning_rate": 1.969125522366295e-05, "loss": 0.6631, "step": 182 }, { "epoch": 0.39793422125577604, "grad_norm": 0.08606752753257751, "learning_rate": 1.968787091226059e-05, "loss": 0.6137, "step": 183 }, { "epoch": 0.4001087251970644, "grad_norm": 0.09288710355758667, "learning_rate": 1.9684468447263276e-05, "loss": 0.7181, "step": 184 }, { "epoch": 0.4022832291383528, "grad_norm": 0.08670896291732788, "learning_rate": 1.9681047835046708e-05, "loss": 0.6263, "step": 185 }, { "epoch": 0.4044577330796412, "grad_norm": 0.07927229255437851, "learning_rate": 1.96776090820206e-05, "loss": 0.7089, "step": 186 }, { "epoch": 0.4066322370209296, "grad_norm": 0.07195180654525757, "learning_rate": 1.967415219462864e-05, "loss": 0.5897, "step": 187 }, { "epoch": 0.408806740962218, "grad_norm": 0.08431829512119293, "learning_rate": 1.967067717934852e-05, "loss": 0.7097, "step": 188 }, { "epoch": 0.4109812449035064, "grad_norm": 0.08400678634643555, "learning_rate": 1.9667184042691877e-05, "loss": 0.8058, "step": 189 }, { "epoch": 0.4131557488447948, "grad_norm": 0.07457119971513748, "learning_rate": 1.9663672791204328e-05, "loss": 0.6031, "step": 190 }, { "epoch": 0.41533025278608315, "grad_norm": 0.08521685749292374, "learning_rate": 1.9660143431465416e-05, "loss": 0.6546, "step": 191 }, { "epoch": 0.4175047567273716, "grad_norm": 0.09003946930170059, "learning_rate": 1.9656595970088627e-05, "loss": 0.6907, "step": 192 }, { "epoch": 0.41967926066865996, "grad_norm": 0.08383707702159882, "learning_rate": 1.9653030413721366e-05, "loss": 0.6861, "step": 193 }, { "epoch": 0.42185376460994833, "grad_norm": 0.08015389740467072, "learning_rate": 1.964944676904494e-05, "loss": 0.5587, "step": 194 }, { "epoch": 0.42402826855123676, "grad_norm": 0.08410577476024628, "learning_rate": 1.9645845042774555e-05, "loss": 0.5389, "step": 195 }, { "epoch": 0.42620277249252514, "grad_norm": 0.08081209659576416, "learning_rate": 1.9642225241659294e-05, "loss": 0.573, "step": 196 }, { "epoch": 0.4283772764338135, "grad_norm": 0.08114443719387054, "learning_rate": 1.9638587372482115e-05, "loss": 0.651, "step": 197 }, { "epoch": 0.43055178037510194, "grad_norm": 0.08458591252565384, "learning_rate": 1.9634931442059833e-05, "loss": 0.7571, "step": 198 }, { "epoch": 0.4327262843163903, "grad_norm": 0.08143232017755508, "learning_rate": 1.96312574572431e-05, "loss": 0.6826, "step": 199 }, { "epoch": 0.4349007882576787, "grad_norm": 0.08297967165708542, "learning_rate": 1.962756542491641e-05, "loss": 0.7914, "step": 200 }, { "epoch": 0.4370752921989671, "grad_norm": 0.08793352544307709, "learning_rate": 1.962385535199807e-05, "loss": 0.6239, "step": 201 }, { "epoch": 0.4392497961402555, "grad_norm": 0.08785484731197357, "learning_rate": 1.962012724544019e-05, "loss": 0.6604, "step": 202 }, { "epoch": 0.4414243000815439, "grad_norm": 0.07696893811225891, "learning_rate": 1.9616381112228666e-05, "loss": 0.5864, "step": 203 }, { "epoch": 0.4435988040228323, "grad_norm": 0.07822995632886887, "learning_rate": 1.961261695938319e-05, "loss": 0.6511, "step": 204 }, { "epoch": 0.4457733079641207, "grad_norm": 0.07448404282331467, "learning_rate": 1.960883479395721e-05, "loss": 0.5544, "step": 205 }, { "epoch": 0.44794781190540905, "grad_norm": 0.08035268634557724, "learning_rate": 1.960503462303793e-05, "loss": 0.6708, "step": 206 }, { "epoch": 0.4501223158466975, "grad_norm": 0.07720299810171127, "learning_rate": 1.9601216453746285e-05, "loss": 0.6944, "step": 207 }, { "epoch": 0.45229681978798586, "grad_norm": 0.08583532273769379, "learning_rate": 1.9597380293236947e-05, "loss": 0.6353, "step": 208 }, { "epoch": 0.45447132372927423, "grad_norm": 0.10567526519298553, "learning_rate": 1.9593526148698293e-05, "loss": 0.9502, "step": 209 }, { "epoch": 0.45664582767056267, "grad_norm": 0.0733906626701355, "learning_rate": 1.9589654027352412e-05, "loss": 0.5567, "step": 210 }, { "epoch": 0.45882033161185104, "grad_norm": 0.07252166420221329, "learning_rate": 1.9585763936455066e-05, "loss": 0.5526, "step": 211 }, { "epoch": 0.4609948355531394, "grad_norm": 0.0754428505897522, "learning_rate": 1.9581855883295685e-05, "loss": 0.5759, "step": 212 }, { "epoch": 0.46316933949442785, "grad_norm": 0.09258468449115753, "learning_rate": 1.9577929875197376e-05, "loss": 0.6412, "step": 213 }, { "epoch": 0.4653438434357162, "grad_norm": 0.07481358200311661, "learning_rate": 1.9573985919516875e-05, "loss": 0.5361, "step": 214 }, { "epoch": 0.4675183473770046, "grad_norm": 0.08345936238765717, "learning_rate": 1.957002402364456e-05, "loss": 0.7947, "step": 215 }, { "epoch": 0.469692851318293, "grad_norm": 0.08155124634504318, "learning_rate": 1.956604419500441e-05, "loss": 0.5559, "step": 216 }, { "epoch": 0.4718673552595814, "grad_norm": 0.08983232825994492, "learning_rate": 1.9562046441054026e-05, "loss": 0.6809, "step": 217 }, { "epoch": 0.4740418592008698, "grad_norm": 0.0778331384062767, "learning_rate": 1.955803076928459e-05, "loss": 0.7657, "step": 218 }, { "epoch": 0.4762163631421582, "grad_norm": 0.08158181607723236, "learning_rate": 1.9553997187220856e-05, "loss": 0.6081, "step": 219 }, { "epoch": 0.4783908670834466, "grad_norm": 0.07603444159030914, "learning_rate": 1.9549945702421144e-05, "loss": 0.5501, "step": 220 }, { "epoch": 0.48056537102473496, "grad_norm": 0.07621001452207565, "learning_rate": 1.954587632247732e-05, "loss": 0.5913, "step": 221 }, { "epoch": 0.4827398749660234, "grad_norm": 0.06982841342687607, "learning_rate": 1.9541789055014786e-05, "loss": 0.5917, "step": 222 }, { "epoch": 0.48491437890731176, "grad_norm": 0.07415331900119781, "learning_rate": 1.953768390769245e-05, "loss": 0.5418, "step": 223 }, { "epoch": 0.48708888284860014, "grad_norm": 0.0779222920536995, "learning_rate": 1.9533560888202742e-05, "loss": 0.6014, "step": 224 }, { "epoch": 0.48926338678988857, "grad_norm": 0.0805523693561554, "learning_rate": 1.9529420004271568e-05, "loss": 0.7661, "step": 225 }, { "epoch": 0.49143789073117694, "grad_norm": 0.07758031040430069, "learning_rate": 1.9525261263658315e-05, "loss": 0.5546, "step": 226 }, { "epoch": 0.49361239467246537, "grad_norm": 0.08573088049888611, "learning_rate": 1.9521084674155828e-05, "loss": 0.612, "step": 227 }, { "epoch": 0.49578689861375375, "grad_norm": 0.08065496385097504, "learning_rate": 1.95168902435904e-05, "loss": 0.6151, "step": 228 }, { "epoch": 0.4979614025550421, "grad_norm": 0.07862064987421036, "learning_rate": 1.951267797982176e-05, "loss": 0.7336, "step": 229 }, { "epoch": 0.5001359064963306, "grad_norm": 0.07799354195594788, "learning_rate": 1.950844789074305e-05, "loss": 0.564, "step": 230 }, { "epoch": 0.5023104104376189, "grad_norm": 0.08205103129148483, "learning_rate": 1.9504199984280802e-05, "loss": 0.6513, "step": 231 }, { "epoch": 0.5044849143789073, "grad_norm": 0.0951075628399849, "learning_rate": 1.949993426839495e-05, "loss": 0.6574, "step": 232 }, { "epoch": 0.5066594183201957, "grad_norm": 0.07305890321731567, "learning_rate": 1.9495650751078802e-05, "loss": 0.6176, "step": 233 }, { "epoch": 0.508833922261484, "grad_norm": 0.07560906559228897, "learning_rate": 1.9491349440359014e-05, "loss": 0.6253, "step": 234 }, { "epoch": 0.5110084262027725, "grad_norm": 0.08403836190700531, "learning_rate": 1.9487030344295586e-05, "loss": 0.6925, "step": 235 }, { "epoch": 0.5131829301440609, "grad_norm": 0.08658340573310852, "learning_rate": 1.948269347098185e-05, "loss": 0.7139, "step": 236 }, { "epoch": 0.5153574340853493, "grad_norm": 0.08847357332706451, "learning_rate": 1.9478338828544437e-05, "loss": 0.6979, "step": 237 }, { "epoch": 0.5175319380266377, "grad_norm": 0.09701989591121674, "learning_rate": 1.9473966425143292e-05, "loss": 0.8799, "step": 238 }, { "epoch": 0.519706441967926, "grad_norm": 0.07688704133033752, "learning_rate": 1.9469576268971635e-05, "loss": 0.6935, "step": 239 }, { "epoch": 0.5218809459092144, "grad_norm": 0.06935624778270721, "learning_rate": 1.9465168368255946e-05, "loss": 0.6065, "step": 240 }, { "epoch": 0.5240554498505029, "grad_norm": 0.07073609530925751, "learning_rate": 1.946074273125596e-05, "loss": 0.6315, "step": 241 }, { "epoch": 0.5262299537917913, "grad_norm": 0.06792890280485153, "learning_rate": 1.945629936626465e-05, "loss": 0.5993, "step": 242 }, { "epoch": 0.5284044577330796, "grad_norm": 0.07362180203199387, "learning_rate": 1.94518382816082e-05, "loss": 0.6608, "step": 243 }, { "epoch": 0.530578961674368, "grad_norm": 0.0733044371008873, "learning_rate": 1.9447359485646002e-05, "loss": 0.7764, "step": 244 }, { "epoch": 0.5327534656156564, "grad_norm": 0.08234452456235886, "learning_rate": 1.9442862986770645e-05, "loss": 0.6601, "step": 245 }, { "epoch": 0.5349279695569448, "grad_norm": 0.07335782796144485, "learning_rate": 1.9438348793407882e-05, "loss": 0.6351, "step": 246 }, { "epoch": 0.5371024734982333, "grad_norm": 0.07529875636100769, "learning_rate": 1.943381691401662e-05, "loss": 0.6409, "step": 247 }, { "epoch": 0.5392769774395216, "grad_norm": 0.0795830562710762, "learning_rate": 1.9429267357088914e-05, "loss": 0.6389, "step": 248 }, { "epoch": 0.54145148138081, "grad_norm": 0.06863214820623398, "learning_rate": 1.942470013114994e-05, "loss": 0.6046, "step": 249 }, { "epoch": 0.5436259853220984, "grad_norm": 0.07275517284870148, "learning_rate": 1.9420115244757985e-05, "loss": 0.5959, "step": 250 }, { "epoch": 0.5458004892633868, "grad_norm": 0.08018425852060318, "learning_rate": 1.9415512706504425e-05, "loss": 0.5602, "step": 251 }, { "epoch": 0.5479749932046751, "grad_norm": 0.07520022988319397, "learning_rate": 1.9410892525013717e-05, "loss": 0.6514, "step": 252 }, { "epoch": 0.5501494971459636, "grad_norm": 0.08175225555896759, "learning_rate": 1.940625470894338e-05, "loss": 0.5378, "step": 253 }, { "epoch": 0.552324001087252, "grad_norm": 0.06719613820314407, "learning_rate": 1.9401599266983973e-05, "loss": 0.6431, "step": 254 }, { "epoch": 0.5544985050285404, "grad_norm": 0.06274879723787308, "learning_rate": 1.9396926207859085e-05, "loss": 0.5881, "step": 255 }, { "epoch": 0.5566730089698287, "grad_norm": 0.07025575637817383, "learning_rate": 1.939223554032532e-05, "loss": 0.5935, "step": 256 }, { "epoch": 0.5588475129111171, "grad_norm": 0.10003781318664551, "learning_rate": 1.938752727317227e-05, "loss": 0.693, "step": 257 }, { "epoch": 0.5610220168524055, "grad_norm": 0.06989789754152298, "learning_rate": 1.9382801415222516e-05, "loss": 0.6084, "step": 258 }, { "epoch": 0.563196520793694, "grad_norm": 0.07558146119117737, "learning_rate": 1.9378057975331594e-05, "loss": 0.607, "step": 259 }, { "epoch": 0.5653710247349824, "grad_norm": 0.06668895483016968, "learning_rate": 1.9373296962387988e-05, "loss": 0.6615, "step": 260 }, { "epoch": 0.5675455286762707, "grad_norm": 0.07800541073083878, "learning_rate": 1.9368518385313108e-05, "loss": 0.5829, "step": 261 }, { "epoch": 0.5697200326175591, "grad_norm": 0.0679517313838005, "learning_rate": 1.9363722253061287e-05, "loss": 0.5694, "step": 262 }, { "epoch": 0.5718945365588475, "grad_norm": 0.06871545314788818, "learning_rate": 1.9358908574619735e-05, "loss": 0.6039, "step": 263 }, { "epoch": 0.5740690405001359, "grad_norm": 0.06857559084892273, "learning_rate": 1.935407735900857e-05, "loss": 0.5942, "step": 264 }, { "epoch": 0.5762435444414243, "grad_norm": 0.09512191265821457, "learning_rate": 1.9349228615280736e-05, "loss": 0.7422, "step": 265 }, { "epoch": 0.5784180483827127, "grad_norm": 0.06066673621535301, "learning_rate": 1.9344362352522055e-05, "loss": 0.5302, "step": 266 }, { "epoch": 0.5805925523240011, "grad_norm": 0.07071176171302795, "learning_rate": 1.9339478579851156e-05, "loss": 0.6232, "step": 267 }, { "epoch": 0.5827670562652895, "grad_norm": 0.06289282441139221, "learning_rate": 1.933457730641948e-05, "loss": 0.5491, "step": 268 }, { "epoch": 0.5849415602065778, "grad_norm": 0.06764508783817291, "learning_rate": 1.9329658541411277e-05, "loss": 0.5523, "step": 269 }, { "epoch": 0.5871160641478663, "grad_norm": 0.06487374007701874, "learning_rate": 1.932472229404356e-05, "loss": 0.5696, "step": 270 }, { "epoch": 0.5892905680891547, "grad_norm": 0.06642784178256989, "learning_rate": 1.9319768573566104e-05, "loss": 0.6032, "step": 271 }, { "epoch": 0.5914650720304431, "grad_norm": 0.07738050818443298, "learning_rate": 1.9314797389261426e-05, "loss": 0.5678, "step": 272 }, { "epoch": 0.5936395759717314, "grad_norm": 0.07546510547399521, "learning_rate": 1.930980875044477e-05, "loss": 0.5592, "step": 273 }, { "epoch": 0.5958140799130198, "grad_norm": 0.07071225345134735, "learning_rate": 1.9304802666464084e-05, "loss": 0.6034, "step": 274 }, { "epoch": 0.5979885838543082, "grad_norm": 0.06379324942827225, "learning_rate": 1.929977914670001e-05, "loss": 0.5415, "step": 275 }, { "epoch": 0.6001630877955967, "grad_norm": 0.06884902715682983, "learning_rate": 1.9294738200565856e-05, "loss": 0.5719, "step": 276 }, { "epoch": 0.6023375917368851, "grad_norm": 0.09404662996530533, "learning_rate": 1.9289679837507593e-05, "loss": 0.6607, "step": 277 }, { "epoch": 0.6045120956781734, "grad_norm": 0.06606297194957733, "learning_rate": 1.9284604067003812e-05, "loss": 0.5734, "step": 278 }, { "epoch": 0.6066865996194618, "grad_norm": 0.06787490099668503, "learning_rate": 1.927951089856575e-05, "loss": 0.6816, "step": 279 }, { "epoch": 0.6088611035607502, "grad_norm": 0.0662330761551857, "learning_rate": 1.9274400341737214e-05, "loss": 0.5708, "step": 280 }, { "epoch": 0.6110356075020386, "grad_norm": 0.06857778877019882, "learning_rate": 1.9269272406094617e-05, "loss": 0.6085, "step": 281 }, { "epoch": 0.613210111443327, "grad_norm": 0.07370905578136444, "learning_rate": 1.926412710124693e-05, "loss": 0.5988, "step": 282 }, { "epoch": 0.6153846153846154, "grad_norm": 0.08097003400325775, "learning_rate": 1.9258964436835668e-05, "loss": 0.6186, "step": 283 }, { "epoch": 0.6175591193259038, "grad_norm": 0.07061789184808731, "learning_rate": 1.9253784422534876e-05, "loss": 0.5776, "step": 284 }, { "epoch": 0.6197336232671922, "grad_norm": 0.0812373086810112, "learning_rate": 1.924858706805112e-05, "loss": 0.5504, "step": 285 }, { "epoch": 0.6219081272084805, "grad_norm": 0.06886117160320282, "learning_rate": 1.924337238312344e-05, "loss": 0.6338, "step": 286 }, { "epoch": 0.6240826311497689, "grad_norm": 0.08107959479093552, "learning_rate": 1.9238140377523375e-05, "loss": 0.6403, "step": 287 }, { "epoch": 0.6262571350910574, "grad_norm": 0.09636881202459335, "learning_rate": 1.9232891061054896e-05, "loss": 0.5775, "step": 288 }, { "epoch": 0.6284316390323458, "grad_norm": 0.07794267684221268, "learning_rate": 1.9227624443554425e-05, "loss": 0.5457, "step": 289 }, { "epoch": 0.6306061429736342, "grad_norm": 0.06867080181837082, "learning_rate": 1.9222340534890803e-05, "loss": 0.5451, "step": 290 }, { "epoch": 0.6327806469149225, "grad_norm": 0.07095597684383392, "learning_rate": 1.921703934496527e-05, "loss": 0.6557, "step": 291 }, { "epoch": 0.6349551508562109, "grad_norm": 0.13286642730236053, "learning_rate": 1.921172088371145e-05, "loss": 0.5669, "step": 292 }, { "epoch": 0.6371296547974993, "grad_norm": 0.07619698345661163, "learning_rate": 1.9206385161095328e-05, "loss": 0.5706, "step": 293 }, { "epoch": 0.6393041587387878, "grad_norm": 0.06443142890930176, "learning_rate": 1.9201032187115235e-05, "loss": 0.517, "step": 294 }, { "epoch": 0.6414786626800761, "grad_norm": 0.08062798529863358, "learning_rate": 1.9195661971801825e-05, "loss": 0.598, "step": 295 }, { "epoch": 0.6436531666213645, "grad_norm": 0.07245161384344101, "learning_rate": 1.9190274525218077e-05, "loss": 0.5647, "step": 296 }, { "epoch": 0.6458276705626529, "grad_norm": 0.08857771754264832, "learning_rate": 1.9184869857459233e-05, "loss": 0.6477, "step": 297 }, { "epoch": 0.6480021745039413, "grad_norm": 0.0621081106364727, "learning_rate": 1.917944797865282e-05, "loss": 0.5599, "step": 298 }, { "epoch": 0.6501766784452296, "grad_norm": 0.07608568668365479, "learning_rate": 1.917400889895862e-05, "loss": 0.6214, "step": 299 }, { "epoch": 0.6523511823865181, "grad_norm": 0.06423980742692947, "learning_rate": 1.9168552628568632e-05, "loss": 0.5302, "step": 300 }, { "epoch": 0.6545256863278065, "grad_norm": 0.06609907746315002, "learning_rate": 1.9163079177707077e-05, "loss": 0.6779, "step": 301 }, { "epoch": 0.6567001902690949, "grad_norm": 0.07207638025283813, "learning_rate": 1.9157588556630373e-05, "loss": 0.5176, "step": 302 }, { "epoch": 0.6588746942103832, "grad_norm": 0.08203461766242981, "learning_rate": 1.9152080775627105e-05, "loss": 0.5886, "step": 303 }, { "epoch": 0.6610491981516716, "grad_norm": 0.07297263294458389, "learning_rate": 1.914655584501802e-05, "loss": 0.5217, "step": 304 }, { "epoch": 0.66322370209296, "grad_norm": 0.06291449069976807, "learning_rate": 1.914101377515599e-05, "loss": 0.563, "step": 305 }, { "epoch": 0.6653982060342485, "grad_norm": 0.0708426907658577, "learning_rate": 1.913545457642601e-05, "loss": 0.5735, "step": 306 }, { "epoch": 0.6675727099755369, "grad_norm": 0.06969770789146423, "learning_rate": 1.9129878259245182e-05, "loss": 0.5107, "step": 307 }, { "epoch": 0.6697472139168252, "grad_norm": 0.09216295182704926, "learning_rate": 1.912428483406266e-05, "loss": 0.5896, "step": 308 }, { "epoch": 0.6719217178581136, "grad_norm": 0.08255861699581146, "learning_rate": 1.9118674311359685e-05, "loss": 0.5625, "step": 309 }, { "epoch": 0.674096221799402, "grad_norm": 0.07867822051048279, "learning_rate": 1.9113046701649517e-05, "loss": 0.5858, "step": 310 }, { "epoch": 0.6762707257406904, "grad_norm": 0.06747031211853027, "learning_rate": 1.9107402015477435e-05, "loss": 0.5443, "step": 311 }, { "epoch": 0.6784452296819788, "grad_norm": 0.07020735740661621, "learning_rate": 1.910174026342073e-05, "loss": 0.593, "step": 312 }, { "epoch": 0.6806197336232672, "grad_norm": 0.07389922440052032, "learning_rate": 1.909606145608866e-05, "loss": 0.7238, "step": 313 }, { "epoch": 0.6827942375645556, "grad_norm": 0.07128795236349106, "learning_rate": 1.909036560412244e-05, "loss": 0.6418, "step": 314 }, { "epoch": 0.684968741505844, "grad_norm": 0.0664687380194664, "learning_rate": 1.9084652718195237e-05, "loss": 0.6256, "step": 315 }, { "epoch": 0.6871432454471323, "grad_norm": 0.06428362429141998, "learning_rate": 1.9078922809012127e-05, "loss": 0.5522, "step": 316 }, { "epoch": 0.6893177493884207, "grad_norm": 0.07462253421545029, "learning_rate": 1.9073175887310086e-05, "loss": 0.5827, "step": 317 }, { "epoch": 0.6914922533297092, "grad_norm": 0.07591954618692398, "learning_rate": 1.9067411963857967e-05, "loss": 0.5387, "step": 318 }, { "epoch": 0.6936667572709976, "grad_norm": 0.07778114080429077, "learning_rate": 1.906163104945649e-05, "loss": 0.6467, "step": 319 }, { "epoch": 0.695841261212286, "grad_norm": 0.07467600703239441, "learning_rate": 1.9055833154938208e-05, "loss": 0.5907, "step": 320 }, { "epoch": 0.6980157651535743, "grad_norm": 0.07313650101423264, "learning_rate": 1.9050018291167492e-05, "loss": 0.5151, "step": 321 }, { "epoch": 0.7001902690948627, "grad_norm": 0.08900684118270874, "learning_rate": 1.9044186469040508e-05, "loss": 0.5239, "step": 322 }, { "epoch": 0.7023647730361511, "grad_norm": 0.07227291911840439, "learning_rate": 1.9038337699485207e-05, "loss": 0.5727, "step": 323 }, { "epoch": 0.7045392769774396, "grad_norm": 0.06213615462183952, "learning_rate": 1.903247199346129e-05, "loss": 0.5795, "step": 324 }, { "epoch": 0.7067137809187279, "grad_norm": 0.06224863976240158, "learning_rate": 1.90265893619602e-05, "loss": 0.5111, "step": 325 }, { "epoch": 0.7088882848600163, "grad_norm": 0.07143815606832504, "learning_rate": 1.9020689816005086e-05, "loss": 0.591, "step": 326 }, { "epoch": 0.7110627888013047, "grad_norm": 0.06437085568904877, "learning_rate": 1.9014773366650807e-05, "loss": 0.5378, "step": 327 }, { "epoch": 0.7132372927425931, "grad_norm": 0.07230640947818756, "learning_rate": 1.9008840024983883e-05, "loss": 0.6771, "step": 328 }, { "epoch": 0.7154117966838814, "grad_norm": 0.0662621557712555, "learning_rate": 1.9002889802122495e-05, "loss": 0.5679, "step": 329 }, { "epoch": 0.7175863006251699, "grad_norm": 0.06512124836444855, "learning_rate": 1.8996922709216456e-05, "loss": 0.597, "step": 330 }, { "epoch": 0.7197608045664583, "grad_norm": 0.06657914072275162, "learning_rate": 1.8990938757447184e-05, "loss": 0.581, "step": 331 }, { "epoch": 0.7219353085077467, "grad_norm": 0.09024710953235626, "learning_rate": 1.8984937958027697e-05, "loss": 0.5681, "step": 332 }, { "epoch": 0.724109812449035, "grad_norm": 0.06663142144680023, "learning_rate": 1.8978920322202582e-05, "loss": 0.5626, "step": 333 }, { "epoch": 0.7262843163903234, "grad_norm": 0.07135281711816788, "learning_rate": 1.897288586124797e-05, "loss": 0.5357, "step": 334 }, { "epoch": 0.7284588203316118, "grad_norm": 0.090835802257061, "learning_rate": 1.8966834586471517e-05, "loss": 0.7526, "step": 335 }, { "epoch": 0.7306333242729003, "grad_norm": 0.06867898255586624, "learning_rate": 1.89607665092124e-05, "loss": 0.6475, "step": 336 }, { "epoch": 0.7328078282141887, "grad_norm": 0.0679989755153656, "learning_rate": 1.8954681640841267e-05, "loss": 0.5182, "step": 337 }, { "epoch": 0.734982332155477, "grad_norm": 0.07487937062978745, "learning_rate": 1.894857999276023e-05, "loss": 0.5967, "step": 338 }, { "epoch": 0.7371568360967654, "grad_norm": 0.05853964388370514, "learning_rate": 1.8942461576402858e-05, "loss": 0.51, "step": 339 }, { "epoch": 0.7393313400380538, "grad_norm": 0.06330685317516327, "learning_rate": 1.8936326403234125e-05, "loss": 0.6164, "step": 340 }, { "epoch": 0.7415058439793422, "grad_norm": 0.07178864628076553, "learning_rate": 1.8930174484750413e-05, "loss": 0.5703, "step": 341 }, { "epoch": 0.7436803479206306, "grad_norm": 0.06133674085140228, "learning_rate": 1.892400583247948e-05, "loss": 0.4839, "step": 342 }, { "epoch": 0.745854851861919, "grad_norm": 0.0777459591627121, "learning_rate": 1.8917820457980444e-05, "loss": 0.633, "step": 343 }, { "epoch": 0.7480293558032074, "grad_norm": 0.0892263725399971, "learning_rate": 1.891161837284375e-05, "loss": 0.9611, "step": 344 }, { "epoch": 0.7502038597444958, "grad_norm": 0.07068607956171036, "learning_rate": 1.8905399588691165e-05, "loss": 0.608, "step": 345 }, { "epoch": 0.7523783636857841, "grad_norm": 0.06626912951469421, "learning_rate": 1.8899164117175745e-05, "loss": 0.5571, "step": 346 }, { "epoch": 0.7545528676270725, "grad_norm": 0.06587188690900803, "learning_rate": 1.8892911969981806e-05, "loss": 0.5783, "step": 347 }, { "epoch": 0.756727371568361, "grad_norm": 0.06823555380105972, "learning_rate": 1.888664315882493e-05, "loss": 0.5153, "step": 348 }, { "epoch": 0.7589018755096494, "grad_norm": 0.06443888694047928, "learning_rate": 1.8880357695451907e-05, "loss": 0.5483, "step": 349 }, { "epoch": 0.7610763794509378, "grad_norm": 0.07560069859027863, "learning_rate": 1.8874055591640746e-05, "loss": 0.6471, "step": 350 }, { "epoch": 0.7632508833922261, "grad_norm": 0.06059009209275246, "learning_rate": 1.886773685920062e-05, "loss": 0.5108, "step": 351 }, { "epoch": 0.7654253873335145, "grad_norm": 0.08049315959215164, "learning_rate": 1.8861401509971878e-05, "loss": 0.6476, "step": 352 }, { "epoch": 0.7675998912748029, "grad_norm": 0.06180081516504288, "learning_rate": 1.8855049555826004e-05, "loss": 0.5459, "step": 353 }, { "epoch": 0.7697743952160914, "grad_norm": 0.06988122314214706, "learning_rate": 1.884868100866558e-05, "loss": 0.5481, "step": 354 }, { "epoch": 0.7719488991573797, "grad_norm": 0.06675580143928528, "learning_rate": 1.8842295880424305e-05, "loss": 0.5894, "step": 355 }, { "epoch": 0.7741234030986681, "grad_norm": 0.07367975264787674, "learning_rate": 1.8835894183066934e-05, "loss": 0.6505, "step": 356 }, { "epoch": 0.7762979070399565, "grad_norm": 0.0673639103770256, "learning_rate": 1.8829475928589272e-05, "loss": 0.5328, "step": 357 }, { "epoch": 0.7784724109812449, "grad_norm": 0.07420552521944046, "learning_rate": 1.882304112901815e-05, "loss": 0.5852, "step": 358 }, { "epoch": 0.7806469149225334, "grad_norm": 0.08115178346633911, "learning_rate": 1.88165897964114e-05, "loss": 0.6943, "step": 359 }, { "epoch": 0.7828214188638217, "grad_norm": 0.061986930668354034, "learning_rate": 1.8810121942857848e-05, "loss": 0.5001, "step": 360 }, { "epoch": 0.7849959228051101, "grad_norm": 0.060133032500743866, "learning_rate": 1.8803637580477254e-05, "loss": 0.541, "step": 361 }, { "epoch": 0.7871704267463985, "grad_norm": 0.06773900985717773, "learning_rate": 1.879713672142033e-05, "loss": 0.6, "step": 362 }, { "epoch": 0.7893449306876869, "grad_norm": 0.06743879616260529, "learning_rate": 1.87906193778687e-05, "loss": 0.5844, "step": 363 }, { "epoch": 0.7915194346289752, "grad_norm": 0.07073816657066345, "learning_rate": 1.878408556203487e-05, "loss": 0.6186, "step": 364 }, { "epoch": 0.7936939385702637, "grad_norm": 0.069249726831913, "learning_rate": 1.8777535286162217e-05, "loss": 0.5677, "step": 365 }, { "epoch": 0.7958684425115521, "grad_norm": 0.07309424877166748, "learning_rate": 1.877096856252496e-05, "loss": 0.6781, "step": 366 }, { "epoch": 0.7980429464528405, "grad_norm": 0.06280476599931717, "learning_rate": 1.8764385403428137e-05, "loss": 0.5624, "step": 367 }, { "epoch": 0.8002174503941288, "grad_norm": 0.06109658256173134, "learning_rate": 1.875778582120759e-05, "loss": 0.5338, "step": 368 }, { "epoch": 0.8023919543354172, "grad_norm": 0.05737539008259773, "learning_rate": 1.8751169828229927e-05, "loss": 0.471, "step": 369 }, { "epoch": 0.8045664582767056, "grad_norm": 0.06516994535923004, "learning_rate": 1.8744537436892517e-05, "loss": 0.5304, "step": 370 }, { "epoch": 0.8067409622179941, "grad_norm": 0.07488464564085007, "learning_rate": 1.8737888659623444e-05, "loss": 0.6031, "step": 371 }, { "epoch": 0.8089154661592824, "grad_norm": 0.07439956814050674, "learning_rate": 1.8731223508881512e-05, "loss": 0.6722, "step": 372 }, { "epoch": 0.8110899701005708, "grad_norm": 0.06686954945325851, "learning_rate": 1.8724541997156194e-05, "loss": 0.5272, "step": 373 }, { "epoch": 0.8132644740418592, "grad_norm": 0.0563318096101284, "learning_rate": 1.8717844136967626e-05, "loss": 0.4735, "step": 374 }, { "epoch": 0.8154389779831476, "grad_norm": 0.06146978959441185, "learning_rate": 1.8711129940866577e-05, "loss": 0.5446, "step": 375 }, { "epoch": 0.817613481924436, "grad_norm": 0.0725124180316925, "learning_rate": 1.870439942143443e-05, "loss": 0.5773, "step": 376 }, { "epoch": 0.8197879858657244, "grad_norm": 0.06086898222565651, "learning_rate": 1.8697652591283157e-05, "loss": 0.5268, "step": 377 }, { "epoch": 0.8219624898070128, "grad_norm": 0.0689455047249794, "learning_rate": 1.8690889463055285e-05, "loss": 0.5362, "step": 378 }, { "epoch": 0.8241369937483012, "grad_norm": 0.07236643135547638, "learning_rate": 1.8684110049423888e-05, "loss": 0.6042, "step": 379 }, { "epoch": 0.8263114976895896, "grad_norm": 0.07310078293085098, "learning_rate": 1.8677314363092555e-05, "loss": 0.5768, "step": 380 }, { "epoch": 0.8284860016308779, "grad_norm": 0.06427124887704849, "learning_rate": 1.8670502416795368e-05, "loss": 0.5493, "step": 381 }, { "epoch": 0.8306605055721663, "grad_norm": 0.06283894181251526, "learning_rate": 1.8663674223296876e-05, "loss": 0.5434, "step": 382 }, { "epoch": 0.8328350095134548, "grad_norm": 0.07058608531951904, "learning_rate": 1.8656829795392076e-05, "loss": 0.6304, "step": 383 }, { "epoch": 0.8350095134547432, "grad_norm": 0.07025455683469772, "learning_rate": 1.864996914590638e-05, "loss": 0.7133, "step": 384 }, { "epoch": 0.8371840173960315, "grad_norm": 0.07467500120401382, "learning_rate": 1.8643092287695604e-05, "loss": 0.6442, "step": 385 }, { "epoch": 0.8393585213373199, "grad_norm": 0.10256560891866684, "learning_rate": 1.8636199233645934e-05, "loss": 0.5251, "step": 386 }, { "epoch": 0.8415330252786083, "grad_norm": 0.159496009349823, "learning_rate": 1.86292899966739e-05, "loss": 0.58, "step": 387 }, { "epoch": 0.8437075292198967, "grad_norm": 0.07732810825109482, "learning_rate": 1.862236458972636e-05, "loss": 0.5351, "step": 388 }, { "epoch": 0.8458820331611852, "grad_norm": 0.06530837714672089, "learning_rate": 1.8615423025780475e-05, "loss": 0.6311, "step": 389 }, { "epoch": 0.8480565371024735, "grad_norm": 0.06626260280609131, "learning_rate": 1.860846531784368e-05, "loss": 0.4523, "step": 390 }, { "epoch": 0.8502310410437619, "grad_norm": 0.06009805202484131, "learning_rate": 1.860149147895366e-05, "loss": 0.5353, "step": 391 }, { "epoch": 0.8524055449850503, "grad_norm": 0.07053548097610474, "learning_rate": 1.8594501522178318e-05, "loss": 0.5609, "step": 392 }, { "epoch": 0.8545800489263387, "grad_norm": 0.057913295924663544, "learning_rate": 1.858749546061578e-05, "loss": 0.4931, "step": 393 }, { "epoch": 0.856754552867627, "grad_norm": 0.07603286951780319, "learning_rate": 1.8580473307394334e-05, "loss": 0.5451, "step": 394 }, { "epoch": 0.8589290568089155, "grad_norm": 0.07145309448242188, "learning_rate": 1.8573435075672422e-05, "loss": 0.5685, "step": 395 }, { "epoch": 0.8611035607502039, "grad_norm": 0.06997530907392502, "learning_rate": 1.856638077863863e-05, "loss": 0.5885, "step": 396 }, { "epoch": 0.8632780646914923, "grad_norm": 0.06684005260467529, "learning_rate": 1.8559310429511625e-05, "loss": 0.7244, "step": 397 }, { "epoch": 0.8654525686327806, "grad_norm": 0.05901939049363136, "learning_rate": 1.855222404154017e-05, "loss": 0.5337, "step": 398 }, { "epoch": 0.867627072574069, "grad_norm": 0.06867847591638565, "learning_rate": 1.854512162800308e-05, "loss": 0.5699, "step": 399 }, { "epoch": 0.8698015765153574, "grad_norm": 0.07950898259878159, "learning_rate": 1.8538003202209186e-05, "loss": 0.5877, "step": 400 }, { "epoch": 0.8719760804566459, "grad_norm": 0.07120728492736816, "learning_rate": 1.8530868777497348e-05, "loss": 0.5801, "step": 401 }, { "epoch": 0.8741505843979342, "grad_norm": 0.0909164622426033, "learning_rate": 1.852371836723638e-05, "loss": 0.7314, "step": 402 }, { "epoch": 0.8763250883392226, "grad_norm": 0.33725911378860474, "learning_rate": 1.8516551984825063e-05, "loss": 0.5538, "step": 403 }, { "epoch": 0.878499592280511, "grad_norm": 0.0694350004196167, "learning_rate": 1.8509369643692115e-05, "loss": 0.5751, "step": 404 }, { "epoch": 0.8806740962217994, "grad_norm": 0.06652981787919998, "learning_rate": 1.8502171357296144e-05, "loss": 0.4818, "step": 405 }, { "epoch": 0.8828486001630877, "grad_norm": 0.06328922510147095, "learning_rate": 1.849495713912564e-05, "loss": 0.6393, "step": 406 }, { "epoch": 0.8850231041043762, "grad_norm": 0.06877338141202927, "learning_rate": 1.8487727002698955e-05, "loss": 0.5647, "step": 407 }, { "epoch": 0.8871976080456646, "grad_norm": 0.07963277399539948, "learning_rate": 1.848048096156426e-05, "loss": 0.5504, "step": 408 }, { "epoch": 0.889372111986953, "grad_norm": 0.061071597039699554, "learning_rate": 1.8473219029299537e-05, "loss": 0.5341, "step": 409 }, { "epoch": 0.8915466159282414, "grad_norm": 0.058434270322322845, "learning_rate": 1.8465941219512533e-05, "loss": 0.4683, "step": 410 }, { "epoch": 0.8937211198695297, "grad_norm": 0.07438481599092484, "learning_rate": 1.8458647545840766e-05, "loss": 0.6219, "step": 411 }, { "epoch": 0.8958956238108181, "grad_norm": 0.07952351868152618, "learning_rate": 1.845133802195146e-05, "loss": 0.5661, "step": 412 }, { "epoch": 0.8980701277521066, "grad_norm": 0.07641007006168365, "learning_rate": 1.8444012661541556e-05, "loss": 0.6416, "step": 413 }, { "epoch": 0.900244631693395, "grad_norm": 0.06343628466129303, "learning_rate": 1.8436671478337666e-05, "loss": 0.5531, "step": 414 }, { "epoch": 0.9024191356346833, "grad_norm": 0.07128674536943436, "learning_rate": 1.8429314486096042e-05, "loss": 0.5431, "step": 415 }, { "epoch": 0.9045936395759717, "grad_norm": 0.06979305297136307, "learning_rate": 1.8421941698602572e-05, "loss": 0.6062, "step": 416 }, { "epoch": 0.9067681435172601, "grad_norm": 0.07235769182443619, "learning_rate": 1.8414553129672734e-05, "loss": 0.5756, "step": 417 }, { "epoch": 0.9089426474585485, "grad_norm": 0.08389944583177567, "learning_rate": 1.8407148793151586e-05, "loss": 0.5136, "step": 418 }, { "epoch": 0.911117151399837, "grad_norm": 0.07698196172714233, "learning_rate": 1.8399728702913723e-05, "loss": 0.6323, "step": 419 }, { "epoch": 0.9132916553411253, "grad_norm": 0.07112561911344528, "learning_rate": 1.839229287286327e-05, "loss": 0.6917, "step": 420 }, { "epoch": 0.9154661592824137, "grad_norm": 0.08736804127693176, "learning_rate": 1.8384841316933833e-05, "loss": 0.5869, "step": 421 }, { "epoch": 0.9176406632237021, "grad_norm": 0.06583776324987411, "learning_rate": 1.8377374049088494e-05, "loss": 0.5732, "step": 422 }, { "epoch": 0.9198151671649905, "grad_norm": 0.06712512671947479, "learning_rate": 1.836989108331978e-05, "loss": 0.5705, "step": 423 }, { "epoch": 0.9219896711062788, "grad_norm": 0.06708601117134094, "learning_rate": 1.836239243364963e-05, "loss": 0.4941, "step": 424 }, { "epoch": 0.9241641750475673, "grad_norm": 0.07236512750387192, "learning_rate": 1.8354878114129368e-05, "loss": 0.5788, "step": 425 }, { "epoch": 0.9263386789888557, "grad_norm": 0.06179850921034813, "learning_rate": 1.8347348138839685e-05, "loss": 0.5098, "step": 426 }, { "epoch": 0.9285131829301441, "grad_norm": 0.07724650204181671, "learning_rate": 1.8339802521890608e-05, "loss": 0.688, "step": 427 }, { "epoch": 0.9306876868714324, "grad_norm": 0.08667793869972229, "learning_rate": 1.8332241277421477e-05, "loss": 0.654, "step": 428 }, { "epoch": 0.9328621908127208, "grad_norm": 0.06334693729877472, "learning_rate": 1.832466441960091e-05, "loss": 0.5114, "step": 429 }, { "epoch": 0.9350366947540092, "grad_norm": 0.06075149029493332, "learning_rate": 1.831707196262679e-05, "loss": 0.529, "step": 430 }, { "epoch": 0.9372111986952977, "grad_norm": 0.08355309069156647, "learning_rate": 1.8309463920726217e-05, "loss": 0.7672, "step": 431 }, { "epoch": 0.939385702636586, "grad_norm": 0.05992349237203598, "learning_rate": 1.8301840308155507e-05, "loss": 0.55, "step": 432 }, { "epoch": 0.9415602065778744, "grad_norm": 0.06330110132694244, "learning_rate": 1.8294201139200152e-05, "loss": 0.5638, "step": 433 }, { "epoch": 0.9437347105191628, "grad_norm": 0.07264009863138199, "learning_rate": 1.8286546428174787e-05, "loss": 0.5591, "step": 434 }, { "epoch": 0.9459092144604512, "grad_norm": 0.06289627403020859, "learning_rate": 1.827887618942318e-05, "loss": 0.5666, "step": 435 }, { "epoch": 0.9480837184017396, "grad_norm": 0.06342589855194092, "learning_rate": 1.8271190437318186e-05, "loss": 0.5824, "step": 436 }, { "epoch": 0.950258222343028, "grad_norm": 0.06801969558000565, "learning_rate": 1.8263489186261734e-05, "loss": 0.5752, "step": 437 }, { "epoch": 0.9524327262843164, "grad_norm": 0.06664588302373886, "learning_rate": 1.82557724506848e-05, "loss": 0.646, "step": 438 }, { "epoch": 0.9546072302256048, "grad_norm": 0.06647296994924545, "learning_rate": 1.8248040245047365e-05, "loss": 0.5425, "step": 439 }, { "epoch": 0.9567817341668932, "grad_norm": 0.06871071457862854, "learning_rate": 1.824029258383841e-05, "loss": 0.4695, "step": 440 }, { "epoch": 0.9589562381081815, "grad_norm": 0.06482117623090744, "learning_rate": 1.8232529481575874e-05, "loss": 0.4798, "step": 441 }, { "epoch": 0.9611307420494699, "grad_norm": 0.06982200592756271, "learning_rate": 1.8224750952806626e-05, "loss": 0.7468, "step": 442 }, { "epoch": 0.9633052459907584, "grad_norm": 0.07166951149702072, "learning_rate": 1.821695701210644e-05, "loss": 0.6126, "step": 443 }, { "epoch": 0.9654797499320468, "grad_norm": 0.06602595001459122, "learning_rate": 1.8209147674079983e-05, "loss": 0.5982, "step": 444 }, { "epoch": 0.9676542538733351, "grad_norm": 0.07407505810260773, "learning_rate": 1.8201322953360758e-05, "loss": 0.5907, "step": 445 }, { "epoch": 0.9698287578146235, "grad_norm": 0.07786395400762558, "learning_rate": 1.8193482864611102e-05, "loss": 0.6097, "step": 446 }, { "epoch": 0.9720032617559119, "grad_norm": 0.06929359585046768, "learning_rate": 1.818562742252215e-05, "loss": 0.6151, "step": 447 }, { "epoch": 0.9741777656972003, "grad_norm": 0.07777715474367142, "learning_rate": 1.8177756641813805e-05, "loss": 0.5832, "step": 448 }, { "epoch": 0.9763522696384888, "grad_norm": 0.06341813504695892, "learning_rate": 1.8169870537234707e-05, "loss": 0.4798, "step": 449 }, { "epoch": 0.9785267735797771, "grad_norm": 0.06264607608318329, "learning_rate": 1.816196912356222e-05, "loss": 0.5927, "step": 450 }, { "epoch": 0.9807012775210655, "grad_norm": 0.08713778108358383, "learning_rate": 1.8154052415602382e-05, "loss": 0.5245, "step": 451 }, { "epoch": 0.9828757814623539, "grad_norm": 0.06647881120443344, "learning_rate": 1.814612042818991e-05, "loss": 0.7168, "step": 452 }, { "epoch": 0.9850502854036423, "grad_norm": 0.05877126008272171, "learning_rate": 1.8138173176188133e-05, "loss": 0.4786, "step": 453 }, { "epoch": 0.9872247893449307, "grad_norm": 0.07822337746620178, "learning_rate": 1.8130210674488994e-05, "loss": 0.6509, "step": 454 }, { "epoch": 0.9893992932862191, "grad_norm": 0.060204919427633286, "learning_rate": 1.812223293801301e-05, "loss": 0.548, "step": 455 }, { "epoch": 0.9915737972275075, "grad_norm": 0.1021980419754982, "learning_rate": 1.8114239981709234e-05, "loss": 0.5798, "step": 456 }, { "epoch": 0.9937483011687959, "grad_norm": 0.06757181882858276, "learning_rate": 1.810623182055526e-05, "loss": 0.5694, "step": 457 }, { "epoch": 0.9959228051100842, "grad_norm": 0.0683899000287056, "learning_rate": 1.809820846955716e-05, "loss": 0.5613, "step": 458 }, { "epoch": 0.9980973090513726, "grad_norm": 0.06746495515108109, "learning_rate": 1.8090169943749477e-05, "loss": 0.5854, "step": 459 }, { "epoch": 1.0, "grad_norm": 0.06785427033901215, "learning_rate": 1.8082116258195173e-05, "loss": 0.5628, "step": 460 }, { "epoch": 1.0, "eval_loss": 0.5784299969673157, "eval_runtime": 12.575, "eval_samples_per_second": 5.885, "eval_steps_per_second": 5.885, "step": 460 }, { "epoch": 1.0021745039412884, "grad_norm": 0.0628708228468895, "learning_rate": 1.807404742798564e-05, "loss": 0.5059, "step": 461 }, { "epoch": 1.0043490078825767, "grad_norm": 0.07247425615787506, "learning_rate": 1.8065963468240625e-05, "loss": 0.6026, "step": 462 }, { "epoch": 1.0065235118238651, "grad_norm": 0.07581452280282974, "learning_rate": 1.805786439410825e-05, "loss": 0.4863, "step": 463 }, { "epoch": 1.0086980157651535, "grad_norm": 0.0607333704829216, "learning_rate": 1.804975022076494e-05, "loss": 0.5123, "step": 464 }, { "epoch": 1.0108725197064419, "grad_norm": 0.06368887424468994, "learning_rate": 1.8041620963415418e-05, "loss": 0.4928, "step": 465 }, { "epoch": 1.0130470236477305, "grad_norm": 0.0645102709531784, "learning_rate": 1.8033476637292682e-05, "loss": 0.4943, "step": 466 }, { "epoch": 1.0152215275890188, "grad_norm": 0.06531580537557602, "learning_rate": 1.8025317257657954e-05, "loss": 0.5833, "step": 467 }, { "epoch": 1.0173960315303072, "grad_norm": 0.05975591763854027, "learning_rate": 1.8017142839800667e-05, "loss": 0.5005, "step": 468 }, { "epoch": 1.0195705354715956, "grad_norm": 0.06923265010118484, "learning_rate": 1.8008953399038442e-05, "loss": 0.5695, "step": 469 }, { "epoch": 1.021745039412884, "grad_norm": 0.07725095003843307, "learning_rate": 1.800074895071704e-05, "loss": 0.5641, "step": 470 }, { "epoch": 1.0239195433541723, "grad_norm": 0.08455652743577957, "learning_rate": 1.7992529510210347e-05, "loss": 0.651, "step": 471 }, { "epoch": 1.0260940472954607, "grad_norm": 0.06396053731441498, "learning_rate": 1.7984295092920344e-05, "loss": 0.602, "step": 472 }, { "epoch": 1.028268551236749, "grad_norm": 0.07036537677049637, "learning_rate": 1.797604571427708e-05, "loss": 0.5967, "step": 473 }, { "epoch": 1.0304430551780375, "grad_norm": 0.07938440144062042, "learning_rate": 1.7967781389738625e-05, "loss": 0.6325, "step": 474 }, { "epoch": 1.0326175591193258, "grad_norm": 0.0671621710062027, "learning_rate": 1.795950213479107e-05, "loss": 0.4776, "step": 475 }, { "epoch": 1.0347920630606142, "grad_norm": 0.06855512410402298, "learning_rate": 1.795120796494848e-05, "loss": 0.5465, "step": 476 }, { "epoch": 1.0369665670019026, "grad_norm": 0.08335117250680923, "learning_rate": 1.794289889575286e-05, "loss": 0.6032, "step": 477 }, { "epoch": 1.0391410709431912, "grad_norm": 0.06471085548400879, "learning_rate": 1.793457494277415e-05, "loss": 0.5959, "step": 478 }, { "epoch": 1.0413155748844796, "grad_norm": 0.08365058898925781, "learning_rate": 1.7926236121610163e-05, "loss": 0.6577, "step": 479 }, { "epoch": 1.043490078825768, "grad_norm": 0.07287590950727463, "learning_rate": 1.7917882447886585e-05, "loss": 0.6224, "step": 480 }, { "epoch": 1.0456645827670563, "grad_norm": 0.06598275899887085, "learning_rate": 1.790951393725692e-05, "loss": 0.5404, "step": 481 }, { "epoch": 1.0478390867083447, "grad_norm": 0.06515929102897644, "learning_rate": 1.7901130605402494e-05, "loss": 0.5848, "step": 482 }, { "epoch": 1.050013590649633, "grad_norm": 0.0671316385269165, "learning_rate": 1.7892732468032385e-05, "loss": 0.5342, "step": 483 }, { "epoch": 1.0521880945909214, "grad_norm": 0.06343325227499008, "learning_rate": 1.788431954088343e-05, "loss": 0.5121, "step": 484 }, { "epoch": 1.0543625985322098, "grad_norm": 0.07387620955705643, "learning_rate": 1.787589183972017e-05, "loss": 0.5224, "step": 485 }, { "epoch": 1.0565371024734982, "grad_norm": 0.0674918070435524, "learning_rate": 1.7867449380334834e-05, "loss": 0.5336, "step": 486 }, { "epoch": 1.0587116064147866, "grad_norm": 0.06690666079521179, "learning_rate": 1.7858992178547306e-05, "loss": 0.4926, "step": 487 }, { "epoch": 1.060886110356075, "grad_norm": 0.06445205211639404, "learning_rate": 1.7850520250205095e-05, "loss": 0.552, "step": 488 }, { "epoch": 1.0630606142973633, "grad_norm": 0.07294350117444992, "learning_rate": 1.7842033611183308e-05, "loss": 0.5913, "step": 489 }, { "epoch": 1.0652351182386517, "grad_norm": 0.0654023140668869, "learning_rate": 1.7833532277384607e-05, "loss": 0.5191, "step": 490 }, { "epoch": 1.0674096221799403, "grad_norm": 0.06359139084815979, "learning_rate": 1.7825016264739202e-05, "loss": 0.561, "step": 491 }, { "epoch": 1.0695841261212287, "grad_norm": 0.06811202317476273, "learning_rate": 1.7816485589204802e-05, "loss": 0.5766, "step": 492 }, { "epoch": 1.071758630062517, "grad_norm": 0.06508602201938629, "learning_rate": 1.7807940266766595e-05, "loss": 0.5371, "step": 493 }, { "epoch": 1.0739331340038054, "grad_norm": 0.0661052018404007, "learning_rate": 1.779938031343721e-05, "loss": 0.5201, "step": 494 }, { "epoch": 1.0761076379450938, "grad_norm": 0.07383374869823456, "learning_rate": 1.7790805745256703e-05, "loss": 0.5465, "step": 495 }, { "epoch": 1.0782821418863822, "grad_norm": 0.07273479551076889, "learning_rate": 1.778221657829251e-05, "loss": 0.5911, "step": 496 }, { "epoch": 1.0804566458276705, "grad_norm": 0.07575912773609161, "learning_rate": 1.7773612828639415e-05, "loss": 0.6007, "step": 497 }, { "epoch": 1.082631149768959, "grad_norm": 0.07243568450212479, "learning_rate": 1.7764994512419534e-05, "loss": 0.5026, "step": 498 }, { "epoch": 1.0848056537102473, "grad_norm": 0.05891551077365875, "learning_rate": 1.775636164578229e-05, "loss": 0.5537, "step": 499 }, { "epoch": 1.0869801576515357, "grad_norm": 0.0614311546087265, "learning_rate": 1.7747714244904348e-05, "loss": 0.5652, "step": 500 }, { "epoch": 1.089154661592824, "grad_norm": 0.0637718215584755, "learning_rate": 1.7739052325989628e-05, "loss": 0.4913, "step": 501 }, { "epoch": 1.0913291655341126, "grad_norm": 0.07575857639312744, "learning_rate": 1.7730375905269248e-05, "loss": 0.5657, "step": 502 }, { "epoch": 1.093503669475401, "grad_norm": 0.07907545566558838, "learning_rate": 1.7721684999001496e-05, "loss": 0.5461, "step": 503 }, { "epoch": 1.0956781734166894, "grad_norm": 0.0687398612499237, "learning_rate": 1.771297962347181e-05, "loss": 0.5589, "step": 504 }, { "epoch": 1.0978526773579778, "grad_norm": 0.08037672936916351, "learning_rate": 1.7704259794992734e-05, "loss": 0.5777, "step": 505 }, { "epoch": 1.1000271812992661, "grad_norm": 0.06579285860061646, "learning_rate": 1.7695525529903895e-05, "loss": 0.5578, "step": 506 }, { "epoch": 1.1022016852405545, "grad_norm": 0.07901425659656525, "learning_rate": 1.768677684457199e-05, "loss": 0.7764, "step": 507 }, { "epoch": 1.1043761891818429, "grad_norm": 0.08795467764139175, "learning_rate": 1.767801375539071e-05, "loss": 0.5182, "step": 508 }, { "epoch": 1.1065506931231313, "grad_norm": 0.07071004807949066, "learning_rate": 1.7669236278780758e-05, "loss": 0.7709, "step": 509 }, { "epoch": 1.1087251970644196, "grad_norm": 0.05952536687254906, "learning_rate": 1.766044443118978e-05, "loss": 0.4741, "step": 510 }, { "epoch": 1.110899701005708, "grad_norm": 0.07513156533241272, "learning_rate": 1.7651638229092373e-05, "loss": 0.628, "step": 511 }, { "epoch": 1.1130742049469964, "grad_norm": 0.0708564966917038, "learning_rate": 1.7642817688990006e-05, "loss": 0.5747, "step": 512 }, { "epoch": 1.1152487088882848, "grad_norm": 0.0806412473320961, "learning_rate": 1.763398282741103e-05, "loss": 0.5468, "step": 513 }, { "epoch": 1.1174232128295734, "grad_norm": 0.06672287732362747, "learning_rate": 1.762513366091064e-05, "loss": 0.6372, "step": 514 }, { "epoch": 1.1195977167708617, "grad_norm": 0.0723346620798111, "learning_rate": 1.7616270206070814e-05, "loss": 0.5955, "step": 515 }, { "epoch": 1.12177222071215, "grad_norm": 0.07586865872144699, "learning_rate": 1.7607392479500325e-05, "loss": 0.6145, "step": 516 }, { "epoch": 1.1239467246534385, "grad_norm": 0.07029952853918076, "learning_rate": 1.7598500497834678e-05, "loss": 0.5264, "step": 517 }, { "epoch": 1.1261212285947269, "grad_norm": 0.07074055820703506, "learning_rate": 1.7589594277736095e-05, "loss": 0.5713, "step": 518 }, { "epoch": 1.1282957325360152, "grad_norm": 0.07118069380521774, "learning_rate": 1.7580673835893473e-05, "loss": 0.5309, "step": 519 }, { "epoch": 1.1304702364773036, "grad_norm": 0.07080569118261337, "learning_rate": 1.7571739189022365e-05, "loss": 0.6062, "step": 520 }, { "epoch": 1.132644740418592, "grad_norm": 0.06360487639904022, "learning_rate": 1.7562790353864934e-05, "loss": 0.5197, "step": 521 }, { "epoch": 1.1348192443598804, "grad_norm": 0.05907633155584335, "learning_rate": 1.7553827347189937e-05, "loss": 0.4878, "step": 522 }, { "epoch": 1.1369937483011687, "grad_norm": 0.07753446698188782, "learning_rate": 1.7544850185792688e-05, "loss": 0.6988, "step": 523 }, { "epoch": 1.139168252242457, "grad_norm": 0.061696745455265045, "learning_rate": 1.7535858886495014e-05, "loss": 0.5308, "step": 524 }, { "epoch": 1.1413427561837457, "grad_norm": 0.07314109057188034, "learning_rate": 1.7526853466145248e-05, "loss": 0.609, "step": 525 }, { "epoch": 1.143517260125034, "grad_norm": 0.06924311071634293, "learning_rate": 1.7517833941618166e-05, "loss": 0.5281, "step": 526 }, { "epoch": 1.1456917640663224, "grad_norm": 0.07122194021940231, "learning_rate": 1.7508800329814993e-05, "loss": 0.5899, "step": 527 }, { "epoch": 1.1478662680076108, "grad_norm": 0.06243983656167984, "learning_rate": 1.749975264766334e-05, "loss": 0.5501, "step": 528 }, { "epoch": 1.1500407719488992, "grad_norm": 0.08208881318569183, "learning_rate": 1.749069091211718e-05, "loss": 0.9971, "step": 529 }, { "epoch": 1.1522152758901876, "grad_norm": 0.05865352600812912, "learning_rate": 1.7481615140156837e-05, "loss": 0.5033, "step": 530 }, { "epoch": 1.154389779831476, "grad_norm": 0.18052862584590912, "learning_rate": 1.747252534878891e-05, "loss": 0.4659, "step": 531 }, { "epoch": 1.1565642837727643, "grad_norm": 0.07844268530607224, "learning_rate": 1.7463421555046298e-05, "loss": 0.6152, "step": 532 }, { "epoch": 1.1587387877140527, "grad_norm": 0.06657543033361435, "learning_rate": 1.7454303775988118e-05, "loss": 0.4957, "step": 533 }, { "epoch": 1.160913291655341, "grad_norm": 0.06461094319820404, "learning_rate": 1.7445172028699703e-05, "loss": 0.5538, "step": 534 }, { "epoch": 1.1630877955966294, "grad_norm": 0.06608734279870987, "learning_rate": 1.743602633029255e-05, "loss": 0.553, "step": 535 }, { "epoch": 1.1652622995379178, "grad_norm": 0.10203693807125092, "learning_rate": 1.742686669790431e-05, "loss": 0.5932, "step": 536 }, { "epoch": 1.1674368034792062, "grad_norm": 0.06268064677715302, "learning_rate": 1.7417693148698743e-05, "loss": 0.5259, "step": 537 }, { "epoch": 1.1696113074204948, "grad_norm": 0.062100693583488464, "learning_rate": 1.740850569986568e-05, "loss": 0.5563, "step": 538 }, { "epoch": 1.1717858113617832, "grad_norm": 0.23905989527702332, "learning_rate": 1.7399304368620996e-05, "loss": 0.5786, "step": 539 }, { "epoch": 1.1739603153030715, "grad_norm": 0.07133086025714874, "learning_rate": 1.7390089172206594e-05, "loss": 0.602, "step": 540 }, { "epoch": 1.17613481924436, "grad_norm": 0.06164265424013138, "learning_rate": 1.738086012789034e-05, "loss": 0.5644, "step": 541 }, { "epoch": 1.1783093231856483, "grad_norm": 0.06620533019304276, "learning_rate": 1.7371617252966066e-05, "loss": 0.6245, "step": 542 }, { "epoch": 1.1804838271269367, "grad_norm": 0.08410079032182693, "learning_rate": 1.7362360564753507e-05, "loss": 0.6106, "step": 543 }, { "epoch": 1.182658331068225, "grad_norm": 0.05992994084954262, "learning_rate": 1.735309008059829e-05, "loss": 0.5456, "step": 544 }, { "epoch": 1.1848328350095134, "grad_norm": 0.07226452976465225, "learning_rate": 1.7343805817871885e-05, "loss": 0.6357, "step": 545 }, { "epoch": 1.1870073389508018, "grad_norm": 0.07331179082393646, "learning_rate": 1.7334507793971592e-05, "loss": 0.582, "step": 546 }, { "epoch": 1.1891818428920902, "grad_norm": 0.07126093655824661, "learning_rate": 1.7325196026320496e-05, "loss": 0.6935, "step": 547 }, { "epoch": 1.1913563468333785, "grad_norm": 0.06638887524604797, "learning_rate": 1.7315870532367423e-05, "loss": 0.6227, "step": 548 }, { "epoch": 1.1935308507746671, "grad_norm": 0.06571762263774872, "learning_rate": 1.7306531329586933e-05, "loss": 0.5761, "step": 549 }, { "epoch": 1.1957053547159555, "grad_norm": 0.07049442827701569, "learning_rate": 1.729717843547927e-05, "loss": 0.5279, "step": 550 }, { "epoch": 1.197879858657244, "grad_norm": 0.05960113927721977, "learning_rate": 1.7287811867570333e-05, "loss": 0.5824, "step": 551 }, { "epoch": 1.2000543625985323, "grad_norm": 0.0737079307436943, "learning_rate": 1.7278431643411643e-05, "loss": 0.5482, "step": 552 }, { "epoch": 1.2022288665398206, "grad_norm": 0.06514870375394821, "learning_rate": 1.726903778058031e-05, "loss": 0.539, "step": 553 }, { "epoch": 1.204403370481109, "grad_norm": 0.06857598572969437, "learning_rate": 1.7259630296679008e-05, "loss": 0.6448, "step": 554 }, { "epoch": 1.2065778744223974, "grad_norm": 0.06951887905597687, "learning_rate": 1.725020920933593e-05, "loss": 0.6819, "step": 555 }, { "epoch": 1.2087523783636858, "grad_norm": 0.06152326613664627, "learning_rate": 1.724077453620475e-05, "loss": 0.5896, "step": 556 }, { "epoch": 1.2109268823049741, "grad_norm": 0.07186588644981384, "learning_rate": 1.7231326294964618e-05, "loss": 0.5742, "step": 557 }, { "epoch": 1.2131013862462625, "grad_norm": 0.06004633754491806, "learning_rate": 1.7221864503320093e-05, "loss": 0.4759, "step": 558 }, { "epoch": 1.215275890187551, "grad_norm": 0.07309587299823761, "learning_rate": 1.721238917900114e-05, "loss": 0.6026, "step": 559 }, { "epoch": 1.2174503941288393, "grad_norm": 0.0630454272031784, "learning_rate": 1.7202900339763066e-05, "loss": 0.6107, "step": 560 }, { "epoch": 1.2196248980701276, "grad_norm": 0.06956064701080322, "learning_rate": 1.7193398003386514e-05, "loss": 0.5097, "step": 561 }, { "epoch": 1.2217994020114162, "grad_norm": 0.05754707381129265, "learning_rate": 1.7183882187677413e-05, "loss": 0.5344, "step": 562 }, { "epoch": 1.2239739059527046, "grad_norm": 0.06699857115745544, "learning_rate": 1.7174352910466958e-05, "loss": 0.5792, "step": 563 }, { "epoch": 1.226148409893993, "grad_norm": 0.07099662721157074, "learning_rate": 1.716481018961156e-05, "loss": 0.5317, "step": 564 }, { "epoch": 1.2283229138352814, "grad_norm": 0.07400328665971756, "learning_rate": 1.7155254042992827e-05, "loss": 0.5307, "step": 565 }, { "epoch": 1.2304974177765697, "grad_norm": 0.05860786512494087, "learning_rate": 1.7145684488517518e-05, "loss": 0.4991, "step": 566 }, { "epoch": 1.232671921717858, "grad_norm": 0.06784110516309738, "learning_rate": 1.7136101544117526e-05, "loss": 0.5992, "step": 567 }, { "epoch": 1.2348464256591465, "grad_norm": 0.06375797092914581, "learning_rate": 1.7126505227749826e-05, "loss": 0.526, "step": 568 }, { "epoch": 1.2370209296004349, "grad_norm": 0.06652164459228516, "learning_rate": 1.7116895557396457e-05, "loss": 0.4434, "step": 569 }, { "epoch": 1.2391954335417232, "grad_norm": 0.06669425219297409, "learning_rate": 1.710727255106447e-05, "loss": 0.5384, "step": 570 }, { "epoch": 1.2413699374830116, "grad_norm": 0.06583644449710846, "learning_rate": 1.709763622678593e-05, "loss": 0.4812, "step": 571 }, { "epoch": 1.2435444414243, "grad_norm": 0.07691150903701782, "learning_rate": 1.7087986602617824e-05, "loss": 0.5013, "step": 572 }, { "epoch": 1.2457189453655886, "grad_norm": 0.06736266613006592, "learning_rate": 1.7078323696642093e-05, "loss": 0.5553, "step": 573 }, { "epoch": 1.247893449306877, "grad_norm": 0.06591708213090897, "learning_rate": 1.706864752696554e-05, "loss": 0.5591, "step": 574 }, { "epoch": 1.2500679532481653, "grad_norm": 0.07698856294155121, "learning_rate": 1.7058958111719836e-05, "loss": 0.6179, "step": 575 }, { "epoch": 1.2522424571894537, "grad_norm": 0.06556461751461029, "learning_rate": 1.7049255469061476e-05, "loss": 0.53, "step": 576 }, { "epoch": 1.254416961130742, "grad_norm": 0.07580272108316422, "learning_rate": 1.7039539617171724e-05, "loss": 0.5664, "step": 577 }, { "epoch": 1.2565914650720305, "grad_norm": 0.06747961044311523, "learning_rate": 1.702981057425662e-05, "loss": 0.5903, "step": 578 }, { "epoch": 1.2587659690133188, "grad_norm": 0.06584040075540543, "learning_rate": 1.70200683585469e-05, "loss": 0.5437, "step": 579 }, { "epoch": 1.2609404729546072, "grad_norm": 0.06757070124149323, "learning_rate": 1.7010312988297993e-05, "loss": 0.5474, "step": 580 }, { "epoch": 1.2631149768958956, "grad_norm": 0.06933177262544632, "learning_rate": 1.7000544481789984e-05, "loss": 0.543, "step": 581 }, { "epoch": 1.265289480837184, "grad_norm": 0.183536097407341, "learning_rate": 1.699076285732756e-05, "loss": 0.6183, "step": 582 }, { "epoch": 1.2674639847784723, "grad_norm": 0.06044726073741913, "learning_rate": 1.6980968133240002e-05, "loss": 0.5018, "step": 583 }, { "epoch": 1.2696384887197607, "grad_norm": 0.06674567610025406, "learning_rate": 1.6971160327881126e-05, "loss": 0.5636, "step": 584 }, { "epoch": 1.271812992661049, "grad_norm": 0.07175873965024948, "learning_rate": 1.696133945962927e-05, "loss": 0.5343, "step": 585 }, { "epoch": 1.2739874966023375, "grad_norm": 0.06979343295097351, "learning_rate": 1.6951505546887238e-05, "loss": 0.5461, "step": 586 }, { "epoch": 1.276162000543626, "grad_norm": 0.06679370254278183, "learning_rate": 1.6941658608082296e-05, "loss": 0.5739, "step": 587 }, { "epoch": 1.2783365044849144, "grad_norm": 0.07115251570940018, "learning_rate": 1.6931798661666103e-05, "loss": 0.574, "step": 588 }, { "epoch": 1.2805110084262028, "grad_norm": 0.06166532635688782, "learning_rate": 1.6921925726114694e-05, "loss": 0.5456, "step": 589 }, { "epoch": 1.2826855123674912, "grad_norm": 0.0727899968624115, "learning_rate": 1.691203981992845e-05, "loss": 0.6112, "step": 590 }, { "epoch": 1.2848600163087796, "grad_norm": 0.07212860137224197, "learning_rate": 1.6902140961632054e-05, "loss": 0.5627, "step": 591 }, { "epoch": 1.287034520250068, "grad_norm": 0.06781869381666183, "learning_rate": 1.6892229169774463e-05, "loss": 0.4649, "step": 592 }, { "epoch": 1.2892090241913563, "grad_norm": 0.06660555303096771, "learning_rate": 1.6882304462928853e-05, "loss": 0.5472, "step": 593 }, { "epoch": 1.2913835281326447, "grad_norm": 0.07833001017570496, "learning_rate": 1.687236685969263e-05, "loss": 0.6991, "step": 594 }, { "epoch": 1.2935580320739333, "grad_norm": 0.06855905055999756, "learning_rate": 1.686241637868734e-05, "loss": 0.5142, "step": 595 }, { "epoch": 1.2957325360152216, "grad_norm": 0.06545227766036987, "learning_rate": 1.6852453038558667e-05, "loss": 0.5118, "step": 596 }, { "epoch": 1.29790703995651, "grad_norm": 0.0633307695388794, "learning_rate": 1.6842476857976397e-05, "loss": 0.5172, "step": 597 }, { "epoch": 1.3000815438977984, "grad_norm": 0.06989748775959015, "learning_rate": 1.683248785563438e-05, "loss": 0.5681, "step": 598 }, { "epoch": 1.3022560478390868, "grad_norm": 0.06796829402446747, "learning_rate": 1.6822486050250472e-05, "loss": 0.6138, "step": 599 }, { "epoch": 1.3044305517803751, "grad_norm": 0.06390702724456787, "learning_rate": 1.681247146056654e-05, "loss": 0.5024, "step": 600 }, { "epoch": 1.3066050557216635, "grad_norm": 0.0655435249209404, "learning_rate": 1.6802444105348405e-05, "loss": 0.577, "step": 601 }, { "epoch": 1.308779559662952, "grad_norm": 0.06885149329900742, "learning_rate": 1.6792404003385797e-05, "loss": 0.5583, "step": 602 }, { "epoch": 1.3109540636042403, "grad_norm": 0.06849273294210434, "learning_rate": 1.678235117349234e-05, "loss": 0.5448, "step": 603 }, { "epoch": 1.3131285675455286, "grad_norm": 0.06594900786876678, "learning_rate": 1.6772285634505507e-05, "loss": 0.5958, "step": 604 }, { "epoch": 1.315303071486817, "grad_norm": 0.06479936838150024, "learning_rate": 1.676220740528659e-05, "loss": 0.6655, "step": 605 }, { "epoch": 1.3174775754281054, "grad_norm": 0.07216524332761765, "learning_rate": 1.6752116504720644e-05, "loss": 0.549, "step": 606 }, { "epoch": 1.3196520793693938, "grad_norm": 0.07673719525337219, "learning_rate": 1.674201295171649e-05, "loss": 0.6243, "step": 607 }, { "epoch": 1.3218265833106821, "grad_norm": 0.06317044794559479, "learning_rate": 1.6731896765206648e-05, "loss": 0.4914, "step": 608 }, { "epoch": 1.3240010872519705, "grad_norm": 0.06494729965925217, "learning_rate": 1.6721767964147307e-05, "loss": 0.5202, "step": 609 }, { "epoch": 1.3261755911932591, "grad_norm": 0.06647679954767227, "learning_rate": 1.67116265675183e-05, "loss": 0.504, "step": 610 }, { "epoch": 1.3283500951345475, "grad_norm": 0.06783776730298996, "learning_rate": 1.670147259432306e-05, "loss": 0.5953, "step": 611 }, { "epoch": 1.3305245990758359, "grad_norm": 0.07011283934116364, "learning_rate": 1.6691306063588583e-05, "loss": 0.6384, "step": 612 }, { "epoch": 1.3326991030171242, "grad_norm": 0.06339769810438156, "learning_rate": 1.6681126994365407e-05, "loss": 0.4825, "step": 613 }, { "epoch": 1.3348736069584126, "grad_norm": 0.07224269956350327, "learning_rate": 1.667093540572755e-05, "loss": 0.5655, "step": 614 }, { "epoch": 1.337048110899701, "grad_norm": 0.07057581841945648, "learning_rate": 1.6660731316772503e-05, "loss": 0.6291, "step": 615 }, { "epoch": 1.3392226148409894, "grad_norm": 0.06559202820062637, "learning_rate": 1.6650514746621173e-05, "loss": 0.4987, "step": 616 }, { "epoch": 1.3413971187822777, "grad_norm": 0.06356988847255707, "learning_rate": 1.6640285714417857e-05, "loss": 0.5861, "step": 617 }, { "epoch": 1.3435716227235661, "grad_norm": 0.08413425087928772, "learning_rate": 1.66300442393302e-05, "loss": 0.5747, "step": 618 }, { "epoch": 1.3457461266648547, "grad_norm": 0.06350528448820114, "learning_rate": 1.6619790340549175e-05, "loss": 0.5109, "step": 619 }, { "epoch": 1.347920630606143, "grad_norm": 0.06695125252008438, "learning_rate": 1.660952403728902e-05, "loss": 0.5031, "step": 620 }, { "epoch": 1.3500951345474315, "grad_norm": 0.06070135161280632, "learning_rate": 1.659924534878723e-05, "loss": 0.435, "step": 621 }, { "epoch": 1.3522696384887198, "grad_norm": 0.06587585806846619, "learning_rate": 1.6588954294304495e-05, "loss": 0.577, "step": 622 }, { "epoch": 1.3544441424300082, "grad_norm": 0.07992653548717499, "learning_rate": 1.6578650893124686e-05, "loss": 0.515, "step": 623 }, { "epoch": 1.3566186463712966, "grad_norm": 0.07126399874687195, "learning_rate": 1.6568335164554813e-05, "loss": 0.6385, "step": 624 }, { "epoch": 1.358793150312585, "grad_norm": 0.06111137196421623, "learning_rate": 1.655800712792498e-05, "loss": 0.5375, "step": 625 }, { "epoch": 1.3609676542538733, "grad_norm": 0.07887528091669083, "learning_rate": 1.6547666802588344e-05, "loss": 0.5957, "step": 626 }, { "epoch": 1.3631421581951617, "grad_norm": 0.055274851620197296, "learning_rate": 1.6537314207921117e-05, "loss": 0.5247, "step": 627 }, { "epoch": 1.36531666213645, "grad_norm": 0.06984863430261612, "learning_rate": 1.652694936332247e-05, "loss": 0.508, "step": 628 }, { "epoch": 1.3674911660777385, "grad_norm": 0.062407515943050385, "learning_rate": 1.6516572288214555e-05, "loss": 0.6506, "step": 629 }, { "epoch": 1.3696656700190268, "grad_norm": 0.07123886048793793, "learning_rate": 1.650618300204242e-05, "loss": 0.6538, "step": 630 }, { "epoch": 1.3718401739603152, "grad_norm": 0.06715679168701172, "learning_rate": 1.6495781524274018e-05, "loss": 0.5808, "step": 631 }, { "epoch": 1.3740146779016036, "grad_norm": 0.06851977109909058, "learning_rate": 1.6485367874400124e-05, "loss": 0.5448, "step": 632 }, { "epoch": 1.376189181842892, "grad_norm": 0.06020629405975342, "learning_rate": 1.6474942071934336e-05, "loss": 0.4913, "step": 633 }, { "epoch": 1.3783636857841806, "grad_norm": 0.06305680423974991, "learning_rate": 1.6464504136413025e-05, "loss": 0.4968, "step": 634 }, { "epoch": 1.380538189725469, "grad_norm": 0.06899664551019669, "learning_rate": 1.6454054087395284e-05, "loss": 0.5758, "step": 635 }, { "epoch": 1.3827126936667573, "grad_norm": 0.062209632247686386, "learning_rate": 1.6443591944462917e-05, "loss": 0.5648, "step": 636 }, { "epoch": 1.3848871976080457, "grad_norm": 0.0617678239941597, "learning_rate": 1.6433117727220388e-05, "loss": 0.5687, "step": 637 }, { "epoch": 1.387061701549334, "grad_norm": 0.06315875053405762, "learning_rate": 1.6422631455294787e-05, "loss": 0.5021, "step": 638 }, { "epoch": 1.3892362054906224, "grad_norm": 0.0808146670460701, "learning_rate": 1.6412133148335786e-05, "loss": 0.5411, "step": 639 }, { "epoch": 1.3914107094319108, "grad_norm": 0.0668412297964096, "learning_rate": 1.6401622826015616e-05, "loss": 0.6457, "step": 640 }, { "epoch": 1.3935852133731992, "grad_norm": 0.07770834863185883, "learning_rate": 1.6391100508029026e-05, "loss": 0.6028, "step": 641 }, { "epoch": 1.3957597173144876, "grad_norm": 0.06383370608091354, "learning_rate": 1.6380566214093226e-05, "loss": 0.5999, "step": 642 }, { "epoch": 1.3979342212557762, "grad_norm": 0.07637319713830948, "learning_rate": 1.6370019963947888e-05, "loss": 0.6406, "step": 643 }, { "epoch": 1.4001087251970645, "grad_norm": 0.06470421701669693, "learning_rate": 1.635946177735508e-05, "loss": 0.5601, "step": 644 }, { "epoch": 1.402283229138353, "grad_norm": 0.05885904282331467, "learning_rate": 1.634889167409923e-05, "loss": 0.4753, "step": 645 }, { "epoch": 1.4044577330796413, "grad_norm": 0.0666317343711853, "learning_rate": 1.63383096739871e-05, "loss": 0.5904, "step": 646 }, { "epoch": 1.4066322370209297, "grad_norm": 0.07377368956804276, "learning_rate": 1.632771579684776e-05, "loss": 0.5028, "step": 647 }, { "epoch": 1.408806740962218, "grad_norm": 0.0664341002702713, "learning_rate": 1.631711006253251e-05, "loss": 0.6311, "step": 648 }, { "epoch": 1.4109812449035064, "grad_norm": 0.06824533641338348, "learning_rate": 1.6306492490914887e-05, "loss": 0.543, "step": 649 }, { "epoch": 1.4131557488447948, "grad_norm": 0.0624668225646019, "learning_rate": 1.6295863101890603e-05, "loss": 0.5158, "step": 650 }, { "epoch": 1.4153302527860832, "grad_norm": 0.08051895350217819, "learning_rate": 1.628522191537751e-05, "loss": 0.5532, "step": 651 }, { "epoch": 1.4175047567273715, "grad_norm": 0.06811799854040146, "learning_rate": 1.6274568951315575e-05, "loss": 0.517, "step": 652 }, { "epoch": 1.41967926066866, "grad_norm": 0.06288021802902222, "learning_rate": 1.626390422966683e-05, "loss": 0.5863, "step": 653 }, { "epoch": 1.4218537646099483, "grad_norm": 0.06595730036497116, "learning_rate": 1.625322777041534e-05, "loss": 0.4655, "step": 654 }, { "epoch": 1.4240282685512367, "grad_norm": 0.06451403349637985, "learning_rate": 1.624253959356717e-05, "loss": 0.512, "step": 655 }, { "epoch": 1.426202772492525, "grad_norm": 0.061061419546604156, "learning_rate": 1.623183971915032e-05, "loss": 0.4627, "step": 656 }, { "epoch": 1.4283772764338134, "grad_norm": 0.06964956969022751, "learning_rate": 1.6221128167214742e-05, "loss": 0.616, "step": 657 }, { "epoch": 1.430551780375102, "grad_norm": 0.06891736388206482, "learning_rate": 1.6210404957832244e-05, "loss": 0.5589, "step": 658 }, { "epoch": 1.4327262843163904, "grad_norm": 0.07816269248723984, "learning_rate": 1.6199670111096487e-05, "loss": 0.6169, "step": 659 }, { "epoch": 1.4349007882576787, "grad_norm": 0.06685820966959, "learning_rate": 1.6188923647122946e-05, "loss": 0.5267, "step": 660 }, { "epoch": 1.4370752921989671, "grad_norm": 0.0766514390707016, "learning_rate": 1.6178165586048857e-05, "loss": 0.6391, "step": 661 }, { "epoch": 1.4392497961402555, "grad_norm": 0.07985788583755493, "learning_rate": 1.6167395948033176e-05, "loss": 0.5987, "step": 662 }, { "epoch": 1.4414243000815439, "grad_norm": 0.0773911401629448, "learning_rate": 1.6156614753256583e-05, "loss": 0.5435, "step": 663 }, { "epoch": 1.4435988040228322, "grad_norm": 0.07827164977788925, "learning_rate": 1.6145822021921386e-05, "loss": 0.6905, "step": 664 }, { "epoch": 1.4457733079641206, "grad_norm": 0.0725138783454895, "learning_rate": 1.613501777425152e-05, "loss": 0.6577, "step": 665 }, { "epoch": 1.447947811905409, "grad_norm": 0.06611666083335876, "learning_rate": 1.61242020304925e-05, "loss": 0.5105, "step": 666 }, { "epoch": 1.4501223158466976, "grad_norm": 0.060293201357126236, "learning_rate": 1.611337481091139e-05, "loss": 0.5463, "step": 667 }, { "epoch": 1.452296819787986, "grad_norm": 0.06978996843099594, "learning_rate": 1.6102536135796734e-05, "loss": 0.528, "step": 668 }, { "epoch": 1.4544713237292743, "grad_norm": 0.07145840674638748, "learning_rate": 1.6091686025458577e-05, "loss": 0.4874, "step": 669 }, { "epoch": 1.4566458276705627, "grad_norm": 0.07020546495914459, "learning_rate": 1.6080824500228367e-05, "loss": 0.5489, "step": 670 }, { "epoch": 1.458820331611851, "grad_norm": 0.06637263298034668, "learning_rate": 1.6069951580458946e-05, "loss": 0.5462, "step": 671 }, { "epoch": 1.4609948355531395, "grad_norm": 0.07529100030660629, "learning_rate": 1.6059067286524514e-05, "loss": 0.5559, "step": 672 }, { "epoch": 1.4631693394944278, "grad_norm": 0.06846174597740173, "learning_rate": 1.6048171638820574e-05, "loss": 0.7282, "step": 673 }, { "epoch": 1.4653438434357162, "grad_norm": 0.07633936405181885, "learning_rate": 1.6037264657763918e-05, "loss": 0.6442, "step": 674 }, { "epoch": 1.4675183473770046, "grad_norm": 0.05540183186531067, "learning_rate": 1.6026346363792565e-05, "loss": 0.4251, "step": 675 }, { "epoch": 1.469692851318293, "grad_norm": 0.06218886747956276, "learning_rate": 1.6015416777365734e-05, "loss": 0.5032, "step": 676 }, { "epoch": 1.4718673552595813, "grad_norm": 0.08438904583454132, "learning_rate": 1.600447591896381e-05, "loss": 0.7677, "step": 677 }, { "epoch": 1.4740418592008697, "grad_norm": 0.0745784267783165, "learning_rate": 1.5993523809088292e-05, "loss": 0.6236, "step": 678 }, { "epoch": 1.476216363142158, "grad_norm": 0.07012149691581726, "learning_rate": 1.5982560468261764e-05, "loss": 0.6751, "step": 679 }, { "epoch": 1.4783908670834465, "grad_norm": 0.09631490707397461, "learning_rate": 1.5971585917027864e-05, "loss": 0.5262, "step": 680 }, { "epoch": 1.4805653710247348, "grad_norm": 0.0701725035905838, "learning_rate": 1.5960600175951224e-05, "loss": 0.52, "step": 681 }, { "epoch": 1.4827398749660234, "grad_norm": 0.10701775550842285, "learning_rate": 1.5949603265617452e-05, "loss": 0.5119, "step": 682 }, { "epoch": 1.4849143789073118, "grad_norm": 0.06533973664045334, "learning_rate": 1.5938595206633088e-05, "loss": 0.5882, "step": 683 }, { "epoch": 1.4870888828486002, "grad_norm": 0.0570087805390358, "learning_rate": 1.592757601962555e-05, "loss": 0.478, "step": 684 }, { "epoch": 1.4892633867898886, "grad_norm": 0.06642911583185196, "learning_rate": 1.5916545725243124e-05, "loss": 0.54, "step": 685 }, { "epoch": 1.491437890731177, "grad_norm": 0.05826708674430847, "learning_rate": 1.5905504344154897e-05, "loss": 0.4564, "step": 686 }, { "epoch": 1.4936123946724653, "grad_norm": 0.06394309550523758, "learning_rate": 1.589445189705074e-05, "loss": 0.5138, "step": 687 }, { "epoch": 1.4957868986137537, "grad_norm": 0.0932568684220314, "learning_rate": 1.588338840464125e-05, "loss": 0.7213, "step": 688 }, { "epoch": 1.497961402555042, "grad_norm": 0.08323768526315689, "learning_rate": 1.5872313887657734e-05, "loss": 0.5867, "step": 689 }, { "epoch": 1.5001359064963307, "grad_norm": 0.06260855495929718, "learning_rate": 1.5861228366852148e-05, "loss": 0.5643, "step": 690 }, { "epoch": 1.502310410437619, "grad_norm": 0.07448921352624893, "learning_rate": 1.585013186299707e-05, "loss": 0.6424, "step": 691 }, { "epoch": 1.5044849143789074, "grad_norm": 0.07150814682245255, "learning_rate": 1.5839024396885657e-05, "loss": 0.4459, "step": 692 }, { "epoch": 1.5066594183201958, "grad_norm": 0.05693221092224121, "learning_rate": 1.582790598933161e-05, "loss": 0.4445, "step": 693 }, { "epoch": 1.5088339222614842, "grad_norm": 0.06398003548383713, "learning_rate": 1.5816776661169133e-05, "loss": 0.5045, "step": 694 }, { "epoch": 1.5110084262027725, "grad_norm": 0.07388309389352798, "learning_rate": 1.5805636433252892e-05, "loss": 0.6285, "step": 695 }, { "epoch": 1.513182930144061, "grad_norm": 0.0646815299987793, "learning_rate": 1.579448532645798e-05, "loss": 0.5491, "step": 696 }, { "epoch": 1.5153574340853493, "grad_norm": 0.05883905664086342, "learning_rate": 1.5783323361679865e-05, "loss": 0.4346, "step": 697 }, { "epoch": 1.5175319380266377, "grad_norm": 0.06105039268732071, "learning_rate": 1.577215055983438e-05, "loss": 0.5471, "step": 698 }, { "epoch": 1.519706441967926, "grad_norm": 0.06177292391657829, "learning_rate": 1.5760966941857646e-05, "loss": 0.5108, "step": 699 }, { "epoch": 1.5218809459092144, "grad_norm": 0.07309827208518982, "learning_rate": 1.574977252870607e-05, "loss": 0.5403, "step": 700 }, { "epoch": 1.5240554498505028, "grad_norm": 0.0684160441160202, "learning_rate": 1.5738567341356265e-05, "loss": 0.7457, "step": 701 }, { "epoch": 1.5262299537917912, "grad_norm": 0.0659729465842247, "learning_rate": 1.5727351400805054e-05, "loss": 0.5193, "step": 702 }, { "epoch": 1.5284044577330795, "grad_norm": 0.06772555410861969, "learning_rate": 1.57161247280694e-05, "loss": 0.5181, "step": 703 }, { "epoch": 1.530578961674368, "grad_norm": 0.07151821255683899, "learning_rate": 1.570488734418638e-05, "loss": 0.5104, "step": 704 }, { "epoch": 1.5327534656156563, "grad_norm": 0.06749141216278076, "learning_rate": 1.5693639270213138e-05, "loss": 0.5014, "step": 705 }, { "epoch": 1.5349279695569447, "grad_norm": 0.06112991273403168, "learning_rate": 1.568238052722685e-05, "loss": 0.5254, "step": 706 }, { "epoch": 1.5371024734982333, "grad_norm": 0.061335477977991104, "learning_rate": 1.567111113632469e-05, "loss": 0.5859, "step": 707 }, { "epoch": 1.5392769774395216, "grad_norm": 0.05945085361599922, "learning_rate": 1.5659831118623783e-05, "loss": 0.528, "step": 708 }, { "epoch": 1.54145148138081, "grad_norm": 0.06980522722005844, "learning_rate": 1.5648540495261157e-05, "loss": 0.6052, "step": 709 }, { "epoch": 1.5436259853220984, "grad_norm": 0.07228517532348633, "learning_rate": 1.5637239287393725e-05, "loss": 0.6068, "step": 710 }, { "epoch": 1.5458004892633868, "grad_norm": 0.06574912369251251, "learning_rate": 1.5625927516198235e-05, "loss": 0.5822, "step": 711 }, { "epoch": 1.5479749932046751, "grad_norm": 0.06775934994220734, "learning_rate": 1.5614605202871213e-05, "loss": 0.5575, "step": 712 }, { "epoch": 1.5501494971459637, "grad_norm": 0.060427989810705185, "learning_rate": 1.5603272368628956e-05, "loss": 0.5068, "step": 713 }, { "epoch": 1.552324001087252, "grad_norm": 0.0664725974202156, "learning_rate": 1.5591929034707468e-05, "loss": 0.5741, "step": 714 }, { "epoch": 1.5544985050285405, "grad_norm": 0.0758625715970993, "learning_rate": 1.5580575222362435e-05, "loss": 0.56, "step": 715 }, { "epoch": 1.5566730089698289, "grad_norm": 0.07462309300899506, "learning_rate": 1.5569210952869166e-05, "loss": 0.6819, "step": 716 }, { "epoch": 1.5588475129111172, "grad_norm": 0.06304577738046646, "learning_rate": 1.5557836247522574e-05, "loss": 0.4724, "step": 717 }, { "epoch": 1.5610220168524056, "grad_norm": 0.06983862072229385, "learning_rate": 1.5546451127637125e-05, "loss": 0.7178, "step": 718 }, { "epoch": 1.563196520793694, "grad_norm": 0.06328754127025604, "learning_rate": 1.5535055614546805e-05, "loss": 0.5443, "step": 719 }, { "epoch": 1.5653710247349824, "grad_norm": 0.06555915623903275, "learning_rate": 1.552364972960506e-05, "loss": 0.5216, "step": 720 }, { "epoch": 1.5675455286762707, "grad_norm": 0.06661427766084671, "learning_rate": 1.551223349418479e-05, "loss": 0.6245, "step": 721 }, { "epoch": 1.569720032617559, "grad_norm": 0.06281550228595734, "learning_rate": 1.5500806929678276e-05, "loss": 0.51, "step": 722 }, { "epoch": 1.5718945365588475, "grad_norm": 0.060447584837675095, "learning_rate": 1.5489370057497165e-05, "loss": 0.4512, "step": 723 }, { "epoch": 1.5740690405001359, "grad_norm": 0.06219898909330368, "learning_rate": 1.5477922899072412e-05, "loss": 0.4685, "step": 724 }, { "epoch": 1.5762435444414242, "grad_norm": 0.0827278196811676, "learning_rate": 1.5466465475854246e-05, "loss": 0.644, "step": 725 }, { "epoch": 1.5784180483827126, "grad_norm": 0.06705125421285629, "learning_rate": 1.545499780931214e-05, "loss": 0.5075, "step": 726 }, { "epoch": 1.580592552324001, "grad_norm": 0.07685334235429764, "learning_rate": 1.544351992093475e-05, "loss": 0.6125, "step": 727 }, { "epoch": 1.5827670562652894, "grad_norm": 0.06547702103853226, "learning_rate": 1.543203183222989e-05, "loss": 0.565, "step": 728 }, { "epoch": 1.5849415602065777, "grad_norm": 0.07954264432191849, "learning_rate": 1.5420533564724495e-05, "loss": 0.6151, "step": 729 }, { "epoch": 1.5871160641478663, "grad_norm": 0.063746877014637, "learning_rate": 1.540902513996456e-05, "loss": 0.5561, "step": 730 }, { "epoch": 1.5892905680891547, "grad_norm": 0.07055594772100449, "learning_rate": 1.539750657951513e-05, "loss": 0.5593, "step": 731 }, { "epoch": 1.591465072030443, "grad_norm": 0.06880395114421844, "learning_rate": 1.5385977904960228e-05, "loss": 0.5166, "step": 732 }, { "epoch": 1.5936395759717314, "grad_norm": 0.0615411251783371, "learning_rate": 1.5374439137902832e-05, "loss": 0.5084, "step": 733 }, { "epoch": 1.5958140799130198, "grad_norm": 0.07859042286872864, "learning_rate": 1.5362890299964834e-05, "loss": 0.5398, "step": 734 }, { "epoch": 1.5979885838543082, "grad_norm": 0.06247608736157417, "learning_rate": 1.5351331412787004e-05, "loss": 0.4697, "step": 735 }, { "epoch": 1.6001630877955968, "grad_norm": 0.06752211600542068, "learning_rate": 1.5339762498028937e-05, "loss": 0.5274, "step": 736 }, { "epoch": 1.6023375917368852, "grad_norm": 0.06527712941169739, "learning_rate": 1.5328183577369007e-05, "loss": 0.5697, "step": 737 }, { "epoch": 1.6045120956781735, "grad_norm": 0.07516384869813919, "learning_rate": 1.5316594672504362e-05, "loss": 0.6213, "step": 738 }, { "epoch": 1.606686599619462, "grad_norm": 0.06886757910251617, "learning_rate": 1.5304995805150834e-05, "loss": 0.516, "step": 739 }, { "epoch": 1.6088611035607503, "grad_norm": 0.07365527004003525, "learning_rate": 1.5293386997042943e-05, "loss": 0.6273, "step": 740 }, { "epoch": 1.6110356075020387, "grad_norm": 0.07842119038105011, "learning_rate": 1.528176826993382e-05, "loss": 0.6364, "step": 741 }, { "epoch": 1.613210111443327, "grad_norm": 0.06536397337913513, "learning_rate": 1.52701396455952e-05, "loss": 0.4995, "step": 742 }, { "epoch": 1.6153846153846154, "grad_norm": 0.06250962615013123, "learning_rate": 1.5258501145817344e-05, "loss": 0.5085, "step": 743 }, { "epoch": 1.6175591193259038, "grad_norm": 0.0725678876042366, "learning_rate": 1.5246852792409033e-05, "loss": 0.5678, "step": 744 }, { "epoch": 1.6197336232671922, "grad_norm": 0.06364832818508148, "learning_rate": 1.5235194607197508e-05, "loss": 0.5143, "step": 745 }, { "epoch": 1.6219081272084805, "grad_norm": 0.06354716420173645, "learning_rate": 1.5223526612028432e-05, "loss": 0.5218, "step": 746 }, { "epoch": 1.624082631149769, "grad_norm": 0.07059025019407272, "learning_rate": 1.5211848828765852e-05, "loss": 0.5438, "step": 747 }, { "epoch": 1.6262571350910573, "grad_norm": 0.06322506815195084, "learning_rate": 1.5200161279292154e-05, "loss": 0.6081, "step": 748 }, { "epoch": 1.6284316390323457, "grad_norm": 0.07927685976028442, "learning_rate": 1.518846398550803e-05, "loss": 0.5364, "step": 749 }, { "epoch": 1.630606142973634, "grad_norm": 0.07515702396631241, "learning_rate": 1.5176756969332428e-05, "loss": 0.5871, "step": 750 }, { "epoch": 1.6327806469149224, "grad_norm": 0.07103421539068222, "learning_rate": 1.5165040252702502e-05, "loss": 0.6083, "step": 751 }, { "epoch": 1.6349551508562108, "grad_norm": 0.06636621057987213, "learning_rate": 1.5153313857573615e-05, "loss": 0.5972, "step": 752 }, { "epoch": 1.6371296547974992, "grad_norm": 0.06537255644798279, "learning_rate": 1.5141577805919225e-05, "loss": 0.5221, "step": 753 }, { "epoch": 1.6393041587387878, "grad_norm": 0.06700573861598969, "learning_rate": 1.5129832119730919e-05, "loss": 0.5503, "step": 754 }, { "epoch": 1.6414786626800761, "grad_norm": 0.06926625967025757, "learning_rate": 1.5118076821018322e-05, "loss": 0.5294, "step": 755 }, { "epoch": 1.6436531666213645, "grad_norm": 0.06755527853965759, "learning_rate": 1.510631193180907e-05, "loss": 0.5243, "step": 756 }, { "epoch": 1.645827670562653, "grad_norm": 0.05800420418381691, "learning_rate": 1.509453747414878e-05, "loss": 0.4497, "step": 757 }, { "epoch": 1.6480021745039413, "grad_norm": 0.08826916664838791, "learning_rate": 1.5082753470100987e-05, "loss": 0.7483, "step": 758 }, { "epoch": 1.6501766784452296, "grad_norm": 0.06957676261663437, "learning_rate": 1.5070959941747126e-05, "loss": 0.5406, "step": 759 }, { "epoch": 1.6523511823865182, "grad_norm": 0.08578504621982574, "learning_rate": 1.5059156911186465e-05, "loss": 0.5695, "step": 760 }, { "epoch": 1.6545256863278066, "grad_norm": 0.0738890990614891, "learning_rate": 1.5047344400536095e-05, "loss": 0.579, "step": 761 }, { "epoch": 1.656700190269095, "grad_norm": 0.07239063084125519, "learning_rate": 1.5035522431930859e-05, "loss": 0.5564, "step": 762 }, { "epoch": 1.6588746942103834, "grad_norm": 0.07460078597068787, "learning_rate": 1.5023691027523324e-05, "loss": 0.6433, "step": 763 }, { "epoch": 1.6610491981516717, "grad_norm": 0.06398321688175201, "learning_rate": 1.5011850209483741e-05, "loss": 0.539, "step": 764 }, { "epoch": 1.66322370209296, "grad_norm": 0.06408295780420303, "learning_rate": 1.5000000000000002e-05, "loss": 0.4275, "step": 765 }, { "epoch": 1.6653982060342485, "grad_norm": 0.07352191209793091, "learning_rate": 1.4988140421277595e-05, "loss": 0.5691, "step": 766 }, { "epoch": 1.6675727099755369, "grad_norm": 0.06760482490062714, "learning_rate": 1.497627149553956e-05, "loss": 0.5786, "step": 767 }, { "epoch": 1.6697472139168252, "grad_norm": 0.07196834683418274, "learning_rate": 1.4964393245026467e-05, "loss": 0.6819, "step": 768 }, { "epoch": 1.6719217178581136, "grad_norm": 0.06779640167951584, "learning_rate": 1.4952505691996338e-05, "loss": 0.5591, "step": 769 }, { "epoch": 1.674096221799402, "grad_norm": 0.05927790701389313, "learning_rate": 1.494060885872464e-05, "loss": 0.4617, "step": 770 }, { "epoch": 1.6762707257406904, "grad_norm": 0.06884414702653885, "learning_rate": 1.4928702767504235e-05, "loss": 0.6309, "step": 771 }, { "epoch": 1.6784452296819787, "grad_norm": 0.06911049783229828, "learning_rate": 1.4916787440645315e-05, "loss": 0.4801, "step": 772 }, { "epoch": 1.680619733623267, "grad_norm": 0.0634150356054306, "learning_rate": 1.4904862900475391e-05, "loss": 0.5594, "step": 773 }, { "epoch": 1.6827942375645555, "grad_norm": 0.072935551404953, "learning_rate": 1.4892929169339237e-05, "loss": 0.6107, "step": 774 }, { "epoch": 1.6849687415058439, "grad_norm": 0.06303034722805023, "learning_rate": 1.488098626959885e-05, "loss": 0.51, "step": 775 }, { "epoch": 1.6871432454471322, "grad_norm": 0.06640374660491943, "learning_rate": 1.4869034223633399e-05, "loss": 0.525, "step": 776 }, { "epoch": 1.6893177493884206, "grad_norm": 0.05558007210493088, "learning_rate": 1.4857073053839206e-05, "loss": 0.457, "step": 777 }, { "epoch": 1.6914922533297092, "grad_norm": 0.07275041937828064, "learning_rate": 1.4845102782629676e-05, "loss": 0.5036, "step": 778 }, { "epoch": 1.6936667572709976, "grad_norm": 0.056968871504068375, "learning_rate": 1.483312343243528e-05, "loss": 0.4371, "step": 779 }, { "epoch": 1.695841261212286, "grad_norm": 0.0760185718536377, "learning_rate": 1.4821135025703491e-05, "loss": 0.5159, "step": 780 }, { "epoch": 1.6980157651535743, "grad_norm": 0.07559648156166077, "learning_rate": 1.480913758489876e-05, "loss": 0.5837, "step": 781 }, { "epoch": 1.7001902690948627, "grad_norm": 0.06931426376104355, "learning_rate": 1.4797131132502464e-05, "loss": 0.5162, "step": 782 }, { "epoch": 1.702364773036151, "grad_norm": 0.08291727304458618, "learning_rate": 1.4785115691012866e-05, "loss": 0.6561, "step": 783 }, { "epoch": 1.7045392769774397, "grad_norm": 0.07082043588161469, "learning_rate": 1.4773091282945076e-05, "loss": 0.5891, "step": 784 }, { "epoch": 1.706713780918728, "grad_norm": 0.07126478105783463, "learning_rate": 1.4761057930831002e-05, "loss": 0.6268, "step": 785 }, { "epoch": 1.7088882848600164, "grad_norm": 0.08124355226755142, "learning_rate": 1.4749015657219315e-05, "loss": 0.6403, "step": 786 }, { "epoch": 1.7110627888013048, "grad_norm": 0.07027478516101837, "learning_rate": 1.47369644846754e-05, "loss": 0.566, "step": 787 }, { "epoch": 1.7132372927425932, "grad_norm": 0.0759085938334465, "learning_rate": 1.4724904435781322e-05, "loss": 0.6084, "step": 788 }, { "epoch": 1.7154117966838816, "grad_norm": 0.06845879554748535, "learning_rate": 1.4712835533135774e-05, "loss": 0.5079, "step": 789 }, { "epoch": 1.71758630062517, "grad_norm": 0.06919465959072113, "learning_rate": 1.470075779935404e-05, "loss": 0.5635, "step": 790 }, { "epoch": 1.7197608045664583, "grad_norm": 0.07033417373895645, "learning_rate": 1.4688671257067962e-05, "loss": 0.5688, "step": 791 }, { "epoch": 1.7219353085077467, "grad_norm": 0.07464487105607986, "learning_rate": 1.4676575928925869e-05, "loss": 0.6111, "step": 792 }, { "epoch": 1.724109812449035, "grad_norm": 0.08142214268445969, "learning_rate": 1.4664471837592573e-05, "loss": 0.6358, "step": 793 }, { "epoch": 1.7262843163903234, "grad_norm": 0.08277947455644608, "learning_rate": 1.4652359005749295e-05, "loss": 0.5563, "step": 794 }, { "epoch": 1.7284588203316118, "grad_norm": 0.07332737743854523, "learning_rate": 1.4640237456093636e-05, "loss": 0.7405, "step": 795 }, { "epoch": 1.7306333242729002, "grad_norm": 0.06244014576077461, "learning_rate": 1.4628107211339537e-05, "loss": 0.4792, "step": 796 }, { "epoch": 1.7328078282141886, "grad_norm": 0.06878259032964706, "learning_rate": 1.4615968294217226e-05, "loss": 0.5999, "step": 797 }, { "epoch": 1.734982332155477, "grad_norm": 0.06387055665254593, "learning_rate": 1.4603820727473192e-05, "loss": 0.4824, "step": 798 }, { "epoch": 1.7371568360967653, "grad_norm": 0.1264907270669937, "learning_rate": 1.4591664533870118e-05, "loss": 0.5049, "step": 799 }, { "epoch": 1.7393313400380537, "grad_norm": 0.06507895141839981, "learning_rate": 1.4579499736186864e-05, "loss": 0.5692, "step": 800 }, { "epoch": 1.741505843979342, "grad_norm": 0.07129712402820587, "learning_rate": 1.4567326357218408e-05, "loss": 0.609, "step": 801 }, { "epoch": 1.7436803479206306, "grad_norm": 0.06694424152374268, "learning_rate": 1.4555144419775808e-05, "loss": 0.659, "step": 802 }, { "epoch": 1.745854851861919, "grad_norm": 0.06411007791757584, "learning_rate": 1.4542953946686161e-05, "loss": 0.5433, "step": 803 }, { "epoch": 1.7480293558032074, "grad_norm": 0.0659380853176117, "learning_rate": 1.4530754960792554e-05, "loss": 0.5079, "step": 804 }, { "epoch": 1.7502038597444958, "grad_norm": 0.06691862642765045, "learning_rate": 1.4518547484954033e-05, "loss": 0.4983, "step": 805 }, { "epoch": 1.7523783636857841, "grad_norm": 0.06752948462963104, "learning_rate": 1.4506331542045545e-05, "loss": 0.5632, "step": 806 }, { "epoch": 1.7545528676270725, "grad_norm": 0.07077533006668091, "learning_rate": 1.449410715495791e-05, "loss": 0.5861, "step": 807 }, { "epoch": 1.7567273715683611, "grad_norm": 0.06619212031364441, "learning_rate": 1.448187434659777e-05, "loss": 0.5024, "step": 808 }, { "epoch": 1.7589018755096495, "grad_norm": 0.07222898304462433, "learning_rate": 1.4469633139887542e-05, "loss": 0.5087, "step": 809 }, { "epoch": 1.7610763794509379, "grad_norm": 0.06092243641614914, "learning_rate": 1.4457383557765385e-05, "loss": 0.5018, "step": 810 }, { "epoch": 1.7632508833922262, "grad_norm": 0.06287255883216858, "learning_rate": 1.4445125623185148e-05, "loss": 0.5825, "step": 811 }, { "epoch": 1.7654253873335146, "grad_norm": 0.06416705995798111, "learning_rate": 1.4432859359116337e-05, "loss": 0.5155, "step": 812 }, { "epoch": 1.767599891274803, "grad_norm": 0.06798828393220901, "learning_rate": 1.4420584788544059e-05, "loss": 0.524, "step": 813 }, { "epoch": 1.7697743952160914, "grad_norm": 0.07180637866258621, "learning_rate": 1.4408301934468996e-05, "loss": 0.5783, "step": 814 }, { "epoch": 1.7719488991573797, "grad_norm": 0.0682697594165802, "learning_rate": 1.439601081990734e-05, "loss": 0.4412, "step": 815 }, { "epoch": 1.7741234030986681, "grad_norm": 0.07137314975261688, "learning_rate": 1.4383711467890776e-05, "loss": 0.5046, "step": 816 }, { "epoch": 1.7762979070399565, "grad_norm": 0.07311975210905075, "learning_rate": 1.437140390146641e-05, "loss": 0.5173, "step": 817 }, { "epoch": 1.7784724109812449, "grad_norm": 0.06769049167633057, "learning_rate": 1.435908814369675e-05, "loss": 0.5961, "step": 818 }, { "epoch": 1.7806469149225332, "grad_norm": 0.07518597692251205, "learning_rate": 1.4346764217659652e-05, "loss": 0.634, "step": 819 }, { "epoch": 1.7828214188638216, "grad_norm": 0.07013806700706482, "learning_rate": 1.4334432146448272e-05, "loss": 0.5322, "step": 820 }, { "epoch": 1.78499592280511, "grad_norm": 0.07949534803628922, "learning_rate": 1.4322091953171043e-05, "loss": 0.5295, "step": 821 }, { "epoch": 1.7871704267463984, "grad_norm": 0.07485979795455933, "learning_rate": 1.4309743660951597e-05, "loss": 0.6005, "step": 822 }, { "epoch": 1.7893449306876867, "grad_norm": 0.07056054472923279, "learning_rate": 1.429738729292876e-05, "loss": 0.5177, "step": 823 }, { "epoch": 1.7915194346289751, "grad_norm": 0.06703866273164749, "learning_rate": 1.4285022872256484e-05, "loss": 0.5608, "step": 824 }, { "epoch": 1.7936939385702637, "grad_norm": 0.06247849017381668, "learning_rate": 1.427265042210381e-05, "loss": 0.5931, "step": 825 }, { "epoch": 1.795868442511552, "grad_norm": 0.06365659832954407, "learning_rate": 1.4260269965654824e-05, "loss": 0.4869, "step": 826 }, { "epoch": 1.7980429464528405, "grad_norm": 0.0702958032488823, "learning_rate": 1.4247881526108622e-05, "loss": 0.5494, "step": 827 }, { "epoch": 1.8002174503941288, "grad_norm": 0.07113964855670929, "learning_rate": 1.4235485126679244e-05, "loss": 0.5667, "step": 828 }, { "epoch": 1.8023919543354172, "grad_norm": 0.06958702951669693, "learning_rate": 1.4223080790595662e-05, "loss": 0.573, "step": 829 }, { "epoch": 1.8045664582767056, "grad_norm": 0.0608779639005661, "learning_rate": 1.4210668541101713e-05, "loss": 0.5034, "step": 830 }, { "epoch": 1.8067409622179942, "grad_norm": 0.07167661190032959, "learning_rate": 1.4198248401456057e-05, "loss": 0.7442, "step": 831 }, { "epoch": 1.8089154661592826, "grad_norm": 0.06457799673080444, "learning_rate": 1.4185820394932148e-05, "loss": 0.5682, "step": 832 }, { "epoch": 1.811089970100571, "grad_norm": 0.07647538185119629, "learning_rate": 1.417338454481818e-05, "loss": 0.5627, "step": 833 }, { "epoch": 1.8132644740418593, "grad_norm": 0.06366881728172302, "learning_rate": 1.4160940874417041e-05, "loss": 0.4773, "step": 834 }, { "epoch": 1.8154389779831477, "grad_norm": 0.06594689190387726, "learning_rate": 1.4148489407046274e-05, "loss": 0.5725, "step": 835 }, { "epoch": 1.817613481924436, "grad_norm": 0.06731311231851578, "learning_rate": 1.4136030166038031e-05, "loss": 0.5726, "step": 836 }, { "epoch": 1.8197879858657244, "grad_norm": 0.06387005001306534, "learning_rate": 1.4123563174739036e-05, "loss": 0.5227, "step": 837 }, { "epoch": 1.8219624898070128, "grad_norm": 0.0688241645693779, "learning_rate": 1.4111088456510531e-05, "loss": 0.5333, "step": 838 }, { "epoch": 1.8241369937483012, "grad_norm": 0.06998128443956375, "learning_rate": 1.409860603472824e-05, "loss": 0.4918, "step": 839 }, { "epoch": 1.8263114976895896, "grad_norm": 0.06779828667640686, "learning_rate": 1.4086115932782316e-05, "loss": 0.5247, "step": 840 }, { "epoch": 1.828486001630878, "grad_norm": 0.06935732811689377, "learning_rate": 1.4073618174077315e-05, "loss": 0.5409, "step": 841 }, { "epoch": 1.8306605055721663, "grad_norm": 0.06997125595808029, "learning_rate": 1.4061112782032126e-05, "loss": 0.6939, "step": 842 }, { "epoch": 1.8328350095134547, "grad_norm": 0.07509777694940567, "learning_rate": 1.4048599780079957e-05, "loss": 0.5167, "step": 843 }, { "epoch": 1.835009513454743, "grad_norm": 0.07312557101249695, "learning_rate": 1.4036079191668266e-05, "loss": 0.592, "step": 844 }, { "epoch": 1.8371840173960314, "grad_norm": 0.06642705202102661, "learning_rate": 1.4023551040258726e-05, "loss": 0.571, "step": 845 }, { "epoch": 1.8393585213373198, "grad_norm": 0.06936568766832352, "learning_rate": 1.4011015349327188e-05, "loss": 0.6301, "step": 846 }, { "epoch": 1.8415330252786082, "grad_norm": 0.06802138686180115, "learning_rate": 1.3998472142363628e-05, "loss": 0.5864, "step": 847 }, { "epoch": 1.8437075292198966, "grad_norm": 0.06581053882837296, "learning_rate": 1.3985921442872105e-05, "loss": 0.4711, "step": 848 }, { "epoch": 1.8458820331611852, "grad_norm": 0.06381881982088089, "learning_rate": 1.3973363274370722e-05, "loss": 0.5271, "step": 849 }, { "epoch": 1.8480565371024735, "grad_norm": 0.09898774325847626, "learning_rate": 1.396079766039157e-05, "loss": 0.7207, "step": 850 }, { "epoch": 1.850231041043762, "grad_norm": 0.06849417835474014, "learning_rate": 1.3948224624480697e-05, "loss": 0.5118, "step": 851 }, { "epoch": 1.8524055449850503, "grad_norm": 0.12313933670520782, "learning_rate": 1.3935644190198061e-05, "loss": 0.5799, "step": 852 }, { "epoch": 1.8545800489263387, "grad_norm": 0.06408977508544922, "learning_rate": 1.3923056381117479e-05, "loss": 0.5575, "step": 853 }, { "epoch": 1.856754552867627, "grad_norm": 0.07439928501844406, "learning_rate": 1.3910461220826585e-05, "loss": 0.5453, "step": 854 }, { "epoch": 1.8589290568089156, "grad_norm": 0.07531527429819107, "learning_rate": 1.3897858732926794e-05, "loss": 0.5749, "step": 855 }, { "epoch": 1.861103560750204, "grad_norm": 0.06447598338127136, "learning_rate": 1.3885248941033252e-05, "loss": 0.5395, "step": 856 }, { "epoch": 1.8632780646914924, "grad_norm": 0.07099034637212753, "learning_rate": 1.3872631868774789e-05, "loss": 0.588, "step": 857 }, { "epoch": 1.8654525686327807, "grad_norm": 0.06222882121801376, "learning_rate": 1.3860007539793872e-05, "loss": 0.48, "step": 858 }, { "epoch": 1.8676270725740691, "grad_norm": 0.07421304285526276, "learning_rate": 1.384737597774657e-05, "loss": 0.6431, "step": 859 }, { "epoch": 1.8698015765153575, "grad_norm": 0.06363499164581299, "learning_rate": 1.3834737206302519e-05, "loss": 0.4738, "step": 860 }, { "epoch": 1.8719760804566459, "grad_norm": 0.06778541207313538, "learning_rate": 1.382209124914484e-05, "loss": 0.5204, "step": 861 }, { "epoch": 1.8741505843979342, "grad_norm": 0.06782150268554688, "learning_rate": 1.3809438129970136e-05, "loss": 0.5446, "step": 862 }, { "epoch": 1.8763250883392226, "grad_norm": 0.06705275923013687, "learning_rate": 1.3796777872488427e-05, "loss": 0.5919, "step": 863 }, { "epoch": 1.878499592280511, "grad_norm": 0.06900067627429962, "learning_rate": 1.3784110500423104e-05, "loss": 0.5199, "step": 864 }, { "epoch": 1.8806740962217994, "grad_norm": 0.06157677248120308, "learning_rate": 1.3771436037510897e-05, "loss": 0.5029, "step": 865 }, { "epoch": 1.8828486001630877, "grad_norm": 0.07160373777151108, "learning_rate": 1.3758754507501817e-05, "loss": 0.5262, "step": 866 }, { "epoch": 1.8850231041043761, "grad_norm": 0.08398843556642532, "learning_rate": 1.3746065934159123e-05, "loss": 0.6335, "step": 867 }, { "epoch": 1.8871976080456645, "grad_norm": 0.06484688073396683, "learning_rate": 1.3733370341259265e-05, "loss": 0.5552, "step": 868 }, { "epoch": 1.8893721119869529, "grad_norm": 0.07313846796751022, "learning_rate": 1.3720667752591857e-05, "loss": 0.5609, "step": 869 }, { "epoch": 1.8915466159282412, "grad_norm": 0.07041696459054947, "learning_rate": 1.3707958191959609e-05, "loss": 0.6689, "step": 870 }, { "epoch": 1.8937211198695296, "grad_norm": 0.08569202572107315, "learning_rate": 1.369524168317831e-05, "loss": 0.482, "step": 871 }, { "epoch": 1.895895623810818, "grad_norm": 0.07040594518184662, "learning_rate": 1.3682518250076755e-05, "loss": 0.5056, "step": 872 }, { "epoch": 1.8980701277521066, "grad_norm": 0.059122782200574875, "learning_rate": 1.3669787916496722e-05, "loss": 0.466, "step": 873 }, { "epoch": 1.900244631693395, "grad_norm": 0.06426321715116501, "learning_rate": 1.365705070629292e-05, "loss": 0.5011, "step": 874 }, { "epoch": 1.9024191356346833, "grad_norm": 0.06532970815896988, "learning_rate": 1.3644306643332939e-05, "loss": 0.4884, "step": 875 }, { "epoch": 1.9045936395759717, "grad_norm": 0.06459186226129532, "learning_rate": 1.3631555751497214e-05, "loss": 0.4573, "step": 876 }, { "epoch": 1.90676814351726, "grad_norm": 0.06885459274053574, "learning_rate": 1.3618798054678972e-05, "loss": 0.5957, "step": 877 }, { "epoch": 1.9089426474585485, "grad_norm": 0.07539945095777512, "learning_rate": 1.3606033576784197e-05, "loss": 0.5521, "step": 878 }, { "epoch": 1.911117151399837, "grad_norm": 0.06804901361465454, "learning_rate": 1.3593262341731577e-05, "loss": 0.4877, "step": 879 }, { "epoch": 1.9132916553411254, "grad_norm": 0.06771431118249893, "learning_rate": 1.3580484373452462e-05, "loss": 0.5874, "step": 880 }, { "epoch": 1.9154661592824138, "grad_norm": 0.07059838622808456, "learning_rate": 1.3567699695890817e-05, "loss": 0.519, "step": 881 }, { "epoch": 1.9176406632237022, "grad_norm": 0.0645422637462616, "learning_rate": 1.355490833300318e-05, "loss": 0.5449, "step": 882 }, { "epoch": 1.9198151671649906, "grad_norm": 0.07668022066354752, "learning_rate": 1.3542110308758625e-05, "loss": 0.5836, "step": 883 }, { "epoch": 1.921989671106279, "grad_norm": 0.06644226610660553, "learning_rate": 1.3529305647138689e-05, "loss": 0.5795, "step": 884 }, { "epoch": 1.9241641750475673, "grad_norm": 0.06938440352678299, "learning_rate": 1.3516494372137368e-05, "loss": 0.6431, "step": 885 }, { "epoch": 1.9263386789888557, "grad_norm": 0.08686212450265884, "learning_rate": 1.3503676507761036e-05, "loss": 0.6295, "step": 886 }, { "epoch": 1.928513182930144, "grad_norm": 0.06303563714027405, "learning_rate": 1.3490852078028423e-05, "loss": 0.5808, "step": 887 }, { "epoch": 1.9306876868714324, "grad_norm": 0.06833554804325104, "learning_rate": 1.3478021106970552e-05, "loss": 0.5211, "step": 888 }, { "epoch": 1.9328621908127208, "grad_norm": 0.07347500324249268, "learning_rate": 1.346518361863071e-05, "loss": 0.5734, "step": 889 }, { "epoch": 1.9350366947540092, "grad_norm": 0.0686698630452156, "learning_rate": 1.34523396370644e-05, "loss": 0.5335, "step": 890 }, { "epoch": 1.9372111986952976, "grad_norm": 0.06444957107305527, "learning_rate": 1.3439489186339283e-05, "loss": 0.5363, "step": 891 }, { "epoch": 1.939385702636586, "grad_norm": 0.06830204278230667, "learning_rate": 1.3426632290535149e-05, "loss": 0.4966, "step": 892 }, { "epoch": 1.9415602065778743, "grad_norm": 0.06537693738937378, "learning_rate": 1.3413768973743863e-05, "loss": 0.5547, "step": 893 }, { "epoch": 1.9437347105191627, "grad_norm": 0.06726716458797455, "learning_rate": 1.3400899260069325e-05, "loss": 0.5482, "step": 894 }, { "epoch": 1.945909214460451, "grad_norm": 0.0636625736951828, "learning_rate": 1.3388023173627413e-05, "loss": 0.5394, "step": 895 }, { "epoch": 1.9480837184017394, "grad_norm": 0.08654431253671646, "learning_rate": 1.3375140738545954e-05, "loss": 0.8174, "step": 896 }, { "epoch": 1.950258222343028, "grad_norm": 0.07606187462806702, "learning_rate": 1.3362251978964675e-05, "loss": 0.5029, "step": 897 }, { "epoch": 1.9524327262843164, "grad_norm": 0.06829311698675156, "learning_rate": 1.334935691903514e-05, "loss": 0.5046, "step": 898 }, { "epoch": 1.9546072302256048, "grad_norm": 0.08110406249761581, "learning_rate": 1.3336455582920738e-05, "loss": 0.5663, "step": 899 }, { "epoch": 1.9567817341668932, "grad_norm": 0.07771358639001846, "learning_rate": 1.3323547994796597e-05, "loss": 0.5712, "step": 900 }, { "epoch": 1.9589562381081815, "grad_norm": 0.06912509351968765, "learning_rate": 1.3310634178849583e-05, "loss": 0.5421, "step": 901 }, { "epoch": 1.96113074204947, "grad_norm": 0.06802995502948761, "learning_rate": 1.3297714159278213e-05, "loss": 0.5347, "step": 902 }, { "epoch": 1.9633052459907585, "grad_norm": 0.12441185116767883, "learning_rate": 1.328478796029264e-05, "loss": 0.5775, "step": 903 }, { "epoch": 1.9654797499320469, "grad_norm": 0.07012849301099777, "learning_rate": 1.3271855606114593e-05, "loss": 0.5918, "step": 904 }, { "epoch": 1.9676542538733353, "grad_norm": 0.07766040414571762, "learning_rate": 1.3258917120977327e-05, "loss": 0.7218, "step": 905 }, { "epoch": 1.9698287578146236, "grad_norm": 0.08586324751377106, "learning_rate": 1.3245972529125609e-05, "loss": 0.5125, "step": 906 }, { "epoch": 1.972003261755912, "grad_norm": 0.0665990561246872, "learning_rate": 1.3233021854815617e-05, "loss": 0.5525, "step": 907 }, { "epoch": 1.9741777656972004, "grad_norm": 0.06829942762851715, "learning_rate": 1.322006512231495e-05, "loss": 0.5208, "step": 908 }, { "epoch": 1.9763522696384888, "grad_norm": 0.06830884516239166, "learning_rate": 1.3207102355902553e-05, "loss": 0.4718, "step": 909 }, { "epoch": 1.9785267735797771, "grad_norm": 0.06645084172487259, "learning_rate": 1.3194133579868672e-05, "loss": 0.5761, "step": 910 }, { "epoch": 1.9807012775210655, "grad_norm": 0.07676375657320023, "learning_rate": 1.3181158818514824e-05, "loss": 0.4726, "step": 911 }, { "epoch": 1.9828757814623539, "grad_norm": 0.08508909493684769, "learning_rate": 1.3168178096153732e-05, "loss": 0.6219, "step": 912 }, { "epoch": 1.9850502854036423, "grad_norm": 0.06981798261404037, "learning_rate": 1.3155191437109294e-05, "loss": 0.4881, "step": 913 }, { "epoch": 1.9872247893449306, "grad_norm": 0.0756353959441185, "learning_rate": 1.314219886571653e-05, "loss": 0.5864, "step": 914 }, { "epoch": 1.989399293286219, "grad_norm": 0.061641592532396317, "learning_rate": 1.3129200406321545e-05, "loss": 0.4612, "step": 915 }, { "epoch": 1.9915737972275074, "grad_norm": 0.06330305337905884, "learning_rate": 1.311619608328147e-05, "loss": 0.4604, "step": 916 }, { "epoch": 1.9937483011687958, "grad_norm": 0.08563757687807083, "learning_rate": 1.3103185920964425e-05, "loss": 0.4967, "step": 917 }, { "epoch": 1.9959228051100841, "grad_norm": 0.07061520963907242, "learning_rate": 1.3090169943749475e-05, "loss": 0.6641, "step": 918 }, { "epoch": 1.9980973090513725, "grad_norm": 0.06188228726387024, "learning_rate": 1.3077148176026579e-05, "loss": 0.4442, "step": 919 }, { "epoch": 2.0, "grad_norm": 0.07721112668514252, "learning_rate": 1.3064120642196549e-05, "loss": 0.5836, "step": 920 }, { "epoch": 2.0, "eval_loss": 0.5578185319900513, "eval_runtime": 11.5729, "eval_samples_per_second": 6.394, "eval_steps_per_second": 6.394, "step": 920 }, { "epoch": 2.0021745039412884, "grad_norm": 0.06920801848173141, "learning_rate": 1.3051087366670994e-05, "loss": 0.5087, "step": 921 }, { "epoch": 2.0043490078825767, "grad_norm": 0.08128421753644943, "learning_rate": 1.3038048373872297e-05, "loss": 0.6233, "step": 922 }, { "epoch": 2.006523511823865, "grad_norm": 0.06810339540243149, "learning_rate": 1.3025003688233533e-05, "loss": 0.5187, "step": 923 }, { "epoch": 2.0086980157651535, "grad_norm": 0.08062627166509628, "learning_rate": 1.3011953334198465e-05, "loss": 0.7373, "step": 924 }, { "epoch": 2.010872519706442, "grad_norm": 0.06998694688081741, "learning_rate": 1.299889733622147e-05, "loss": 0.5303, "step": 925 }, { "epoch": 2.0130470236477302, "grad_norm": 0.0724995955824852, "learning_rate": 1.2985835718767496e-05, "loss": 0.5892, "step": 926 }, { "epoch": 2.0152215275890186, "grad_norm": 0.06745114177465439, "learning_rate": 1.2972768506312028e-05, "loss": 0.4718, "step": 927 }, { "epoch": 2.017396031530307, "grad_norm": 0.07050897926092148, "learning_rate": 1.2959695723341028e-05, "loss": 0.5878, "step": 928 }, { "epoch": 2.0195705354715954, "grad_norm": 0.07519054412841797, "learning_rate": 1.294661739435091e-05, "loss": 0.482, "step": 929 }, { "epoch": 2.0217450394128837, "grad_norm": 0.06846711784601212, "learning_rate": 1.2933533543848462e-05, "loss": 0.6117, "step": 930 }, { "epoch": 2.023919543354172, "grad_norm": 0.06705359369516373, "learning_rate": 1.2920444196350829e-05, "loss": 0.4788, "step": 931 }, { "epoch": 2.026094047295461, "grad_norm": 0.06846786290407181, "learning_rate": 1.2907349376385461e-05, "loss": 0.5013, "step": 932 }, { "epoch": 2.0282685512367493, "grad_norm": 0.07133980840444565, "learning_rate": 1.2894249108490052e-05, "loss": 0.6602, "step": 933 }, { "epoch": 2.0304430551780377, "grad_norm": 0.06726638972759247, "learning_rate": 1.288114341721251e-05, "loss": 0.5995, "step": 934 }, { "epoch": 2.032617559119326, "grad_norm": 0.07460395991802216, "learning_rate": 1.2868032327110904e-05, "loss": 0.6337, "step": 935 }, { "epoch": 2.0347920630606144, "grad_norm": 0.07262850552797318, "learning_rate": 1.2854915862753424e-05, "loss": 0.6284, "step": 936 }, { "epoch": 2.036966567001903, "grad_norm": 0.06484069675207138, "learning_rate": 1.284179404871832e-05, "loss": 0.4838, "step": 937 }, { "epoch": 2.039141070943191, "grad_norm": 0.05943964794278145, "learning_rate": 1.2828666909593883e-05, "loss": 0.4267, "step": 938 }, { "epoch": 2.0413155748844796, "grad_norm": 0.0669209212064743, "learning_rate": 1.2815534469978363e-05, "loss": 0.4911, "step": 939 }, { "epoch": 2.043490078825768, "grad_norm": 0.07119748741388321, "learning_rate": 1.2802396754479958e-05, "loss": 0.602, "step": 940 }, { "epoch": 2.0456645827670563, "grad_norm": 0.06685557961463928, "learning_rate": 1.2789253787716747e-05, "loss": 0.4934, "step": 941 }, { "epoch": 2.0478390867083447, "grad_norm": 0.0688343271613121, "learning_rate": 1.2776105594316647e-05, "loss": 0.5316, "step": 942 }, { "epoch": 2.050013590649633, "grad_norm": 0.06347215920686722, "learning_rate": 1.2762952198917371e-05, "loss": 0.5303, "step": 943 }, { "epoch": 2.0521880945909214, "grad_norm": 0.07876726239919662, "learning_rate": 1.2749793626166377e-05, "loss": 0.5767, "step": 944 }, { "epoch": 2.05436259853221, "grad_norm": 0.07204129546880722, "learning_rate": 1.2736629900720832e-05, "loss": 0.648, "step": 945 }, { "epoch": 2.056537102473498, "grad_norm": 0.07100638747215271, "learning_rate": 1.2723461047247545e-05, "loss": 0.5308, "step": 946 }, { "epoch": 2.0587116064147866, "grad_norm": 0.06281851977109909, "learning_rate": 1.2710287090422948e-05, "loss": 0.5124, "step": 947 }, { "epoch": 2.060886110356075, "grad_norm": 0.06617816537618637, "learning_rate": 1.2697108054933027e-05, "loss": 0.4933, "step": 948 }, { "epoch": 2.0630606142973633, "grad_norm": 0.0709948018193245, "learning_rate": 1.2683923965473293e-05, "loss": 0.6524, "step": 949 }, { "epoch": 2.0652351182386517, "grad_norm": 0.07335580140352249, "learning_rate": 1.2670734846748717e-05, "loss": 0.5453, "step": 950 }, { "epoch": 2.06740962217994, "grad_norm": 0.07226086407899857, "learning_rate": 1.26575407234737e-05, "loss": 0.6191, "step": 951 }, { "epoch": 2.0695841261212284, "grad_norm": 0.07761870324611664, "learning_rate": 1.2644341620372025e-05, "loss": 0.5442, "step": 952 }, { "epoch": 2.071758630062517, "grad_norm": 0.06222078576683998, "learning_rate": 1.2631137562176792e-05, "loss": 0.4832, "step": 953 }, { "epoch": 2.073933134003805, "grad_norm": 0.08979668468236923, "learning_rate": 1.2617928573630405e-05, "loss": 0.6742, "step": 954 }, { "epoch": 2.0761076379450936, "grad_norm": 0.07135899364948273, "learning_rate": 1.260471467948449e-05, "loss": 0.6128, "step": 955 }, { "epoch": 2.0782821418863824, "grad_norm": 0.06428013741970062, "learning_rate": 1.2591495904499878e-05, "loss": 0.5296, "step": 956 }, { "epoch": 2.0804566458276708, "grad_norm": 0.06867013871669769, "learning_rate": 1.2578272273446536e-05, "loss": 0.6095, "step": 957 }, { "epoch": 2.082631149768959, "grad_norm": 0.08703179657459259, "learning_rate": 1.2565043811103541e-05, "loss": 0.5745, "step": 958 }, { "epoch": 2.0848056537102475, "grad_norm": 0.06571877747774124, "learning_rate": 1.255181054225901e-05, "loss": 0.4859, "step": 959 }, { "epoch": 2.086980157651536, "grad_norm": 0.06962002068758011, "learning_rate": 1.2538572491710079e-05, "loss": 0.5503, "step": 960 }, { "epoch": 2.0891546615928243, "grad_norm": 0.0695275291800499, "learning_rate": 1.2525329684262838e-05, "loss": 0.6179, "step": 961 }, { "epoch": 2.0913291655341126, "grad_norm": 0.060368090867996216, "learning_rate": 1.2512082144732292e-05, "loss": 0.4664, "step": 962 }, { "epoch": 2.093503669475401, "grad_norm": 0.06244010105729103, "learning_rate": 1.2498829897942308e-05, "loss": 0.5314, "step": 963 }, { "epoch": 2.0956781734166894, "grad_norm": 0.07320035248994827, "learning_rate": 1.2485572968725588e-05, "loss": 0.5608, "step": 964 }, { "epoch": 2.0978526773579778, "grad_norm": 0.07618407160043716, "learning_rate": 1.247231138192359e-05, "loss": 0.7556, "step": 965 }, { "epoch": 2.100027181299266, "grad_norm": 0.06959985196590424, "learning_rate": 1.2459045162386514e-05, "loss": 0.6141, "step": 966 }, { "epoch": 2.1022016852405545, "grad_norm": 0.06870408356189728, "learning_rate": 1.2445774334973229e-05, "loss": 0.5737, "step": 967 }, { "epoch": 2.104376189181843, "grad_norm": 0.06213134154677391, "learning_rate": 1.2432498924551256e-05, "loss": 0.6932, "step": 968 }, { "epoch": 2.1065506931231313, "grad_norm": 0.0773962140083313, "learning_rate": 1.2419218955996677e-05, "loss": 0.6245, "step": 969 }, { "epoch": 2.1087251970644196, "grad_norm": 0.07048165053129196, "learning_rate": 1.2405934454194146e-05, "loss": 0.4826, "step": 970 }, { "epoch": 2.110899701005708, "grad_norm": 0.06887306272983551, "learning_rate": 1.239264544403679e-05, "loss": 0.5718, "step": 971 }, { "epoch": 2.1130742049469964, "grad_norm": 0.06301311403512955, "learning_rate": 1.2379351950426188e-05, "loss": 0.4855, "step": 972 }, { "epoch": 2.1152487088882848, "grad_norm": 0.06743936985731125, "learning_rate": 1.236605399827233e-05, "loss": 0.5166, "step": 973 }, { "epoch": 2.117423212829573, "grad_norm": 0.07066462934017181, "learning_rate": 1.2352751612493548e-05, "loss": 0.5664, "step": 974 }, { "epoch": 2.1195977167708615, "grad_norm": 0.07165549695491791, "learning_rate": 1.2339444818016488e-05, "loss": 0.6756, "step": 975 }, { "epoch": 2.12177222071215, "grad_norm": 0.144772469997406, "learning_rate": 1.2326133639776054e-05, "loss": 0.5775, "step": 976 }, { "epoch": 2.1239467246534383, "grad_norm": 0.0969739630818367, "learning_rate": 1.2312818102715375e-05, "loss": 0.6141, "step": 977 }, { "epoch": 2.1261212285947266, "grad_norm": 0.0661269873380661, "learning_rate": 1.2299498231785739e-05, "loss": 0.4211, "step": 978 }, { "epoch": 2.1282957325360154, "grad_norm": 0.08805619180202484, "learning_rate": 1.2286174051946547e-05, "loss": 0.5736, "step": 979 }, { "epoch": 2.1304702364773034, "grad_norm": 0.07626120001077652, "learning_rate": 1.227284558816529e-05, "loss": 0.5555, "step": 980 }, { "epoch": 2.132644740418592, "grad_norm": 0.06552954763174057, "learning_rate": 1.2259512865417478e-05, "loss": 0.5363, "step": 981 }, { "epoch": 2.1348192443598806, "grad_norm": 0.06996779143810272, "learning_rate": 1.2246175908686607e-05, "loss": 0.5535, "step": 982 }, { "epoch": 2.136993748301169, "grad_norm": 0.06789004057645798, "learning_rate": 1.2232834742964093e-05, "loss": 0.5286, "step": 983 }, { "epoch": 2.1391682522424573, "grad_norm": 0.07126764953136444, "learning_rate": 1.2219489393249263e-05, "loss": 0.5794, "step": 984 }, { "epoch": 2.1413427561837457, "grad_norm": 0.08257946372032166, "learning_rate": 1.220613988454926e-05, "loss": 0.5685, "step": 985 }, { "epoch": 2.143517260125034, "grad_norm": 0.0710805281996727, "learning_rate": 1.2192786241879033e-05, "loss": 0.5892, "step": 986 }, { "epoch": 2.1456917640663224, "grad_norm": 0.06792116165161133, "learning_rate": 1.2179428490261278e-05, "loss": 0.5758, "step": 987 }, { "epoch": 2.147866268007611, "grad_norm": 0.06910964846611023, "learning_rate": 1.2166066654726387e-05, "loss": 0.6171, "step": 988 }, { "epoch": 2.150040771948899, "grad_norm": 0.06712738424539566, "learning_rate": 1.2152700760312401e-05, "loss": 0.4908, "step": 989 }, { "epoch": 2.1522152758901876, "grad_norm": 0.06706628948450089, "learning_rate": 1.2139330832064975e-05, "loss": 0.5597, "step": 990 }, { "epoch": 2.154389779831476, "grad_norm": 0.06264661997556686, "learning_rate": 1.2125956895037318e-05, "loss": 0.5123, "step": 991 }, { "epoch": 2.1565642837727643, "grad_norm": 0.06402754038572311, "learning_rate": 1.211257897429015e-05, "loss": 0.527, "step": 992 }, { "epoch": 2.1587387877140527, "grad_norm": 0.060939688235521317, "learning_rate": 1.2099197094891659e-05, "loss": 0.517, "step": 993 }, { "epoch": 2.160913291655341, "grad_norm": 0.06698132306337357, "learning_rate": 1.2085811281917453e-05, "loss": 0.5441, "step": 994 }, { "epoch": 2.1630877955966294, "grad_norm": 0.06612163037061691, "learning_rate": 1.2072421560450497e-05, "loss": 0.5779, "step": 995 }, { "epoch": 2.165262299537918, "grad_norm": 0.06196536123752594, "learning_rate": 1.20590279555811e-05, "loss": 0.5189, "step": 996 }, { "epoch": 2.167436803479206, "grad_norm": 0.0697999894618988, "learning_rate": 1.2045630492406835e-05, "loss": 0.5922, "step": 997 }, { "epoch": 2.1696113074204946, "grad_norm": 0.06350264698266983, "learning_rate": 1.2032229196032508e-05, "loss": 0.5125, "step": 998 }, { "epoch": 2.171785811361783, "grad_norm": 0.09382516145706177, "learning_rate": 1.2018824091570103e-05, "loss": 0.6525, "step": 999 }, { "epoch": 2.1739603153030713, "grad_norm": 0.08568950742483139, "learning_rate": 1.2005415204138753e-05, "loss": 0.5312, "step": 1000 }, { "epoch": 2.1761348192443597, "grad_norm": 0.06920292228460312, "learning_rate": 1.1992002558864668e-05, "loss": 0.4767, "step": 1001 }, { "epoch": 2.178309323185648, "grad_norm": 0.06867019832134247, "learning_rate": 1.19785861808811e-05, "loss": 0.4709, "step": 1002 }, { "epoch": 2.1804838271269364, "grad_norm": 0.07175212353467941, "learning_rate": 1.1965166095328302e-05, "loss": 0.5962, "step": 1003 }, { "epoch": 2.1826583310682253, "grad_norm": 0.06831399351358414, "learning_rate": 1.1951742327353471e-05, "loss": 0.5284, "step": 1004 }, { "epoch": 2.1848328350095136, "grad_norm": 0.06634752452373505, "learning_rate": 1.1938314902110701e-05, "loss": 0.6213, "step": 1005 }, { "epoch": 2.187007338950802, "grad_norm": 0.07255899906158447, "learning_rate": 1.1924883844760944e-05, "loss": 0.5945, "step": 1006 }, { "epoch": 2.1891818428920904, "grad_norm": 0.07134190946817398, "learning_rate": 1.1911449180471962e-05, "loss": 0.5868, "step": 1007 }, { "epoch": 2.1913563468333788, "grad_norm": 0.08981459587812424, "learning_rate": 1.1898010934418261e-05, "loss": 0.6796, "step": 1008 }, { "epoch": 2.193530850774667, "grad_norm": 0.07578856498003006, "learning_rate": 1.1884569131781069e-05, "loss": 0.5902, "step": 1009 }, { "epoch": 2.1957053547159555, "grad_norm": 0.0669521614909172, "learning_rate": 1.1871123797748285e-05, "loss": 0.5431, "step": 1010 }, { "epoch": 2.197879858657244, "grad_norm": 0.06674208492040634, "learning_rate": 1.1857674957514411e-05, "loss": 0.5247, "step": 1011 }, { "epoch": 2.2000543625985323, "grad_norm": 0.06886722147464752, "learning_rate": 1.184422263628053e-05, "loss": 0.6153, "step": 1012 }, { "epoch": 2.2022288665398206, "grad_norm": 0.0678599402308464, "learning_rate": 1.1830766859254234e-05, "loss": 0.5181, "step": 1013 }, { "epoch": 2.204403370481109, "grad_norm": 0.0890408605337143, "learning_rate": 1.1817307651649616e-05, "loss": 0.8129, "step": 1014 }, { "epoch": 2.2065778744223974, "grad_norm": 0.06546042859554291, "learning_rate": 1.1803845038687171e-05, "loss": 0.5414, "step": 1015 }, { "epoch": 2.2087523783636858, "grad_norm": 0.06890612840652466, "learning_rate": 1.1790379045593787e-05, "loss": 0.5612, "step": 1016 }, { "epoch": 2.210926882304974, "grad_norm": 0.06217518821358681, "learning_rate": 1.177690969760269e-05, "loss": 0.4351, "step": 1017 }, { "epoch": 2.2131013862462625, "grad_norm": 0.06790955364704132, "learning_rate": 1.1763437019953377e-05, "loss": 0.5484, "step": 1018 }, { "epoch": 2.215275890187551, "grad_norm": 0.07314374297857285, "learning_rate": 1.1749961037891605e-05, "loss": 0.6457, "step": 1019 }, { "epoch": 2.2174503941288393, "grad_norm": 0.07015176862478256, "learning_rate": 1.1736481776669307e-05, "loss": 0.6256, "step": 1020 }, { "epoch": 2.2196248980701276, "grad_norm": 0.06909603625535965, "learning_rate": 1.1722999261544565e-05, "loss": 0.4936, "step": 1021 }, { "epoch": 2.221799402011416, "grad_norm": 0.06084631755948067, "learning_rate": 1.1709513517781563e-05, "loss": 0.5226, "step": 1022 }, { "epoch": 2.2239739059527044, "grad_norm": 0.0731106624007225, "learning_rate": 1.1696024570650528e-05, "loss": 0.5364, "step": 1023 }, { "epoch": 2.2261484098939928, "grad_norm": 0.06578187644481659, "learning_rate": 1.16825324454277e-05, "loss": 0.5538, "step": 1024 }, { "epoch": 2.228322913835281, "grad_norm": 0.07405096292495728, "learning_rate": 1.1669037167395256e-05, "loss": 0.5204, "step": 1025 }, { "epoch": 2.2304974177765695, "grad_norm": 0.06306886672973633, "learning_rate": 1.16555387618413e-05, "loss": 0.5382, "step": 1026 }, { "epoch": 2.2326719217178583, "grad_norm": 0.061942584812641144, "learning_rate": 1.164203725405979e-05, "loss": 0.5359, "step": 1027 }, { "epoch": 2.2348464256591467, "grad_norm": 0.060669250786304474, "learning_rate": 1.162853266935049e-05, "loss": 0.5003, "step": 1028 }, { "epoch": 2.237020929600435, "grad_norm": 0.062639020383358, "learning_rate": 1.1615025033018937e-05, "loss": 0.5079, "step": 1029 }, { "epoch": 2.2391954335417235, "grad_norm": 0.07334067672491074, "learning_rate": 1.160151437037639e-05, "loss": 0.5324, "step": 1030 }, { "epoch": 2.241369937483012, "grad_norm": 0.06911758333444595, "learning_rate": 1.1588000706739768e-05, "loss": 0.4923, "step": 1031 }, { "epoch": 2.2435444414243, "grad_norm": 0.06459035724401474, "learning_rate": 1.1574484067431617e-05, "loss": 0.5185, "step": 1032 }, { "epoch": 2.2457189453655886, "grad_norm": 0.0792449563741684, "learning_rate": 1.156096447778007e-05, "loss": 0.6189, "step": 1033 }, { "epoch": 2.247893449306877, "grad_norm": 0.06475230306386948, "learning_rate": 1.1547441963118771e-05, "loss": 0.5343, "step": 1034 }, { "epoch": 2.2500679532481653, "grad_norm": 0.06450542062520981, "learning_rate": 1.1533916548786856e-05, "loss": 0.6554, "step": 1035 }, { "epoch": 2.2522424571894537, "grad_norm": 0.07294163107872009, "learning_rate": 1.152038826012889e-05, "loss": 0.6233, "step": 1036 }, { "epoch": 2.254416961130742, "grad_norm": 0.07336447387933731, "learning_rate": 1.1506857122494832e-05, "loss": 0.5458, "step": 1037 }, { "epoch": 2.2565914650720305, "grad_norm": 0.0679054781794548, "learning_rate": 1.149332316123997e-05, "loss": 0.4549, "step": 1038 }, { "epoch": 2.258765969013319, "grad_norm": 0.06410389393568039, "learning_rate": 1.1479786401724884e-05, "loss": 0.5989, "step": 1039 }, { "epoch": 2.260940472954607, "grad_norm": 0.07650402188301086, "learning_rate": 1.1466246869315407e-05, "loss": 0.6197, "step": 1040 }, { "epoch": 2.2631149768958956, "grad_norm": 0.061269864439964294, "learning_rate": 1.145270458938255e-05, "loss": 0.4555, "step": 1041 }, { "epoch": 2.265289480837184, "grad_norm": 0.07264136523008347, "learning_rate": 1.1439159587302495e-05, "loss": 0.5652, "step": 1042 }, { "epoch": 2.2674639847784723, "grad_norm": 0.07271306216716766, "learning_rate": 1.1425611888456509e-05, "loss": 0.5415, "step": 1043 }, { "epoch": 2.2696384887197607, "grad_norm": 0.07464227825403214, "learning_rate": 1.1412061518230916e-05, "loss": 0.588, "step": 1044 }, { "epoch": 2.271812992661049, "grad_norm": 0.06809539347887039, "learning_rate": 1.1398508502017047e-05, "loss": 0.512, "step": 1045 }, { "epoch": 2.2739874966023375, "grad_norm": 0.058565836399793625, "learning_rate": 1.1384952865211188e-05, "loss": 0.5155, "step": 1046 }, { "epoch": 2.276162000543626, "grad_norm": 0.06849224865436554, "learning_rate": 1.1371394633214548e-05, "loss": 0.665, "step": 1047 }, { "epoch": 2.278336504484914, "grad_norm": 0.06861334294080734, "learning_rate": 1.1357833831433176e-05, "loss": 0.577, "step": 1048 }, { "epoch": 2.2805110084262026, "grad_norm": 0.06721647828817368, "learning_rate": 1.1344270485277958e-05, "loss": 0.481, "step": 1049 }, { "epoch": 2.2826855123674914, "grad_norm": 0.07307053357362747, "learning_rate": 1.133070462016454e-05, "loss": 0.5, "step": 1050 }, { "epoch": 2.2848600163087793, "grad_norm": 0.0716986134648323, "learning_rate": 1.1317136261513281e-05, "loss": 0.5542, "step": 1051 }, { "epoch": 2.287034520250068, "grad_norm": 0.07218325883150101, "learning_rate": 1.1303565434749225e-05, "loss": 0.6113, "step": 1052 }, { "epoch": 2.2892090241913565, "grad_norm": 0.07014322280883789, "learning_rate": 1.1289992165302036e-05, "loss": 0.5157, "step": 1053 }, { "epoch": 2.291383528132645, "grad_norm": 0.07056258618831635, "learning_rate": 1.127641647860595e-05, "loss": 0.6359, "step": 1054 }, { "epoch": 2.2935580320739333, "grad_norm": 0.07968546450138092, "learning_rate": 1.1262838400099733e-05, "loss": 0.467, "step": 1055 }, { "epoch": 2.2957325360152216, "grad_norm": 0.07766050100326538, "learning_rate": 1.1249257955226649e-05, "loss": 0.5635, "step": 1056 }, { "epoch": 2.29790703995651, "grad_norm": 0.06866757571697235, "learning_rate": 1.1235675169434372e-05, "loss": 0.6114, "step": 1057 }, { "epoch": 2.3000815438977984, "grad_norm": 0.06924376636743546, "learning_rate": 1.1222090068174983e-05, "loss": 0.586, "step": 1058 }, { "epoch": 2.3022560478390868, "grad_norm": 0.06507083773612976, "learning_rate": 1.1208502676904887e-05, "loss": 0.5317, "step": 1059 }, { "epoch": 2.304430551780375, "grad_norm": 0.06696628034114838, "learning_rate": 1.119491302108479e-05, "loss": 0.5352, "step": 1060 }, { "epoch": 2.3066050557216635, "grad_norm": 0.07117165625095367, "learning_rate": 1.1181321126179637e-05, "loss": 0.5771, "step": 1061 }, { "epoch": 2.308779559662952, "grad_norm": 0.05862511321902275, "learning_rate": 1.1167727017658562e-05, "loss": 0.4934, "step": 1062 }, { "epoch": 2.3109540636042403, "grad_norm": 0.0715501606464386, "learning_rate": 1.115413072099487e-05, "loss": 0.5318, "step": 1063 }, { "epoch": 2.3131285675455286, "grad_norm": 0.08439181745052338, "learning_rate": 1.1140532261665937e-05, "loss": 0.536, "step": 1064 }, { "epoch": 2.315303071486817, "grad_norm": 0.07143078744411469, "learning_rate": 1.1126931665153213e-05, "loss": 0.5793, "step": 1065 }, { "epoch": 2.3174775754281054, "grad_norm": 0.07027513533830643, "learning_rate": 1.1113328956942139e-05, "loss": 0.5794, "step": 1066 }, { "epoch": 2.3196520793693938, "grad_norm": 0.06756500899791718, "learning_rate": 1.1099724162522125e-05, "loss": 0.5146, "step": 1067 }, { "epoch": 2.321826583310682, "grad_norm": 0.06300317496061325, "learning_rate": 1.1086117307386481e-05, "loss": 0.5892, "step": 1068 }, { "epoch": 2.3240010872519705, "grad_norm": 0.06686022877693176, "learning_rate": 1.1072508417032382e-05, "loss": 0.5233, "step": 1069 }, { "epoch": 2.326175591193259, "grad_norm": 0.06862420588731766, "learning_rate": 1.1058897516960817e-05, "loss": 0.5528, "step": 1070 }, { "epoch": 2.3283500951345473, "grad_norm": 0.07075595110654831, "learning_rate": 1.1045284632676535e-05, "loss": 0.5942, "step": 1071 }, { "epoch": 2.3305245990758356, "grad_norm": 0.06664254516363144, "learning_rate": 1.1031669789688017e-05, "loss": 0.5133, "step": 1072 }, { "epoch": 2.3326991030171245, "grad_norm": 0.07058419287204742, "learning_rate": 1.10180530135074e-05, "loss": 0.5196, "step": 1073 }, { "epoch": 2.3348736069584124, "grad_norm": 0.07997550815343857, "learning_rate": 1.1004434329650453e-05, "loss": 0.5465, "step": 1074 }, { "epoch": 2.337048110899701, "grad_norm": 0.08553148061037064, "learning_rate": 1.0990813763636511e-05, "loss": 0.6605, "step": 1075 }, { "epoch": 2.3392226148409896, "grad_norm": 0.06632629036903381, "learning_rate": 1.0977191340988447e-05, "loss": 0.7119, "step": 1076 }, { "epoch": 2.341397118782278, "grad_norm": 0.06989524513483047, "learning_rate": 1.09635670872326e-05, "loss": 0.5221, "step": 1077 }, { "epoch": 2.3435716227235663, "grad_norm": 0.07591139525175095, "learning_rate": 1.094994102789875e-05, "loss": 0.4899, "step": 1078 }, { "epoch": 2.3457461266648547, "grad_norm": 0.06684182584285736, "learning_rate": 1.093631318852006e-05, "loss": 0.4861, "step": 1079 }, { "epoch": 2.347920630606143, "grad_norm": 0.06115030124783516, "learning_rate": 1.092268359463302e-05, "loss": 0.4952, "step": 1080 }, { "epoch": 2.3500951345474315, "grad_norm": 0.06786268949508667, "learning_rate": 1.090905227177742e-05, "loss": 0.5076, "step": 1081 }, { "epoch": 2.35226963848872, "grad_norm": 0.0675954669713974, "learning_rate": 1.0895419245496282e-05, "loss": 0.4937, "step": 1082 }, { "epoch": 2.354444142430008, "grad_norm": 0.06312787532806396, "learning_rate": 1.0881784541335818e-05, "loss": 0.5337, "step": 1083 }, { "epoch": 2.3566186463712966, "grad_norm": 0.07176151871681213, "learning_rate": 1.0868148184845391e-05, "loss": 0.467, "step": 1084 }, { "epoch": 2.358793150312585, "grad_norm": 0.07740528136491776, "learning_rate": 1.0854510201577451e-05, "loss": 0.598, "step": 1085 }, { "epoch": 2.3609676542538733, "grad_norm": 0.06273774057626724, "learning_rate": 1.0840870617087513e-05, "loss": 0.4699, "step": 1086 }, { "epoch": 2.3631421581951617, "grad_norm": 0.0632958859205246, "learning_rate": 1.0827229456934068e-05, "loss": 0.5068, "step": 1087 }, { "epoch": 2.36531666213645, "grad_norm": 0.06725380569696426, "learning_rate": 1.0813586746678584e-05, "loss": 0.532, "step": 1088 }, { "epoch": 2.3674911660777385, "grad_norm": 0.06564852595329285, "learning_rate": 1.0799942511885417e-05, "loss": 0.4504, "step": 1089 }, { "epoch": 2.369665670019027, "grad_norm": 0.07417241483926773, "learning_rate": 1.0786296778121787e-05, "loss": 0.5274, "step": 1090 }, { "epoch": 2.371840173960315, "grad_norm": 0.08923973143100739, "learning_rate": 1.0772649570957723e-05, "loss": 0.6727, "step": 1091 }, { "epoch": 2.3740146779016036, "grad_norm": 0.06518397480249405, "learning_rate": 1.0759000915966011e-05, "loss": 0.5474, "step": 1092 }, { "epoch": 2.376189181842892, "grad_norm": 0.07100885361433029, "learning_rate": 1.0745350838722153e-05, "loss": 0.5123, "step": 1093 }, { "epoch": 2.3783636857841803, "grad_norm": 0.07622502744197845, "learning_rate": 1.0731699364804312e-05, "loss": 0.5263, "step": 1094 }, { "epoch": 2.3805381897254687, "grad_norm": 0.06200535595417023, "learning_rate": 1.0718046519793276e-05, "loss": 0.4544, "step": 1095 }, { "epoch": 2.382712693666757, "grad_norm": 0.06931181252002716, "learning_rate": 1.0704392329272396e-05, "loss": 0.5514, "step": 1096 }, { "epoch": 2.3848871976080455, "grad_norm": 0.06804732233285904, "learning_rate": 1.0690736818827547e-05, "loss": 0.533, "step": 1097 }, { "epoch": 2.3870617015493343, "grad_norm": 0.06115454435348511, "learning_rate": 1.0677080014047076e-05, "loss": 0.4896, "step": 1098 }, { "epoch": 2.389236205490622, "grad_norm": 0.06372347474098206, "learning_rate": 1.0663421940521753e-05, "loss": 0.4686, "step": 1099 }, { "epoch": 2.391410709431911, "grad_norm": 0.0840807631611824, "learning_rate": 1.0649762623844733e-05, "loss": 0.5975, "step": 1100 }, { "epoch": 2.3935852133731994, "grad_norm": 0.06863995641469955, "learning_rate": 1.063610208961149e-05, "loss": 0.4821, "step": 1101 }, { "epoch": 2.395759717314488, "grad_norm": 0.06574525684118271, "learning_rate": 1.062244036341979e-05, "loss": 0.574, "step": 1102 }, { "epoch": 2.397934221255776, "grad_norm": 0.06965085864067078, "learning_rate": 1.0608777470869625e-05, "loss": 0.556, "step": 1103 }, { "epoch": 2.4001087251970645, "grad_norm": 0.0772022008895874, "learning_rate": 1.0595113437563175e-05, "loss": 0.5517, "step": 1104 }, { "epoch": 2.402283229138353, "grad_norm": 0.058444470167160034, "learning_rate": 1.0581448289104759e-05, "loss": 0.425, "step": 1105 }, { "epoch": 2.4044577330796413, "grad_norm": 0.07089883089065552, "learning_rate": 1.0567782051100786e-05, "loss": 0.5389, "step": 1106 }, { "epoch": 2.4066322370209297, "grad_norm": 0.0685463547706604, "learning_rate": 1.05541147491597e-05, "loss": 0.5853, "step": 1107 }, { "epoch": 2.408806740962218, "grad_norm": 0.061750102788209915, "learning_rate": 1.0540446408891949e-05, "loss": 0.528, "step": 1108 }, { "epoch": 2.4109812449035064, "grad_norm": 0.06702980399131775, "learning_rate": 1.052677705590992e-05, "loss": 0.4671, "step": 1109 }, { "epoch": 2.413155748844795, "grad_norm": 0.07064638286828995, "learning_rate": 1.0513106715827897e-05, "loss": 0.5351, "step": 1110 }, { "epoch": 2.415330252786083, "grad_norm": 0.07001657038927078, "learning_rate": 1.0499435414262018e-05, "loss": 0.5337, "step": 1111 }, { "epoch": 2.4175047567273715, "grad_norm": 0.0681389570236206, "learning_rate": 1.0485763176830221e-05, "loss": 0.52, "step": 1112 }, { "epoch": 2.41967926066866, "grad_norm": 0.06822334229946136, "learning_rate": 1.0472090029152196e-05, "loss": 0.5519, "step": 1113 }, { "epoch": 2.4218537646099483, "grad_norm": 0.06658223271369934, "learning_rate": 1.0458415996849337e-05, "loss": 0.4764, "step": 1114 }, { "epoch": 2.4240282685512367, "grad_norm": 0.07526403665542603, "learning_rate": 1.0444741105544705e-05, "loss": 0.4908, "step": 1115 }, { "epoch": 2.426202772492525, "grad_norm": 0.07005507498979568, "learning_rate": 1.0431065380862959e-05, "loss": 0.4624, "step": 1116 }, { "epoch": 2.4283772764338134, "grad_norm": 0.07718145102262497, "learning_rate": 1.0417388848430323e-05, "loss": 0.5534, "step": 1117 }, { "epoch": 2.430551780375102, "grad_norm": 0.07241858541965485, "learning_rate": 1.0403711533874541e-05, "loss": 0.5561, "step": 1118 }, { "epoch": 2.43272628431639, "grad_norm": 0.06385207176208496, "learning_rate": 1.0390033462824817e-05, "loss": 0.484, "step": 1119 }, { "epoch": 2.4349007882576785, "grad_norm": 0.06747225672006607, "learning_rate": 1.0376354660911772e-05, "loss": 0.5176, "step": 1120 }, { "epoch": 2.4370752921989673, "grad_norm": 0.06791368126869202, "learning_rate": 1.0362675153767398e-05, "loss": 0.5112, "step": 1121 }, { "epoch": 2.4392497961402553, "grad_norm": 0.06782704591751099, "learning_rate": 1.0348994967025012e-05, "loss": 0.5848, "step": 1122 }, { "epoch": 2.441424300081544, "grad_norm": 0.09157824516296387, "learning_rate": 1.0335314126319196e-05, "loss": 0.7414, "step": 1123 }, { "epoch": 2.4435988040228325, "grad_norm": 0.0603589303791523, "learning_rate": 1.0321632657285763e-05, "loss": 0.4656, "step": 1124 }, { "epoch": 2.445773307964121, "grad_norm": 0.07146688550710678, "learning_rate": 1.0307950585561705e-05, "loss": 0.5594, "step": 1125 }, { "epoch": 2.447947811905409, "grad_norm": 0.07382224500179291, "learning_rate": 1.029426793678514e-05, "loss": 0.5147, "step": 1126 }, { "epoch": 2.4501223158466976, "grad_norm": 0.08096981793642044, "learning_rate": 1.028058473659527e-05, "loss": 0.5769, "step": 1127 }, { "epoch": 2.452296819787986, "grad_norm": 0.0716109350323677, "learning_rate": 1.0266901010632324e-05, "loss": 0.5543, "step": 1128 }, { "epoch": 2.4544713237292743, "grad_norm": 0.09528513252735138, "learning_rate": 1.0253216784537527e-05, "loss": 0.6797, "step": 1129 }, { "epoch": 2.4566458276705627, "grad_norm": 0.06395294517278671, "learning_rate": 1.0239532083953032e-05, "loss": 0.5157, "step": 1130 }, { "epoch": 2.458820331611851, "grad_norm": 0.06381714344024658, "learning_rate": 1.0225846934521881e-05, "loss": 0.5147, "step": 1131 }, { "epoch": 2.4609948355531395, "grad_norm": 0.06876876205205917, "learning_rate": 1.021216136188797e-05, "loss": 0.4911, "step": 1132 }, { "epoch": 2.463169339494428, "grad_norm": 0.06143437698483467, "learning_rate": 1.0198475391695966e-05, "loss": 0.4927, "step": 1133 }, { "epoch": 2.465343843435716, "grad_norm": 0.06949868053197861, "learning_rate": 1.01847890495913e-05, "loss": 0.5127, "step": 1134 }, { "epoch": 2.4675183473770046, "grad_norm": 0.0653749406337738, "learning_rate": 1.0171102361220093e-05, "loss": 0.5687, "step": 1135 }, { "epoch": 2.469692851318293, "grad_norm": 0.06519146263599396, "learning_rate": 1.0157415352229115e-05, "loss": 0.498, "step": 1136 }, { "epoch": 2.4718673552595813, "grad_norm": 0.06614164263010025, "learning_rate": 1.0143728048265735e-05, "loss": 0.4716, "step": 1137 }, { "epoch": 2.4740418592008697, "grad_norm": 0.07097812741994858, "learning_rate": 1.0130040474977878e-05, "loss": 0.6112, "step": 1138 }, { "epoch": 2.476216363142158, "grad_norm": 0.06316900253295898, "learning_rate": 1.0116352658013973e-05, "loss": 0.4521, "step": 1139 }, { "epoch": 2.4783908670834465, "grad_norm": 0.07118026912212372, "learning_rate": 1.01026646230229e-05, "loss": 0.5444, "step": 1140 }, { "epoch": 2.480565371024735, "grad_norm": 0.06543668359518051, "learning_rate": 1.008897639565396e-05, "loss": 0.4538, "step": 1141 }, { "epoch": 2.482739874966023, "grad_norm": 0.06957754492759705, "learning_rate": 1.0075288001556798e-05, "loss": 0.4515, "step": 1142 }, { "epoch": 2.4849143789073116, "grad_norm": 0.0697249248623848, "learning_rate": 1.0061599466381388e-05, "loss": 0.5844, "step": 1143 }, { "epoch": 2.4870888828486, "grad_norm": 0.06559092551469803, "learning_rate": 1.0047910815777956e-05, "loss": 0.5539, "step": 1144 }, { "epoch": 2.4892633867898883, "grad_norm": 0.06857336312532425, "learning_rate": 1.0034222075396954e-05, "loss": 0.4682, "step": 1145 }, { "epoch": 2.491437890731177, "grad_norm": 0.07824619859457016, "learning_rate": 1.002053327088899e-05, "loss": 0.655, "step": 1146 }, { "epoch": 2.4936123946724655, "grad_norm": 0.08722686767578125, "learning_rate": 1.00068444279048e-05, "loss": 0.6018, "step": 1147 }, { "epoch": 2.495786898613754, "grad_norm": 0.06494418531656265, "learning_rate": 9.993155572095199e-06, "loss": 0.5218, "step": 1148 }, { "epoch": 2.4979614025550423, "grad_norm": 0.07330279052257538, "learning_rate": 9.979466729111014e-06, "loss": 0.5498, "step": 1149 }, { "epoch": 2.5001359064963307, "grad_norm": 0.07125426083803177, "learning_rate": 9.965777924603053e-06, "loss": 0.5908, "step": 1150 }, { "epoch": 2.502310410437619, "grad_norm": 0.07575562596321106, "learning_rate": 9.952089184222045e-06, "loss": 0.5674, "step": 1151 }, { "epoch": 2.5044849143789074, "grad_norm": 0.06985179334878922, "learning_rate": 9.938400533618615e-06, "loss": 0.4704, "step": 1152 }, { "epoch": 2.506659418320196, "grad_norm": 0.0643727257847786, "learning_rate": 9.924711998443205e-06, "loss": 0.5557, "step": 1153 }, { "epoch": 2.508833922261484, "grad_norm": 0.06342273950576782, "learning_rate": 9.911023604346043e-06, "loss": 0.5355, "step": 1154 }, { "epoch": 2.5110084262027725, "grad_norm": 0.06732742488384247, "learning_rate": 9.897335376977104e-06, "loss": 0.5543, "step": 1155 }, { "epoch": 2.513182930144061, "grad_norm": 0.07165748625993729, "learning_rate": 9.883647341986032e-06, "loss": 0.5579, "step": 1156 }, { "epoch": 2.5153574340853493, "grad_norm": 0.07147862762212753, "learning_rate": 9.869959525022124e-06, "loss": 0.5405, "step": 1157 }, { "epoch": 2.5175319380266377, "grad_norm": 0.06840886175632477, "learning_rate": 9.856271951734268e-06, "loss": 0.4981, "step": 1158 }, { "epoch": 2.519706441967926, "grad_norm": 0.07428555935621262, "learning_rate": 9.84258464777089e-06, "loss": 0.5671, "step": 1159 }, { "epoch": 2.5218809459092144, "grad_norm": 0.06570599973201752, "learning_rate": 9.828897638779909e-06, "loss": 0.518, "step": 1160 }, { "epoch": 2.524055449850503, "grad_norm": 0.06721179932355881, "learning_rate": 9.815210950408703e-06, "loss": 0.5091, "step": 1161 }, { "epoch": 2.526229953791791, "grad_norm": 0.07317999750375748, "learning_rate": 9.801524608304036e-06, "loss": 0.5512, "step": 1162 }, { "epoch": 2.5284044577330795, "grad_norm": 0.06572877615690231, "learning_rate": 9.787838638112034e-06, "loss": 0.5208, "step": 1163 }, { "epoch": 2.530578961674368, "grad_norm": 0.06959232687950134, "learning_rate": 9.774153065478122e-06, "loss": 0.5652, "step": 1164 }, { "epoch": 2.5327534656156563, "grad_norm": 0.06583401560783386, "learning_rate": 9.760467916046971e-06, "loss": 0.5355, "step": 1165 }, { "epoch": 2.5349279695569447, "grad_norm": 0.07404012978076935, "learning_rate": 9.746783215462476e-06, "loss": 0.5749, "step": 1166 }, { "epoch": 2.5371024734982335, "grad_norm": 0.0729111060500145, "learning_rate": 9.733098989367677e-06, "loss": 0.5499, "step": 1167 }, { "epoch": 2.5392769774395214, "grad_norm": 0.09916707873344421, "learning_rate": 9.719415263404732e-06, "loss": 0.5773, "step": 1168 }, { "epoch": 2.5414514813808102, "grad_norm": 0.07055360823869705, "learning_rate": 9.705732063214864e-06, "loss": 0.5612, "step": 1169 }, { "epoch": 2.543625985322098, "grad_norm": 0.06916360557079315, "learning_rate": 9.692049414438298e-06, "loss": 0.5548, "step": 1170 }, { "epoch": 2.545800489263387, "grad_norm": 0.06435995548963547, "learning_rate": 9.678367342714238e-06, "loss": 0.4936, "step": 1171 }, { "epoch": 2.547974993204675, "grad_norm": 0.08049161732196808, "learning_rate": 9.664685873680807e-06, "loss": 0.6725, "step": 1172 }, { "epoch": 2.5501494971459637, "grad_norm": 0.06180369108915329, "learning_rate": 9.651005032974994e-06, "loss": 0.4903, "step": 1173 }, { "epoch": 2.552324001087252, "grad_norm": 0.06488041579723358, "learning_rate": 9.637324846232603e-06, "loss": 0.4993, "step": 1174 }, { "epoch": 2.5544985050285405, "grad_norm": 0.06484771519899368, "learning_rate": 9.62364533908823e-06, "loss": 0.4327, "step": 1175 }, { "epoch": 2.556673008969829, "grad_norm": 0.0683218315243721, "learning_rate": 9.609966537175186e-06, "loss": 0.526, "step": 1176 }, { "epoch": 2.5588475129111172, "grad_norm": 0.06459216773509979, "learning_rate": 9.59628846612546e-06, "loss": 0.4511, "step": 1177 }, { "epoch": 2.5610220168524056, "grad_norm": 0.08217163383960724, "learning_rate": 9.58261115156968e-06, "loss": 0.6853, "step": 1178 }, { "epoch": 2.563196520793694, "grad_norm": 0.07259220629930496, "learning_rate": 9.568934619137048e-06, "loss": 0.5083, "step": 1179 }, { "epoch": 2.5653710247349824, "grad_norm": 0.06782633066177368, "learning_rate": 9.555258894455298e-06, "loss": 0.5039, "step": 1180 }, { "epoch": 2.5675455286762707, "grad_norm": 0.0894932821393013, "learning_rate": 9.541584003150664e-06, "loss": 0.8831, "step": 1181 }, { "epoch": 2.569720032617559, "grad_norm": 0.07626113295555115, "learning_rate": 9.527909970847809e-06, "loss": 0.5802, "step": 1182 }, { "epoch": 2.5718945365588475, "grad_norm": 0.0658908560872078, "learning_rate": 9.51423682316978e-06, "loss": 0.5021, "step": 1183 }, { "epoch": 2.574069040500136, "grad_norm": 0.07098022103309631, "learning_rate": 9.500564585737984e-06, "loss": 0.4807, "step": 1184 }, { "epoch": 2.5762435444414242, "grad_norm": 0.06681408733129501, "learning_rate": 9.486893284172103e-06, "loss": 0.5273, "step": 1185 }, { "epoch": 2.5784180483827126, "grad_norm": 0.06239790469408035, "learning_rate": 9.473222944090082e-06, "loss": 0.434, "step": 1186 }, { "epoch": 2.580592552324001, "grad_norm": 0.07028213143348694, "learning_rate": 9.459553591108054e-06, "loss": 0.5209, "step": 1187 }, { "epoch": 2.5827670562652894, "grad_norm": 0.07988832145929337, "learning_rate": 9.445885250840301e-06, "loss": 0.5794, "step": 1188 }, { "epoch": 2.5849415602065777, "grad_norm": 0.0628553256392479, "learning_rate": 9.432217948899217e-06, "loss": 0.4845, "step": 1189 }, { "epoch": 2.5871160641478665, "grad_norm": 0.06346394866704941, "learning_rate": 9.418551710895243e-06, "loss": 0.4391, "step": 1190 }, { "epoch": 2.5892905680891545, "grad_norm": 0.06948722898960114, "learning_rate": 9.404886562436826e-06, "loss": 0.5166, "step": 1191 }, { "epoch": 2.5914650720304433, "grad_norm": 0.0666830912232399, "learning_rate": 9.391222529130379e-06, "loss": 0.4775, "step": 1192 }, { "epoch": 2.5936395759717312, "grad_norm": 0.0645584985613823, "learning_rate": 9.377559636580213e-06, "loss": 0.5174, "step": 1193 }, { "epoch": 2.59581407991302, "grad_norm": 0.07166798412799835, "learning_rate": 9.363897910388512e-06, "loss": 0.5654, "step": 1194 }, { "epoch": 2.597988583854308, "grad_norm": 0.06628387421369553, "learning_rate": 9.350237376155269e-06, "loss": 0.4952, "step": 1195 }, { "epoch": 2.600163087795597, "grad_norm": 0.07025283575057983, "learning_rate": 9.33657805947825e-06, "loss": 0.5061, "step": 1196 }, { "epoch": 2.602337591736885, "grad_norm": 0.07984895259141922, "learning_rate": 9.322919985952926e-06, "loss": 0.6174, "step": 1197 }, { "epoch": 2.6045120956781735, "grad_norm": 0.07657834142446518, "learning_rate": 9.309263181172456e-06, "loss": 0.5861, "step": 1198 }, { "epoch": 2.606686599619462, "grad_norm": 0.073959119617939, "learning_rate": 9.295607670727605e-06, "loss": 0.4706, "step": 1199 }, { "epoch": 2.6088611035607503, "grad_norm": 0.07728178799152374, "learning_rate": 9.281953480206725e-06, "loss": 0.4781, "step": 1200 }, { "epoch": 2.6110356075020387, "grad_norm": 0.07513023912906647, "learning_rate": 9.268300635195691e-06, "loss": 0.5963, "step": 1201 }, { "epoch": 2.613210111443327, "grad_norm": 0.06443715840578079, "learning_rate": 9.254649161277853e-06, "loss": 0.5165, "step": 1202 }, { "epoch": 2.6153846153846154, "grad_norm": 0.07779200375080109, "learning_rate": 9.240999084033992e-06, "loss": 0.5717, "step": 1203 }, { "epoch": 2.617559119325904, "grad_norm": 0.09716101735830307, "learning_rate": 9.227350429042278e-06, "loss": 0.4739, "step": 1204 }, { "epoch": 2.619733623267192, "grad_norm": 0.06908995658159256, "learning_rate": 9.213703221878217e-06, "loss": 0.5066, "step": 1205 }, { "epoch": 2.6219081272084805, "grad_norm": 0.06965909898281097, "learning_rate": 9.200057488114585e-06, "loss": 0.5278, "step": 1206 }, { "epoch": 2.624082631149769, "grad_norm": 0.06530669331550598, "learning_rate": 9.18641325332142e-06, "loss": 0.5114, "step": 1207 }, { "epoch": 2.6262571350910573, "grad_norm": 0.06892990320920944, "learning_rate": 9.172770543065932e-06, "loss": 0.4887, "step": 1208 }, { "epoch": 2.6284316390323457, "grad_norm": 0.06376764178276062, "learning_rate": 9.15912938291249e-06, "loss": 0.4603, "step": 1209 }, { "epoch": 2.630606142973634, "grad_norm": 0.07485900074243546, "learning_rate": 9.14548979842255e-06, "loss": 0.6621, "step": 1210 }, { "epoch": 2.6327806469149224, "grad_norm": 0.07980825752019882, "learning_rate": 9.131851815154614e-06, "loss": 0.5252, "step": 1211 }, { "epoch": 2.634955150856211, "grad_norm": 0.07648638635873795, "learning_rate": 9.118215458664185e-06, "loss": 0.605, "step": 1212 }, { "epoch": 2.637129654797499, "grad_norm": 0.07584600150585175, "learning_rate": 9.104580754503721e-06, "loss": 0.5432, "step": 1213 }, { "epoch": 2.6393041587387875, "grad_norm": 0.07356035709381104, "learning_rate": 9.09094772822258e-06, "loss": 0.5562, "step": 1214 }, { "epoch": 2.6414786626800764, "grad_norm": 0.2912083566188812, "learning_rate": 9.07731640536698e-06, "loss": 0.5654, "step": 1215 }, { "epoch": 2.6436531666213643, "grad_norm": 0.07361391186714172, "learning_rate": 9.063686811479945e-06, "loss": 0.5519, "step": 1216 }, { "epoch": 2.645827670562653, "grad_norm": 0.07117127627134323, "learning_rate": 9.050058972101252e-06, "loss": 0.5289, "step": 1217 }, { "epoch": 2.648002174503941, "grad_norm": 0.07450400292873383, "learning_rate": 9.036432912767403e-06, "loss": 0.5601, "step": 1218 }, { "epoch": 2.65017667844523, "grad_norm": 0.07678133994340897, "learning_rate": 9.02280865901156e-06, "loss": 0.5273, "step": 1219 }, { "epoch": 2.6523511823865182, "grad_norm": 0.06322172284126282, "learning_rate": 9.00918623636349e-06, "loss": 0.5058, "step": 1220 }, { "epoch": 2.6545256863278066, "grad_norm": 0.07787950336933136, "learning_rate": 8.99556567034955e-06, "loss": 0.5897, "step": 1221 }, { "epoch": 2.656700190269095, "grad_norm": 0.06649017333984375, "learning_rate": 8.981946986492604e-06, "loss": 0.534, "step": 1222 }, { "epoch": 2.6588746942103834, "grad_norm": 0.06583254039287567, "learning_rate": 8.968330210311985e-06, "loss": 0.4991, "step": 1223 }, { "epoch": 2.6610491981516717, "grad_norm": 0.06641215085983276, "learning_rate": 8.954715367323468e-06, "loss": 0.5184, "step": 1224 }, { "epoch": 2.66322370209296, "grad_norm": 0.0740467831492424, "learning_rate": 8.941102483039188e-06, "loss": 0.6149, "step": 1225 }, { "epoch": 2.6653982060342485, "grad_norm": 0.07603827863931656, "learning_rate": 8.92749158296762e-06, "loss": 0.5968, "step": 1226 }, { "epoch": 2.667572709975537, "grad_norm": 0.061364464461803436, "learning_rate": 8.913882692613522e-06, "loss": 0.4614, "step": 1227 }, { "epoch": 2.6697472139168252, "grad_norm": 0.06896529346704483, "learning_rate": 8.900275837477879e-06, "loss": 0.5992, "step": 1228 }, { "epoch": 2.6719217178581136, "grad_norm": 0.09197258204221725, "learning_rate": 8.886671043057863e-06, "loss": 0.7607, "step": 1229 }, { "epoch": 2.674096221799402, "grad_norm": 0.06679330766201019, "learning_rate": 8.87306833484679e-06, "loss": 0.5869, "step": 1230 }, { "epoch": 2.6762707257406904, "grad_norm": 0.06584884226322174, "learning_rate": 8.859467738334068e-06, "loss": 0.5444, "step": 1231 }, { "epoch": 2.6784452296819787, "grad_norm": 0.08094929158687592, "learning_rate": 8.845869279005133e-06, "loss": 0.7009, "step": 1232 }, { "epoch": 2.680619733623267, "grad_norm": 0.06764519214630127, "learning_rate": 8.83227298234144e-06, "loss": 0.5303, "step": 1233 }, { "epoch": 2.6827942375645555, "grad_norm": 0.0661405622959137, "learning_rate": 8.818678873820368e-06, "loss": 0.534, "step": 1234 }, { "epoch": 2.684968741505844, "grad_norm": 0.06515363603830338, "learning_rate": 8.805086978915215e-06, "loss": 0.4875, "step": 1235 }, { "epoch": 2.6871432454471322, "grad_norm": 0.06673622131347656, "learning_rate": 8.791497323095116e-06, "loss": 0.4684, "step": 1236 }, { "epoch": 2.6893177493884206, "grad_norm": 0.06529201567173004, "learning_rate": 8.777909931825019e-06, "loss": 0.4879, "step": 1237 }, { "epoch": 2.6914922533297094, "grad_norm": 0.07003599405288696, "learning_rate": 8.76432483056563e-06, "loss": 0.6087, "step": 1238 }, { "epoch": 2.6936667572709974, "grad_norm": 0.0646577849984169, "learning_rate": 8.750742044773354e-06, "loss": 0.4659, "step": 1239 }, { "epoch": 2.695841261212286, "grad_norm": 0.070692278444767, "learning_rate": 8.737161599900267e-06, "loss": 0.5848, "step": 1240 }, { "epoch": 2.698015765153574, "grad_norm": 0.07127923518419266, "learning_rate": 8.723583521394054e-06, "loss": 0.6001, "step": 1241 }, { "epoch": 2.700190269094863, "grad_norm": 0.07424425333738327, "learning_rate": 8.71000783469797e-06, "loss": 0.5347, "step": 1242 }, { "epoch": 2.702364773036151, "grad_norm": 0.0665578842163086, "learning_rate": 8.696434565250776e-06, "loss": 0.4981, "step": 1243 }, { "epoch": 2.7045392769774397, "grad_norm": 0.05703388899564743, "learning_rate": 8.682863738486722e-06, "loss": 0.4264, "step": 1244 }, { "epoch": 2.706713780918728, "grad_norm": 0.06786542385816574, "learning_rate": 8.669295379835467e-06, "loss": 0.4847, "step": 1245 }, { "epoch": 2.7088882848600164, "grad_norm": 0.07562483102083206, "learning_rate": 8.655729514722044e-06, "loss": 0.5688, "step": 1246 }, { "epoch": 2.711062788801305, "grad_norm": 0.06452401727437973, "learning_rate": 8.642166168566828e-06, "loss": 0.4938, "step": 1247 }, { "epoch": 2.713237292742593, "grad_norm": 0.06834254413843155, "learning_rate": 8.628605366785459e-06, "loss": 0.5977, "step": 1248 }, { "epoch": 2.7154117966838816, "grad_norm": 0.06163864582777023, "learning_rate": 8.615047134788812e-06, "loss": 0.4607, "step": 1249 }, { "epoch": 2.71758630062517, "grad_norm": 0.08505834639072418, "learning_rate": 8.601491497982956e-06, "loss": 0.6696, "step": 1250 }, { "epoch": 2.7197608045664583, "grad_norm": 0.07000508159399033, "learning_rate": 8.58793848176909e-06, "loss": 0.5007, "step": 1251 }, { "epoch": 2.7219353085077467, "grad_norm": 0.06656965613365173, "learning_rate": 8.574388111543493e-06, "loss": 0.4986, "step": 1252 }, { "epoch": 2.724109812449035, "grad_norm": 0.07175467908382416, "learning_rate": 8.560840412697507e-06, "loss": 0.5402, "step": 1253 }, { "epoch": 2.7262843163903234, "grad_norm": 0.06514792889356613, "learning_rate": 8.547295410617453e-06, "loss": 0.5569, "step": 1254 }, { "epoch": 2.728458820331612, "grad_norm": 0.07471940666437149, "learning_rate": 8.533753130684596e-06, "loss": 0.5551, "step": 1255 }, { "epoch": 2.7306333242729, "grad_norm": 0.06646154075860977, "learning_rate": 8.52021359827512e-06, "loss": 0.4631, "step": 1256 }, { "epoch": 2.7328078282141886, "grad_norm": 0.07374048978090286, "learning_rate": 8.506676838760033e-06, "loss": 0.5824, "step": 1257 }, { "epoch": 2.734982332155477, "grad_norm": 0.0707075446844101, "learning_rate": 8.49314287750517e-06, "loss": 0.5653, "step": 1258 }, { "epoch": 2.7371568360967653, "grad_norm": 0.06668878346681595, "learning_rate": 8.479611739871114e-06, "loss": 0.5073, "step": 1259 }, { "epoch": 2.7393313400380537, "grad_norm": 0.07342474162578583, "learning_rate": 8.466083451213145e-06, "loss": 0.5356, "step": 1260 }, { "epoch": 2.741505843979342, "grad_norm": 0.08089195191860199, "learning_rate": 8.452558036881234e-06, "loss": 0.5539, "step": 1261 }, { "epoch": 2.7436803479206304, "grad_norm": 0.0667836144566536, "learning_rate": 8.439035522219935e-06, "loss": 0.5701, "step": 1262 }, { "epoch": 2.7458548518619192, "grad_norm": 0.06957834959030151, "learning_rate": 8.425515932568383e-06, "loss": 0.5612, "step": 1263 }, { "epoch": 2.748029355803207, "grad_norm": 0.06730025261640549, "learning_rate": 8.411999293260234e-06, "loss": 0.5687, "step": 1264 }, { "epoch": 2.750203859744496, "grad_norm": 0.06978683173656464, "learning_rate": 8.398485629623613e-06, "loss": 0.5887, "step": 1265 }, { "epoch": 2.752378363685784, "grad_norm": 0.06374546885490417, "learning_rate": 8.384974966981063e-06, "loss": 0.4457, "step": 1266 }, { "epoch": 2.7545528676270727, "grad_norm": 0.07195010036230087, "learning_rate": 8.371467330649512e-06, "loss": 0.5339, "step": 1267 }, { "epoch": 2.756727371568361, "grad_norm": 0.06214047223329544, "learning_rate": 8.357962745940215e-06, "loss": 0.4956, "step": 1268 }, { "epoch": 2.7589018755096495, "grad_norm": 0.07307221740484238, "learning_rate": 8.3444612381587e-06, "loss": 0.5156, "step": 1269 }, { "epoch": 2.761076379450938, "grad_norm": 0.06946876645088196, "learning_rate": 8.330962832604747e-06, "loss": 0.556, "step": 1270 }, { "epoch": 2.7632508833922262, "grad_norm": 0.06867927312850952, "learning_rate": 8.317467554572306e-06, "loss": 0.6105, "step": 1271 }, { "epoch": 2.7654253873335146, "grad_norm": 0.07031640410423279, "learning_rate": 8.303975429349473e-06, "loss": 0.523, "step": 1272 }, { "epoch": 2.767599891274803, "grad_norm": 0.06510616838932037, "learning_rate": 8.29048648221844e-06, "loss": 0.5486, "step": 1273 }, { "epoch": 2.7697743952160914, "grad_norm": 0.0716143324971199, "learning_rate": 8.27700073845544e-06, "loss": 0.468, "step": 1274 }, { "epoch": 2.7719488991573797, "grad_norm": 0.07420039921998978, "learning_rate": 8.263518223330698e-06, "loss": 0.552, "step": 1275 }, { "epoch": 2.774123403098668, "grad_norm": 0.07435648143291473, "learning_rate": 8.2500389621084e-06, "loss": 0.6724, "step": 1276 }, { "epoch": 2.7762979070399565, "grad_norm": 0.08702632039785385, "learning_rate": 8.236562980046627e-06, "loss": 0.4825, "step": 1277 }, { "epoch": 2.778472410981245, "grad_norm": 0.0787324458360672, "learning_rate": 8.223090302397313e-06, "loss": 0.5513, "step": 1278 }, { "epoch": 2.7806469149225332, "grad_norm": 0.06984580308198929, "learning_rate": 8.209620954406216e-06, "loss": 0.5721, "step": 1279 }, { "epoch": 2.7828214188638216, "grad_norm": 0.08016245067119598, "learning_rate": 8.19615496131283e-06, "loss": 0.6304, "step": 1280 }, { "epoch": 2.78499592280511, "grad_norm": 0.0605941116809845, "learning_rate": 8.182692348350386e-06, "loss": 0.4696, "step": 1281 }, { "epoch": 2.7871704267463984, "grad_norm": 0.0750289261341095, "learning_rate": 8.169233140745768e-06, "loss": 0.6998, "step": 1282 }, { "epoch": 2.7893449306876867, "grad_norm": 0.059781454503536224, "learning_rate": 8.155777363719472e-06, "loss": 0.4756, "step": 1283 }, { "epoch": 2.791519434628975, "grad_norm": 0.0617891401052475, "learning_rate": 8.142325042485592e-06, "loss": 0.5018, "step": 1284 }, { "epoch": 2.7936939385702635, "grad_norm": 0.08844419568777084, "learning_rate": 8.128876202251719e-06, "loss": 0.5425, "step": 1285 }, { "epoch": 2.7958684425115523, "grad_norm": 0.06936722248792648, "learning_rate": 8.115430868218931e-06, "loss": 0.601, "step": 1286 }, { "epoch": 2.7980429464528402, "grad_norm": 0.06772015988826752, "learning_rate": 8.101989065581742e-06, "loss": 0.5046, "step": 1287 }, { "epoch": 2.800217450394129, "grad_norm": 0.06812779605388641, "learning_rate": 8.088550819528043e-06, "loss": 0.5386, "step": 1288 }, { "epoch": 2.802391954335417, "grad_norm": 0.06669382750988007, "learning_rate": 8.075116155239056e-06, "loss": 0.439, "step": 1289 }, { "epoch": 2.804566458276706, "grad_norm": 0.06634386628866196, "learning_rate": 8.0616850978893e-06, "loss": 0.5296, "step": 1290 }, { "epoch": 2.806740962217994, "grad_norm": 0.06960185617208481, "learning_rate": 8.048257672646534e-06, "loss": 0.6223, "step": 1291 }, { "epoch": 2.8089154661592826, "grad_norm": 0.07757288962602615, "learning_rate": 8.034833904671698e-06, "loss": 0.5247, "step": 1292 }, { "epoch": 2.811089970100571, "grad_norm": 0.07060252875089645, "learning_rate": 8.021413819118903e-06, "loss": 0.5828, "step": 1293 }, { "epoch": 2.8132644740418593, "grad_norm": 0.06385531276464462, "learning_rate": 8.007997441135335e-06, "loss": 0.514, "step": 1294 }, { "epoch": 2.8154389779831477, "grad_norm": 0.06099158152937889, "learning_rate": 7.994584795861248e-06, "loss": 0.376, "step": 1295 }, { "epoch": 2.817613481924436, "grad_norm": 0.07344204187393188, "learning_rate": 7.9811759084299e-06, "loss": 0.561, "step": 1296 }, { "epoch": 2.8197879858657244, "grad_norm": 0.06653238832950592, "learning_rate": 7.967770803967498e-06, "loss": 0.507, "step": 1297 }, { "epoch": 2.821962489807013, "grad_norm": 0.0675218403339386, "learning_rate": 7.954369507593169e-06, "loss": 0.5395, "step": 1298 }, { "epoch": 2.824136993748301, "grad_norm": 0.06554129719734192, "learning_rate": 7.940972044418902e-06, "loss": 0.5475, "step": 1299 }, { "epoch": 2.8263114976895896, "grad_norm": 0.07828188687562943, "learning_rate": 7.927578439549506e-06, "loss": 0.5617, "step": 1300 }, { "epoch": 2.828486001630878, "grad_norm": 0.06951795518398285, "learning_rate": 7.914188718082549e-06, "loss": 0.5009, "step": 1301 }, { "epoch": 2.8306605055721663, "grad_norm": 0.07718538492918015, "learning_rate": 7.900802905108343e-06, "loss": 0.5268, "step": 1302 }, { "epoch": 2.8328350095134547, "grad_norm": 0.06495605409145355, "learning_rate": 7.887421025709851e-06, "loss": 0.4985, "step": 1303 }, { "epoch": 2.835009513454743, "grad_norm": 0.06442841142416, "learning_rate": 7.874043104962683e-06, "loss": 0.5232, "step": 1304 }, { "epoch": 2.8371840173960314, "grad_norm": 0.06869803369045258, "learning_rate": 7.860669167935028e-06, "loss": 0.5875, "step": 1305 }, { "epoch": 2.83935852133732, "grad_norm": 0.07667848467826843, "learning_rate": 7.847299239687597e-06, "loss": 0.5912, "step": 1306 }, { "epoch": 2.841533025278608, "grad_norm": 0.06196955218911171, "learning_rate": 7.833933345273616e-06, "loss": 0.4724, "step": 1307 }, { "epoch": 2.8437075292198966, "grad_norm": 0.06765759736299515, "learning_rate": 7.820571509738724e-06, "loss": 0.5933, "step": 1308 }, { "epoch": 2.8458820331611854, "grad_norm": 0.0725003257393837, "learning_rate": 7.807213758120965e-06, "loss": 0.4934, "step": 1309 }, { "epoch": 2.8480565371024733, "grad_norm": 0.07609444111585617, "learning_rate": 7.793860115450744e-06, "loss": 0.7234, "step": 1310 }, { "epoch": 2.850231041043762, "grad_norm": 0.07180480659008026, "learning_rate": 7.780510606750742e-06, "loss": 0.5824, "step": 1311 }, { "epoch": 2.85240554498505, "grad_norm": 0.06511262059211731, "learning_rate": 7.767165257035907e-06, "loss": 0.5233, "step": 1312 }, { "epoch": 2.854580048926339, "grad_norm": 0.06903766095638275, "learning_rate": 7.753824091313398e-06, "loss": 0.5619, "step": 1313 }, { "epoch": 2.856754552867627, "grad_norm": 0.06484322994947433, "learning_rate": 7.740487134582527e-06, "loss": 0.4709, "step": 1314 }, { "epoch": 2.8589290568089156, "grad_norm": 0.07196953892707825, "learning_rate": 7.727154411834712e-06, "loss": 0.5998, "step": 1315 }, { "epoch": 2.861103560750204, "grad_norm": 0.07472864538431168, "learning_rate": 7.713825948053457e-06, "loss": 0.5151, "step": 1316 }, { "epoch": 2.8632780646914924, "grad_norm": 0.06778397411108017, "learning_rate": 7.700501768214266e-06, "loss": 0.5724, "step": 1317 }, { "epoch": 2.8654525686327807, "grad_norm": 0.06853329390287399, "learning_rate": 7.687181897284625e-06, "loss": 0.522, "step": 1318 }, { "epoch": 2.867627072574069, "grad_norm": 0.10920082032680511, "learning_rate": 7.673866360223947e-06, "loss": 0.6462, "step": 1319 }, { "epoch": 2.8698015765153575, "grad_norm": 0.06963001936674118, "learning_rate": 7.660555181983517e-06, "loss": 0.5073, "step": 1320 }, { "epoch": 2.871976080456646, "grad_norm": 0.07501815259456635, "learning_rate": 7.647248387506457e-06, "loss": 0.4686, "step": 1321 }, { "epoch": 2.8741505843979342, "grad_norm": 0.07118894904851913, "learning_rate": 7.633946001727673e-06, "loss": 0.5234, "step": 1322 }, { "epoch": 2.8763250883392226, "grad_norm": 0.06615978479385376, "learning_rate": 7.620648049573815e-06, "loss": 0.4894, "step": 1323 }, { "epoch": 2.878499592280511, "grad_norm": 0.06646072864532471, "learning_rate": 7.607354555963211e-06, "loss": 0.5409, "step": 1324 }, { "epoch": 2.8806740962217994, "grad_norm": 0.06410951912403107, "learning_rate": 7.5940655458058575e-06, "loss": 0.5105, "step": 1325 }, { "epoch": 2.8828486001630877, "grad_norm": 0.07363732159137726, "learning_rate": 7.580781044003324e-06, "loss": 0.5786, "step": 1326 }, { "epoch": 2.885023104104376, "grad_norm": 0.06527000665664673, "learning_rate": 7.5675010754487485e-06, "loss": 0.505, "step": 1327 }, { "epoch": 2.8871976080456645, "grad_norm": 0.07309246808290482, "learning_rate": 7.5542256650267746e-06, "loss": 0.5937, "step": 1328 }, { "epoch": 2.889372111986953, "grad_norm": 0.08027065545320511, "learning_rate": 7.540954837613488e-06, "loss": 0.5917, "step": 1329 }, { "epoch": 2.8915466159282412, "grad_norm": 0.08110599219799042, "learning_rate": 7.527688618076413e-06, "loss": 0.5205, "step": 1330 }, { "epoch": 2.8937211198695296, "grad_norm": 0.06107301265001297, "learning_rate": 7.514427031274416e-06, "loss": 0.5032, "step": 1331 }, { "epoch": 2.895895623810818, "grad_norm": 0.06903573125600815, "learning_rate": 7.501170102057691e-06, "loss": 0.5268, "step": 1332 }, { "epoch": 2.8980701277521064, "grad_norm": 0.07560260593891144, "learning_rate": 7.487917855267712e-06, "loss": 0.5703, "step": 1333 }, { "epoch": 2.900244631693395, "grad_norm": 0.0747012048959732, "learning_rate": 7.474670315737165e-06, "loss": 0.531, "step": 1334 }, { "epoch": 2.902419135634683, "grad_norm": 0.07406128942966461, "learning_rate": 7.461427508289922e-06, "loss": 0.5802, "step": 1335 }, { "epoch": 2.904593639575972, "grad_norm": 0.06978411972522736, "learning_rate": 7.448189457740991e-06, "loss": 0.4532, "step": 1336 }, { "epoch": 2.90676814351726, "grad_norm": 0.061741672456264496, "learning_rate": 7.434956188896464e-06, "loss": 0.511, "step": 1337 }, { "epoch": 2.9089426474585487, "grad_norm": 0.07843194156885147, "learning_rate": 7.421727726553463e-06, "loss": 0.6214, "step": 1338 }, { "epoch": 2.911117151399837, "grad_norm": 0.08303431421518326, "learning_rate": 7.408504095500124e-06, "loss": 0.8387, "step": 1339 }, { "epoch": 2.9132916553411254, "grad_norm": 0.06255075335502625, "learning_rate": 7.395285320515513e-06, "loss": 0.512, "step": 1340 }, { "epoch": 2.915466159282414, "grad_norm": 0.07399558275938034, "learning_rate": 7.382071426369597e-06, "loss": 0.6102, "step": 1341 }, { "epoch": 2.917640663223702, "grad_norm": 0.08228182792663574, "learning_rate": 7.368862437823211e-06, "loss": 0.6831, "step": 1342 }, { "epoch": 2.9198151671649906, "grad_norm": 0.0692390650510788, "learning_rate": 7.355658379627981e-06, "loss": 0.5133, "step": 1343 }, { "epoch": 2.921989671106279, "grad_norm": 0.06947391480207443, "learning_rate": 7.342459276526301e-06, "loss": 0.499, "step": 1344 }, { "epoch": 2.9241641750475673, "grad_norm": 0.06507532298564911, "learning_rate": 7.329265153251285e-06, "loss": 0.5273, "step": 1345 }, { "epoch": 2.9263386789888557, "grad_norm": 0.06730987876653671, "learning_rate": 7.316076034526712e-06, "loss": 0.4775, "step": 1346 }, { "epoch": 2.928513182930144, "grad_norm": 0.0776253491640091, "learning_rate": 7.302891945066975e-06, "loss": 0.4829, "step": 1347 }, { "epoch": 2.9306876868714324, "grad_norm": 0.07368741184473038, "learning_rate": 7.289712909577056e-06, "loss": 0.5377, "step": 1348 }, { "epoch": 2.932862190812721, "grad_norm": 0.06706392019987106, "learning_rate": 7.276538952752459e-06, "loss": 0.4498, "step": 1349 }, { "epoch": 2.935036694754009, "grad_norm": 0.06259384751319885, "learning_rate": 7.263370099279173e-06, "loss": 0.5458, "step": 1350 }, { "epoch": 2.9372111986952976, "grad_norm": 0.07185040414333344, "learning_rate": 7.250206373833626e-06, "loss": 0.596, "step": 1351 }, { "epoch": 2.939385702636586, "grad_norm": 0.10021599382162094, "learning_rate": 7.2370478010826285e-06, "loss": 0.5385, "step": 1352 }, { "epoch": 2.9415602065778743, "grad_norm": 0.0687226951122284, "learning_rate": 7.223894405683355e-06, "loss": 0.5835, "step": 1353 }, { "epoch": 2.9437347105191627, "grad_norm": 0.06232772767543793, "learning_rate": 7.210746212283255e-06, "loss": 0.487, "step": 1354 }, { "epoch": 2.945909214460451, "grad_norm": 0.07023363560438156, "learning_rate": 7.197603245520042e-06, "loss": 0.6531, "step": 1355 }, { "epoch": 2.9480837184017394, "grad_norm": 0.06999558955430984, "learning_rate": 7.1844655300216405e-06, "loss": 0.5648, "step": 1356 }, { "epoch": 2.9502582223430283, "grad_norm": 0.06736975163221359, "learning_rate": 7.171333090406123e-06, "loss": 0.4642, "step": 1357 }, { "epoch": 2.952432726284316, "grad_norm": 0.06202093884348869, "learning_rate": 7.158205951281681e-06, "loss": 0.5079, "step": 1358 }, { "epoch": 2.954607230225605, "grad_norm": 0.06691930443048477, "learning_rate": 7.1450841372465806e-06, "loss": 0.5108, "step": 1359 }, { "epoch": 2.956781734166893, "grad_norm": 0.07289564609527588, "learning_rate": 7.131967672889101e-06, "loss": 0.6934, "step": 1360 }, { "epoch": 2.9589562381081818, "grad_norm": 0.06416785717010498, "learning_rate": 7.11885658278749e-06, "loss": 0.5921, "step": 1361 }, { "epoch": 2.9611307420494697, "grad_norm": 0.06721733510494232, "learning_rate": 7.105750891509951e-06, "loss": 0.5276, "step": 1362 }, { "epoch": 2.9633052459907585, "grad_norm": 0.06849218904972076, "learning_rate": 7.092650623614541e-06, "loss": 0.6581, "step": 1363 }, { "epoch": 2.965479749932047, "grad_norm": 0.07301518321037292, "learning_rate": 7.079555803649169e-06, "loss": 0.5055, "step": 1364 }, { "epoch": 2.9676542538733353, "grad_norm": 0.06906985491514206, "learning_rate": 7.066466456151541e-06, "loss": 0.543, "step": 1365 }, { "epoch": 2.9698287578146236, "grad_norm": 0.07445051521062851, "learning_rate": 7.053382605649094e-06, "loss": 0.6043, "step": 1366 }, { "epoch": 2.972003261755912, "grad_norm": 0.06771831959486008, "learning_rate": 7.040304276658971e-06, "loss": 0.4743, "step": 1367 }, { "epoch": 2.9741777656972004, "grad_norm": 0.0739327147603035, "learning_rate": 7.027231493687974e-06, "loss": 0.5989, "step": 1368 }, { "epoch": 2.9763522696384888, "grad_norm": 0.07092243432998657, "learning_rate": 7.014164281232508e-06, "loss": 0.5477, "step": 1369 }, { "epoch": 2.978526773579777, "grad_norm": 0.0664169043302536, "learning_rate": 7.001102663778533e-06, "loss": 0.5318, "step": 1370 }, { "epoch": 2.9807012775210655, "grad_norm": 0.06561508029699326, "learning_rate": 6.988046665801536e-06, "loss": 0.5195, "step": 1371 }, { "epoch": 2.982875781462354, "grad_norm": 0.05872855708003044, "learning_rate": 6.97499631176647e-06, "loss": 0.439, "step": 1372 }, { "epoch": 2.9850502854036423, "grad_norm": 0.07005491852760315, "learning_rate": 6.961951626127707e-06, "loss": 0.5556, "step": 1373 }, { "epoch": 2.9872247893449306, "grad_norm": 0.060724273324012756, "learning_rate": 6.948912633329008e-06, "loss": 0.4953, "step": 1374 }, { "epoch": 2.989399293286219, "grad_norm": 0.07059051096439362, "learning_rate": 6.935879357803453e-06, "loss": 0.5721, "step": 1375 }, { "epoch": 2.9915737972275074, "grad_norm": 0.0659119039773941, "learning_rate": 6.922851823973422e-06, "loss": 0.5169, "step": 1376 }, { "epoch": 2.9937483011687958, "grad_norm": 0.07893026620149612, "learning_rate": 6.909830056250527e-06, "loss": 0.4338, "step": 1377 }, { "epoch": 2.995922805110084, "grad_norm": 0.07408333569765091, "learning_rate": 6.896814079035575e-06, "loss": 0.6015, "step": 1378 }, { "epoch": 2.9980973090513725, "grad_norm": 0.07099418342113495, "learning_rate": 6.883803916718534e-06, "loss": 0.5266, "step": 1379 }, { "epoch": 3.0, "grad_norm": 0.072501540184021, "learning_rate": 6.870799593678459e-06, "loss": 0.5303, "step": 1380 }, { "epoch": 3.0, "eval_loss": 0.5516069531440735, "eval_runtime": 13.5972, "eval_samples_per_second": 5.442, "eval_steps_per_second": 5.442, "step": 1380 }, { "epoch": 3.0021745039412884, "grad_norm": 0.0665617436170578, "learning_rate": 6.8578011342834705e-06, "loss": 0.5155, "step": 1381 }, { "epoch": 3.0043490078825767, "grad_norm": 0.06587549299001694, "learning_rate": 6.844808562890709e-06, "loss": 0.4649, "step": 1382 }, { "epoch": 3.006523511823865, "grad_norm": 0.06332812458276749, "learning_rate": 6.831821903846274e-06, "loss": 0.5312, "step": 1383 }, { "epoch": 3.0086980157651535, "grad_norm": 0.0684223398566246, "learning_rate": 6.818841181485179e-06, "loss": 0.6175, "step": 1384 }, { "epoch": 3.010872519706442, "grad_norm": 0.08035726845264435, "learning_rate": 6.80586642013133e-06, "loss": 0.5844, "step": 1385 }, { "epoch": 3.0130470236477302, "grad_norm": 0.07305752485990524, "learning_rate": 6.7928976440974504e-06, "loss": 0.6432, "step": 1386 }, { "epoch": 3.0152215275890186, "grad_norm": 0.06890946626663208, "learning_rate": 6.77993487768505e-06, "loss": 0.5206, "step": 1387 }, { "epoch": 3.017396031530307, "grad_norm": 0.07270307838916779, "learning_rate": 6.766978145184386e-06, "loss": 0.5629, "step": 1388 }, { "epoch": 3.0195705354715954, "grad_norm": 0.06503846496343613, "learning_rate": 6.7540274708743955e-06, "loss": 0.5037, "step": 1389 }, { "epoch": 3.0217450394128837, "grad_norm": 0.07120536267757416, "learning_rate": 6.741082879022671e-06, "loss": 0.5367, "step": 1390 }, { "epoch": 3.023919543354172, "grad_norm": 0.06598308682441711, "learning_rate": 6.728144393885411e-06, "loss": 0.4871, "step": 1391 }, { "epoch": 3.026094047295461, "grad_norm": 0.06610432267189026, "learning_rate": 6.715212039707364e-06, "loss": 0.4791, "step": 1392 }, { "epoch": 3.0282685512367493, "grad_norm": 0.07477118074893951, "learning_rate": 6.7022858407217895e-06, "loss": 0.5088, "step": 1393 }, { "epoch": 3.0304430551780377, "grad_norm": 0.07251028716564178, "learning_rate": 6.689365821150421e-06, "loss": 0.4589, "step": 1394 }, { "epoch": 3.032617559119326, "grad_norm": 0.07237108051776886, "learning_rate": 6.6764520052034054e-06, "loss": 0.5186, "step": 1395 }, { "epoch": 3.0347920630606144, "grad_norm": 0.07138099521398544, "learning_rate": 6.6635444170792665e-06, "loss": 0.5382, "step": 1396 }, { "epoch": 3.036966567001903, "grad_norm": 0.07056163996458054, "learning_rate": 6.650643080964863e-06, "loss": 0.5116, "step": 1397 }, { "epoch": 3.039141070943191, "grad_norm": 0.07845355570316315, "learning_rate": 6.637748021035331e-06, "loss": 0.6569, "step": 1398 }, { "epoch": 3.0413155748844796, "grad_norm": 0.07624215632677078, "learning_rate": 6.624859261454047e-06, "loss": 0.6075, "step": 1399 }, { "epoch": 3.043490078825768, "grad_norm": 0.06629661470651627, "learning_rate": 6.61197682637259e-06, "loss": 0.5827, "step": 1400 }, { "epoch": 3.0456645827670563, "grad_norm": 0.07915958017110825, "learning_rate": 6.599100739930677e-06, "loss": 0.5737, "step": 1401 }, { "epoch": 3.0478390867083447, "grad_norm": 0.06905808299779892, "learning_rate": 6.586231026256139e-06, "loss": 0.552, "step": 1402 }, { "epoch": 3.050013590649633, "grad_norm": 0.06818052381277084, "learning_rate": 6.573367709464854e-06, "loss": 0.4281, "step": 1403 }, { "epoch": 3.0521880945909214, "grad_norm": 0.06372939050197601, "learning_rate": 6.560510813660719e-06, "loss": 0.4565, "step": 1404 }, { "epoch": 3.05436259853221, "grad_norm": 0.06479248404502869, "learning_rate": 6.547660362935603e-06, "loss": 0.4542, "step": 1405 }, { "epoch": 3.056537102473498, "grad_norm": 0.0800362154841423, "learning_rate": 6.534816381369293e-06, "loss": 0.5894, "step": 1406 }, { "epoch": 3.0587116064147866, "grad_norm": 0.07278084009885788, "learning_rate": 6.521978893029452e-06, "loss": 0.6257, "step": 1407 }, { "epoch": 3.060886110356075, "grad_norm": 0.07560984790325165, "learning_rate": 6.509147921971581e-06, "loss": 0.5687, "step": 1408 }, { "epoch": 3.0630606142973633, "grad_norm": 0.06757213920354843, "learning_rate": 6.496323492238966e-06, "loss": 0.5189, "step": 1409 }, { "epoch": 3.0652351182386517, "grad_norm": 0.07011523097753525, "learning_rate": 6.483505627862632e-06, "loss": 0.5406, "step": 1410 }, { "epoch": 3.06740962217994, "grad_norm": 0.0660422071814537, "learning_rate": 6.4706943528613135e-06, "loss": 0.4708, "step": 1411 }, { "epoch": 3.0695841261212284, "grad_norm": 0.0678916648030281, "learning_rate": 6.45788969124138e-06, "loss": 0.4988, "step": 1412 }, { "epoch": 3.071758630062517, "grad_norm": 0.06157972291111946, "learning_rate": 6.44509166699682e-06, "loss": 0.4805, "step": 1413 }, { "epoch": 3.073933134003805, "grad_norm": 0.07780177146196365, "learning_rate": 6.432300304109185e-06, "loss": 0.5044, "step": 1414 }, { "epoch": 3.0761076379450936, "grad_norm": 0.07072564214468002, "learning_rate": 6.419515626547543e-06, "loss": 0.5281, "step": 1415 }, { "epoch": 3.0782821418863824, "grad_norm": 0.07399918138980865, "learning_rate": 6.406737658268425e-06, "loss": 0.6119, "step": 1416 }, { "epoch": 3.0804566458276708, "grad_norm": 0.07210330665111542, "learning_rate": 6.393966423215805e-06, "loss": 0.5283, "step": 1417 }, { "epoch": 3.082631149768959, "grad_norm": 0.0608433336019516, "learning_rate": 6.381201945321032e-06, "loss": 0.4899, "step": 1418 }, { "epoch": 3.0848056537102475, "grad_norm": 0.08315640687942505, "learning_rate": 6.368444248502789e-06, "loss": 0.5193, "step": 1419 }, { "epoch": 3.086980157651536, "grad_norm": 0.06949049234390259, "learning_rate": 6.3556933566670656e-06, "loss": 0.5481, "step": 1420 }, { "epoch": 3.0891546615928243, "grad_norm": 0.06931359320878983, "learning_rate": 6.342949293707084e-06, "loss": 0.4613, "step": 1421 }, { "epoch": 3.0913291655341126, "grad_norm": 0.07117046415805817, "learning_rate": 6.33021208350328e-06, "loss": 0.5552, "step": 1422 }, { "epoch": 3.093503669475401, "grad_norm": 0.07045596092939377, "learning_rate": 6.317481749923249e-06, "loss": 0.5894, "step": 1423 }, { "epoch": 3.0956781734166894, "grad_norm": 0.06807363778352737, "learning_rate": 6.304758316821692e-06, "loss": 0.5042, "step": 1424 }, { "epoch": 3.0978526773579778, "grad_norm": 0.06837022304534912, "learning_rate": 6.292041808040393e-06, "loss": 0.6236, "step": 1425 }, { "epoch": 3.100027181299266, "grad_norm": 0.07531827688217163, "learning_rate": 6.279332247408148e-06, "loss": 0.5989, "step": 1426 }, { "epoch": 3.1022016852405545, "grad_norm": 0.07794953882694244, "learning_rate": 6.266629658740736e-06, "loss": 0.7, "step": 1427 }, { "epoch": 3.104376189181843, "grad_norm": 0.07407008111476898, "learning_rate": 6.25393406584088e-06, "loss": 0.6086, "step": 1428 }, { "epoch": 3.1065506931231313, "grad_norm": 0.07670054584741592, "learning_rate": 6.2412454924981866e-06, "loss": 0.7971, "step": 1429 }, { "epoch": 3.1087251970644196, "grad_norm": 0.06695901602506638, "learning_rate": 6.228563962489106e-06, "loss": 0.4448, "step": 1430 }, { "epoch": 3.110899701005708, "grad_norm": 0.09261689335107803, "learning_rate": 6.215889499576898e-06, "loss": 0.7318, "step": 1431 }, { "epoch": 3.1130742049469964, "grad_norm": 0.06849246472120285, "learning_rate": 6.203222127511577e-06, "loss": 0.5618, "step": 1432 }, { "epoch": 3.1152487088882848, "grad_norm": 0.06765958666801453, "learning_rate": 6.190561870029865e-06, "loss": 0.4821, "step": 1433 }, { "epoch": 3.117423212829573, "grad_norm": 0.06691266596317291, "learning_rate": 6.177908750855164e-06, "loss": 0.5369, "step": 1434 }, { "epoch": 3.1195977167708615, "grad_norm": 0.06501248478889465, "learning_rate": 6.165262793697486e-06, "loss": 0.4885, "step": 1435 }, { "epoch": 3.12177222071215, "grad_norm": 0.06912730634212494, "learning_rate": 6.152624022253429e-06, "loss": 0.5459, "step": 1436 }, { "epoch": 3.1239467246534383, "grad_norm": 0.08368445932865143, "learning_rate": 6.139992460206132e-06, "loss": 0.7838, "step": 1437 }, { "epoch": 3.1261212285947266, "grad_norm": 0.07916920632123947, "learning_rate": 6.127368131225217e-06, "loss": 0.5644, "step": 1438 }, { "epoch": 3.1282957325360154, "grad_norm": 0.07402565330266953, "learning_rate": 6.1147510589667505e-06, "loss": 0.4947, "step": 1439 }, { "epoch": 3.1304702364773034, "grad_norm": 0.08299224078655243, "learning_rate": 6.102141267073207e-06, "loss": 0.5363, "step": 1440 }, { "epoch": 3.132644740418592, "grad_norm": 0.06814849376678467, "learning_rate": 6.089538779173418e-06, "loss": 0.4615, "step": 1441 }, { "epoch": 3.1348192443598806, "grad_norm": 0.06374480575323105, "learning_rate": 6.076943618882525e-06, "loss": 0.4545, "step": 1442 }, { "epoch": 3.136993748301169, "grad_norm": 0.062771737575531, "learning_rate": 6.064355809801943e-06, "loss": 0.5509, "step": 1443 }, { "epoch": 3.1391682522424573, "grad_norm": 0.07458538562059402, "learning_rate": 6.051775375519306e-06, "loss": 0.6024, "step": 1444 }, { "epoch": 3.1413427561837457, "grad_norm": 0.05776078626513481, "learning_rate": 6.039202339608432e-06, "loss": 0.4865, "step": 1445 }, { "epoch": 3.143517260125034, "grad_norm": 0.07012618333101273, "learning_rate": 6.02663672562928e-06, "loss": 0.5067, "step": 1446 }, { "epoch": 3.1456917640663224, "grad_norm": 0.07047531008720398, "learning_rate": 6.014078557127893e-06, "loss": 0.5218, "step": 1447 }, { "epoch": 3.147866268007611, "grad_norm": 0.06444890797138214, "learning_rate": 6.001527857636373e-06, "loss": 0.5192, "step": 1448 }, { "epoch": 3.150040771948899, "grad_norm": 0.06882480531930923, "learning_rate": 5.988984650672813e-06, "loss": 0.5174, "step": 1449 }, { "epoch": 3.1522152758901876, "grad_norm": 0.06989241391420364, "learning_rate": 5.9764489597412744e-06, "loss": 0.5181, "step": 1450 }, { "epoch": 3.154389779831476, "grad_norm": 0.0698571726679802, "learning_rate": 5.963920808331736e-06, "loss": 0.5353, "step": 1451 }, { "epoch": 3.1565642837727643, "grad_norm": 0.06750179082155228, "learning_rate": 5.951400219920046e-06, "loss": 0.5477, "step": 1452 }, { "epoch": 3.1587387877140527, "grad_norm": 0.07366013526916504, "learning_rate": 5.938887217967875e-06, "loss": 0.6285, "step": 1453 }, { "epoch": 3.160913291655341, "grad_norm": 0.0660935789346695, "learning_rate": 5.926381825922689e-06, "loss": 0.5076, "step": 1454 }, { "epoch": 3.1630877955966294, "grad_norm": 0.07364829629659653, "learning_rate": 5.913884067217686e-06, "loss": 0.5424, "step": 1455 }, { "epoch": 3.165262299537918, "grad_norm": 0.0705757662653923, "learning_rate": 5.901393965271762e-06, "loss": 0.5174, "step": 1456 }, { "epoch": 3.167436803479206, "grad_norm": 0.0712607353925705, "learning_rate": 5.888911543489472e-06, "loss": 0.531, "step": 1457 }, { "epoch": 3.1696113074204946, "grad_norm": 0.06296918541193008, "learning_rate": 5.876436825260967e-06, "loss": 0.4628, "step": 1458 }, { "epoch": 3.171785811361783, "grad_norm": 0.06683739274740219, "learning_rate": 5.86396983396197e-06, "loss": 0.4874, "step": 1459 }, { "epoch": 3.1739603153030713, "grad_norm": 0.0695076435804367, "learning_rate": 5.851510592953729e-06, "loss": 0.5598, "step": 1460 }, { "epoch": 3.1761348192443597, "grad_norm": 0.08145108819007874, "learning_rate": 5.839059125582964e-06, "loss": 0.5819, "step": 1461 }, { "epoch": 3.178309323185648, "grad_norm": 0.0863773450255394, "learning_rate": 5.8266154551818225e-06, "loss": 0.4501, "step": 1462 }, { "epoch": 3.1804838271269364, "grad_norm": 0.07229464501142502, "learning_rate": 5.814179605067856e-06, "loss": 0.4857, "step": 1463 }, { "epoch": 3.1826583310682253, "grad_norm": 0.0719052404165268, "learning_rate": 5.801751598543947e-06, "loss": 0.5333, "step": 1464 }, { "epoch": 3.1848328350095136, "grad_norm": 0.07272771745920181, "learning_rate": 5.7893314588982905e-06, "loss": 0.5321, "step": 1465 }, { "epoch": 3.187007338950802, "grad_norm": 0.07322679460048676, "learning_rate": 5.776919209404342e-06, "loss": 0.5754, "step": 1466 }, { "epoch": 3.1891818428920904, "grad_norm": 0.07561824470758438, "learning_rate": 5.764514873320761e-06, "loss": 0.6539, "step": 1467 }, { "epoch": 3.1913563468333788, "grad_norm": 0.0636778250336647, "learning_rate": 5.752118473891379e-06, "loss": 0.4145, "step": 1468 }, { "epoch": 3.193530850774667, "grad_norm": 0.06821925938129425, "learning_rate": 5.739730034345177e-06, "loss": 0.5242, "step": 1469 }, { "epoch": 3.1957053547159555, "grad_norm": 0.07246805727481842, "learning_rate": 5.727349577896194e-06, "loss": 0.5262, "step": 1470 }, { "epoch": 3.197879858657244, "grad_norm": 0.0750991553068161, "learning_rate": 5.714977127743517e-06, "loss": 0.6492, "step": 1471 }, { "epoch": 3.2000543625985323, "grad_norm": 0.07709258049726486, "learning_rate": 5.702612707071245e-06, "loss": 0.491, "step": 1472 }, { "epoch": 3.2022288665398206, "grad_norm": 0.07309066504240036, "learning_rate": 5.6902563390484025e-06, "loss": 0.498, "step": 1473 }, { "epoch": 3.204403370481109, "grad_norm": 0.07608596235513687, "learning_rate": 5.677908046828961e-06, "loss": 0.5772, "step": 1474 }, { "epoch": 3.2065778744223974, "grad_norm": 0.076353520154953, "learning_rate": 5.66556785355173e-06, "loss": 0.5636, "step": 1475 }, { "epoch": 3.2087523783636858, "grad_norm": 0.06181556358933449, "learning_rate": 5.653235782340351e-06, "loss": 0.4989, "step": 1476 }, { "epoch": 3.210926882304974, "grad_norm": 0.06924404203891754, "learning_rate": 5.640911856303255e-06, "loss": 0.5696, "step": 1477 }, { "epoch": 3.2131013862462625, "grad_norm": 0.0647667869925499, "learning_rate": 5.628596098533592e-06, "loss": 0.429, "step": 1478 }, { "epoch": 3.215275890187551, "grad_norm": 0.07007181644439697, "learning_rate": 5.616288532109225e-06, "loss": 0.5754, "step": 1479 }, { "epoch": 3.2174503941288393, "grad_norm": 0.06692586094141006, "learning_rate": 5.603989180092661e-06, "loss": 0.4486, "step": 1480 }, { "epoch": 3.2196248980701276, "grad_norm": 0.06372535228729248, "learning_rate": 5.59169806553101e-06, "loss": 0.5077, "step": 1481 }, { "epoch": 3.221799402011416, "grad_norm": 0.08127161860466003, "learning_rate": 5.579415211455941e-06, "loss": 0.9809, "step": 1482 }, { "epoch": 3.2239739059527044, "grad_norm": 0.07300285249948502, "learning_rate": 5.567140640883666e-06, "loss": 0.5117, "step": 1483 }, { "epoch": 3.2261484098939928, "grad_norm": 0.0771259069442749, "learning_rate": 5.554874376814856e-06, "loss": 0.5193, "step": 1484 }, { "epoch": 3.228322913835281, "grad_norm": 0.06706356257200241, "learning_rate": 5.542616442234618e-06, "loss": 0.5221, "step": 1485 }, { "epoch": 3.2304974177765695, "grad_norm": 0.06944621354341507, "learning_rate": 5.5303668601124615e-06, "loss": 0.5029, "step": 1486 }, { "epoch": 3.2326719217178583, "grad_norm": 0.07193751633167267, "learning_rate": 5.518125653402232e-06, "loss": 0.5924, "step": 1487 }, { "epoch": 3.2348464256591467, "grad_norm": 0.07443910837173462, "learning_rate": 5.505892845042089e-06, "loss": 0.5617, "step": 1488 }, { "epoch": 3.237020929600435, "grad_norm": 0.06898069381713867, "learning_rate": 5.493668457954458e-06, "loss": 0.493, "step": 1489 }, { "epoch": 3.2391954335417235, "grad_norm": 0.0673757940530777, "learning_rate": 5.481452515045974e-06, "loss": 0.5501, "step": 1490 }, { "epoch": 3.241369937483012, "grad_norm": 0.08749575167894363, "learning_rate": 5.469245039207452e-06, "loss": 0.5735, "step": 1491 }, { "epoch": 3.2435444414243, "grad_norm": 0.06609432399272919, "learning_rate": 5.457046053313844e-06, "loss": 0.4586, "step": 1492 }, { "epoch": 3.2457189453655886, "grad_norm": 0.0705268532037735, "learning_rate": 5.444855580224197e-06, "loss": 0.5275, "step": 1493 }, { "epoch": 3.247893449306877, "grad_norm": 0.06885748356580734, "learning_rate": 5.432673642781595e-06, "loss": 0.5414, "step": 1494 }, { "epoch": 3.2500679532481653, "grad_norm": 0.06610836088657379, "learning_rate": 5.420500263813141e-06, "loss": 0.5158, "step": 1495 }, { "epoch": 3.2522424571894537, "grad_norm": 0.06912900507450104, "learning_rate": 5.4083354661298816e-06, "loss": 0.5321, "step": 1496 }, { "epoch": 3.254416961130742, "grad_norm": 0.07417958229780197, "learning_rate": 5.39617927252681e-06, "loss": 0.5163, "step": 1497 }, { "epoch": 3.2565914650720305, "grad_norm": 0.06548818200826645, "learning_rate": 5.384031705782776e-06, "loss": 0.4727, "step": 1498 }, { "epoch": 3.258765969013319, "grad_norm": 0.0695524513721466, "learning_rate": 5.371892788660465e-06, "loss": 0.5171, "step": 1499 }, { "epoch": 3.260940472954607, "grad_norm": 0.07387632131576538, "learning_rate": 5.3597625439063685e-06, "loss": 0.6274, "step": 1500 }, { "epoch": 3.2631149768958956, "grad_norm": 0.09006468951702118, "learning_rate": 5.347640994250709e-06, "loss": 0.6872, "step": 1501 }, { "epoch": 3.265289480837184, "grad_norm": 0.06723899394273758, "learning_rate": 5.335528162407428e-06, "loss": 0.5406, "step": 1502 }, { "epoch": 3.2674639847784723, "grad_norm": 0.07188381254673004, "learning_rate": 5.3234240710741335e-06, "loss": 0.5613, "step": 1503 }, { "epoch": 3.2696384887197607, "grad_norm": 0.06514464318752289, "learning_rate": 5.311328742932045e-06, "loss": 0.4948, "step": 1504 }, { "epoch": 3.271812992661049, "grad_norm": 0.06981280446052551, "learning_rate": 5.299242200645959e-06, "loss": 0.5161, "step": 1505 }, { "epoch": 3.2739874966023375, "grad_norm": 0.07136043906211853, "learning_rate": 5.28716446686423e-06, "loss": 0.5586, "step": 1506 }, { "epoch": 3.276162000543626, "grad_norm": 0.08772621303796768, "learning_rate": 5.275095564218683e-06, "loss": 0.5153, "step": 1507 }, { "epoch": 3.278336504484914, "grad_norm": 0.0654069259762764, "learning_rate": 5.263035515324601e-06, "loss": 0.4687, "step": 1508 }, { "epoch": 3.2805110084262026, "grad_norm": 0.06877514719963074, "learning_rate": 5.2509843427806895e-06, "loss": 0.4796, "step": 1509 }, { "epoch": 3.2826855123674914, "grad_norm": 0.07644962519407272, "learning_rate": 5.238942069169e-06, "loss": 0.6195, "step": 1510 }, { "epoch": 3.2848600163087793, "grad_norm": 0.06615381687879562, "learning_rate": 5.226908717054923e-06, "loss": 0.4852, "step": 1511 }, { "epoch": 3.287034520250068, "grad_norm": 0.06828905642032623, "learning_rate": 5.214884308987136e-06, "loss": 0.4183, "step": 1512 }, { "epoch": 3.2892090241913565, "grad_norm": 0.07456020265817642, "learning_rate": 5.202868867497542e-06, "loss": 0.5605, "step": 1513 }, { "epoch": 3.291383528132645, "grad_norm": 0.05766000971198082, "learning_rate": 5.1908624151012444e-06, "loss": 0.4267, "step": 1514 }, { "epoch": 3.2935580320739333, "grad_norm": 0.06423773616552353, "learning_rate": 5.178864974296511e-06, "loss": 0.5164, "step": 1515 }, { "epoch": 3.2957325360152216, "grad_norm": 0.08719263970851898, "learning_rate": 5.166876567564725e-06, "loss": 0.6394, "step": 1516 }, { "epoch": 3.29790703995651, "grad_norm": 0.0771080031991005, "learning_rate": 5.154897217370325e-06, "loss": 0.5582, "step": 1517 }, { "epoch": 3.3000815438977984, "grad_norm": 0.06579708307981491, "learning_rate": 5.142926946160799e-06, "loss": 0.4431, "step": 1518 }, { "epoch": 3.3022560478390868, "grad_norm": 0.07202615588903427, "learning_rate": 5.1309657763665984e-06, "loss": 0.6164, "step": 1519 }, { "epoch": 3.304430551780375, "grad_norm": 0.06840148568153381, "learning_rate": 5.119013730401152e-06, "loss": 0.5064, "step": 1520 }, { "epoch": 3.3066050557216635, "grad_norm": 0.07182037830352783, "learning_rate": 5.107070830660765e-06, "loss": 0.5071, "step": 1521 }, { "epoch": 3.308779559662952, "grad_norm": 0.07210621237754822, "learning_rate": 5.0951370995246104e-06, "loss": 0.4517, "step": 1522 }, { "epoch": 3.3109540636042403, "grad_norm": 0.06664469093084335, "learning_rate": 5.08321255935469e-06, "loss": 0.5221, "step": 1523 }, { "epoch": 3.3131285675455286, "grad_norm": 0.06676912307739258, "learning_rate": 5.0712972324957686e-06, "loss": 0.5516, "step": 1524 }, { "epoch": 3.315303071486817, "grad_norm": 0.06652951240539551, "learning_rate": 5.059391141275358e-06, "loss": 0.4839, "step": 1525 }, { "epoch": 3.3174775754281054, "grad_norm": 0.07276313006877899, "learning_rate": 5.047494308003666e-06, "loss": 0.5368, "step": 1526 }, { "epoch": 3.3196520793693938, "grad_norm": 0.07062160223722458, "learning_rate": 5.0356067549735384e-06, "loss": 0.5689, "step": 1527 }, { "epoch": 3.321826583310682, "grad_norm": 0.06839644908905029, "learning_rate": 5.023728504460441e-06, "loss": 0.5474, "step": 1528 }, { "epoch": 3.3240010872519705, "grad_norm": 0.07201429456472397, "learning_rate": 5.011859578722408e-06, "loss": 0.5872, "step": 1529 }, { "epoch": 3.326175591193259, "grad_norm": 0.06830208748579025, "learning_rate": 5.000000000000003e-06, "loss": 0.5062, "step": 1530 }, { "epoch": 3.3283500951345473, "grad_norm": 0.06735843420028687, "learning_rate": 4.9881497905162625e-06, "loss": 0.5939, "step": 1531 }, { "epoch": 3.3305245990758356, "grad_norm": 0.07623067498207092, "learning_rate": 4.976308972476682e-06, "loss": 0.6393, "step": 1532 }, { "epoch": 3.3326991030171245, "grad_norm": 0.07525830715894699, "learning_rate": 4.964477568069146e-06, "loss": 0.4551, "step": 1533 }, { "epoch": 3.3348736069584124, "grad_norm": 0.06960330903530121, "learning_rate": 4.952655599463907e-06, "loss": 0.5311, "step": 1534 }, { "epoch": 3.337048110899701, "grad_norm": 0.06668455898761749, "learning_rate": 4.940843088813537e-06, "loss": 0.4814, "step": 1535 }, { "epoch": 3.3392226148409896, "grad_norm": 0.07341435551643372, "learning_rate": 4.929040058252882e-06, "loss": 0.5188, "step": 1536 }, { "epoch": 3.341397118782278, "grad_norm": 0.06989917904138565, "learning_rate": 4.917246529899017e-06, "loss": 0.5785, "step": 1537 }, { "epoch": 3.3435716227235663, "grad_norm": 0.06539507955312729, "learning_rate": 4.9054625258512226e-06, "loss": 0.6527, "step": 1538 }, { "epoch": 3.3457461266648547, "grad_norm": 0.09122632443904877, "learning_rate": 4.893688068190933e-06, "loss": 0.5692, "step": 1539 }, { "epoch": 3.347920630606143, "grad_norm": 0.07049881666898727, "learning_rate": 4.881923178981681e-06, "loss": 0.5568, "step": 1540 }, { "epoch": 3.3500951345474315, "grad_norm": 0.06411641836166382, "learning_rate": 4.870167880269085e-06, "loss": 0.4351, "step": 1541 }, { "epoch": 3.35226963848872, "grad_norm": 0.08124033361673355, "learning_rate": 4.858422194080777e-06, "loss": 0.5654, "step": 1542 }, { "epoch": 3.354444142430008, "grad_norm": 0.0763922855257988, "learning_rate": 4.846686142426389e-06, "loss": 0.6137, "step": 1543 }, { "epoch": 3.3566186463712966, "grad_norm": 0.07945924997329712, "learning_rate": 4.834959747297498e-06, "loss": 0.5628, "step": 1544 }, { "epoch": 3.358793150312585, "grad_norm": 0.07352941483259201, "learning_rate": 4.823243030667576e-06, "loss": 0.6252, "step": 1545 }, { "epoch": 3.3609676542538733, "grad_norm": 0.06980722397565842, "learning_rate": 4.811536014491972e-06, "loss": 0.6106, "step": 1546 }, { "epoch": 3.3631421581951617, "grad_norm": 0.08611784130334854, "learning_rate": 4.799838720707847e-06, "loss": 0.5496, "step": 1547 }, { "epoch": 3.36531666213645, "grad_norm": 0.06872288882732391, "learning_rate": 4.788151171234149e-06, "loss": 0.4895, "step": 1548 }, { "epoch": 3.3674911660777385, "grad_norm": 0.07305479049682617, "learning_rate": 4.7764733879715706e-06, "loss": 0.5674, "step": 1549 }, { "epoch": 3.369665670019027, "grad_norm": 0.06990917772054672, "learning_rate": 4.764805392802497e-06, "loss": 0.5014, "step": 1550 }, { "epoch": 3.371840173960315, "grad_norm": 0.07433905452489853, "learning_rate": 4.753147207590972e-06, "loss": 0.565, "step": 1551 }, { "epoch": 3.3740146779016036, "grad_norm": 0.06916124373674393, "learning_rate": 4.741498854182659e-06, "loss": 0.5088, "step": 1552 }, { "epoch": 3.376189181842892, "grad_norm": 0.070643350481987, "learning_rate": 4.729860354404805e-06, "loss": 0.6583, "step": 1553 }, { "epoch": 3.3783636857841803, "grad_norm": 0.06502692401409149, "learning_rate": 4.718231730066179e-06, "loss": 0.5288, "step": 1554 }, { "epoch": 3.3805381897254687, "grad_norm": 0.07164804637432098, "learning_rate": 4.70661300295706e-06, "loss": 0.5078, "step": 1555 }, { "epoch": 3.382712693666757, "grad_norm": 0.06688214838504791, "learning_rate": 4.695004194849166e-06, "loss": 0.5021, "step": 1556 }, { "epoch": 3.3848871976080455, "grad_norm": 0.06833060085773468, "learning_rate": 4.683405327495638e-06, "loss": 0.4982, "step": 1557 }, { "epoch": 3.3870617015493343, "grad_norm": 0.07673270255327225, "learning_rate": 4.671816422630993e-06, "loss": 0.5069, "step": 1558 }, { "epoch": 3.389236205490622, "grad_norm": 0.07603961229324341, "learning_rate": 4.66023750197107e-06, "loss": 0.6878, "step": 1559 }, { "epoch": 3.391410709431911, "grad_norm": 0.06626172363758087, "learning_rate": 4.648668587212998e-06, "loss": 0.47, "step": 1560 }, { "epoch": 3.3935852133731994, "grad_norm": 0.07545891404151917, "learning_rate": 4.637109700035166e-06, "loss": 0.5171, "step": 1561 }, { "epoch": 3.395759717314488, "grad_norm": 0.07178366929292679, "learning_rate": 4.625560862097174e-06, "loss": 0.5424, "step": 1562 }, { "epoch": 3.397934221255776, "grad_norm": 0.06831347942352295, "learning_rate": 4.614022095039776e-06, "loss": 0.5185, "step": 1563 }, { "epoch": 3.4001087251970645, "grad_norm": 0.06941493600606918, "learning_rate": 4.6024934204848745e-06, "loss": 0.6289, "step": 1564 }, { "epoch": 3.402283229138353, "grad_norm": 0.07439285516738892, "learning_rate": 4.5909748600354395e-06, "loss": 0.5527, "step": 1565 }, { "epoch": 3.4044577330796413, "grad_norm": 0.06580470502376556, "learning_rate": 4.579466435275506e-06, "loss": 0.5656, "step": 1566 }, { "epoch": 3.4066322370209297, "grad_norm": 0.07183687388896942, "learning_rate": 4.567968167770113e-06, "loss": 0.5052, "step": 1567 }, { "epoch": 3.408806740962218, "grad_norm": 0.06774251908063889, "learning_rate": 4.556480079065253e-06, "loss": 0.5257, "step": 1568 }, { "epoch": 3.4109812449035064, "grad_norm": 0.07196760177612305, "learning_rate": 4.545002190687865e-06, "loss": 0.5614, "step": 1569 }, { "epoch": 3.413155748844795, "grad_norm": 0.06887226551771164, "learning_rate": 4.533534524145756e-06, "loss": 0.4698, "step": 1570 }, { "epoch": 3.415330252786083, "grad_norm": 0.07339056581258774, "learning_rate": 4.522077100927591e-06, "loss": 0.5851, "step": 1571 }, { "epoch": 3.4175047567273715, "grad_norm": 0.07777920365333557, "learning_rate": 4.510629942502839e-06, "loss": 0.6272, "step": 1572 }, { "epoch": 3.41967926066866, "grad_norm": 0.06552041321992874, "learning_rate": 4.499193070321729e-06, "loss": 0.4894, "step": 1573 }, { "epoch": 3.4218537646099483, "grad_norm": 0.06856339424848557, "learning_rate": 4.487766505815215e-06, "loss": 0.4799, "step": 1574 }, { "epoch": 3.4240282685512367, "grad_norm": 0.06671535223722458, "learning_rate": 4.476350270394942e-06, "loss": 0.5321, "step": 1575 }, { "epoch": 3.426202772492525, "grad_norm": 0.06572539359331131, "learning_rate": 4.4649443854532005e-06, "loss": 0.503, "step": 1576 }, { "epoch": 3.4283772764338134, "grad_norm": 0.06928521394729614, "learning_rate": 4.453548872362875e-06, "loss": 0.4712, "step": 1577 }, { "epoch": 3.430551780375102, "grad_norm": 0.0818856880068779, "learning_rate": 4.442163752477429e-06, "loss": 0.5539, "step": 1578 }, { "epoch": 3.43272628431639, "grad_norm": 0.07081060856580734, "learning_rate": 4.430789047130836e-06, "loss": 0.5313, "step": 1579 }, { "epoch": 3.4349007882576785, "grad_norm": 0.06708762794733047, "learning_rate": 4.419424777637565e-06, "loss": 0.4778, "step": 1580 }, { "epoch": 3.4370752921989673, "grad_norm": 0.059369493275880814, "learning_rate": 4.408070965292534e-06, "loss": 0.4272, "step": 1581 }, { "epoch": 3.4392497961402553, "grad_norm": 0.07420162856578827, "learning_rate": 4.396727631371049e-06, "loss": 0.5157, "step": 1582 }, { "epoch": 3.441424300081544, "grad_norm": 0.06663721799850464, "learning_rate": 4.385394797128791e-06, "loss": 0.4737, "step": 1583 }, { "epoch": 3.4435988040228325, "grad_norm": 0.0721513032913208, "learning_rate": 4.374072483801769e-06, "loss": 0.5582, "step": 1584 }, { "epoch": 3.445773307964121, "grad_norm": 0.07577421516180038, "learning_rate": 4.362760712606278e-06, "loss": 0.4891, "step": 1585 }, { "epoch": 3.447947811905409, "grad_norm": 0.06901760399341583, "learning_rate": 4.351459504738844e-06, "loss": 0.5231, "step": 1586 }, { "epoch": 3.4501223158466976, "grad_norm": 0.07240114361047745, "learning_rate": 4.340168881376222e-06, "loss": 0.4832, "step": 1587 }, { "epoch": 3.452296819787986, "grad_norm": 0.09390680491924286, "learning_rate": 4.32888886367531e-06, "loss": 0.4841, "step": 1588 }, { "epoch": 3.4544713237292743, "grad_norm": 0.06724991649389267, "learning_rate": 4.31761947277315e-06, "loss": 0.4726, "step": 1589 }, { "epoch": 3.4566458276705627, "grad_norm": 0.07455750554800034, "learning_rate": 4.306360729786867e-06, "loss": 0.6207, "step": 1590 }, { "epoch": 3.458820331611851, "grad_norm": 0.06928347796201706, "learning_rate": 4.295112655813622e-06, "loss": 0.5948, "step": 1591 }, { "epoch": 3.4609948355531395, "grad_norm": 0.06793619692325592, "learning_rate": 4.283875271930603e-06, "loss": 0.4938, "step": 1592 }, { "epoch": 3.463169339494428, "grad_norm": 0.07624325156211853, "learning_rate": 4.272648599194948e-06, "loss": 0.5101, "step": 1593 }, { "epoch": 3.465343843435716, "grad_norm": 0.07188382744789124, "learning_rate": 4.261432658643736e-06, "loss": 0.5282, "step": 1594 }, { "epoch": 3.4675183473770046, "grad_norm": 0.06761882454156876, "learning_rate": 4.2502274712939355e-06, "loss": 0.5629, "step": 1595 }, { "epoch": 3.469692851318293, "grad_norm": 0.0818469449877739, "learning_rate": 4.2390330581423564e-06, "loss": 0.6544, "step": 1596 }, { "epoch": 3.4718673552595813, "grad_norm": 0.07057392597198486, "learning_rate": 4.227849440165623e-06, "loss": 0.5442, "step": 1597 }, { "epoch": 3.4740418592008697, "grad_norm": 0.06822724640369415, "learning_rate": 4.216676638320135e-06, "loss": 0.5586, "step": 1598 }, { "epoch": 3.476216363142158, "grad_norm": 0.08195033669471741, "learning_rate": 4.205514673542025e-06, "loss": 0.4655, "step": 1599 }, { "epoch": 3.4783908670834465, "grad_norm": 0.07015779614448547, "learning_rate": 4.1943635667471095e-06, "loss": 0.5585, "step": 1600 }, { "epoch": 3.480565371024735, "grad_norm": 0.06148897483944893, "learning_rate": 4.18322333883087e-06, "loss": 0.5128, "step": 1601 }, { "epoch": 3.482739874966023, "grad_norm": 0.07323181629180908, "learning_rate": 4.1720940106683915e-06, "loss": 0.4473, "step": 1602 }, { "epoch": 3.4849143789073116, "grad_norm": 0.07334697246551514, "learning_rate": 4.1609756031143445e-06, "loss": 0.5552, "step": 1603 }, { "epoch": 3.4870888828486, "grad_norm": 0.0720314234495163, "learning_rate": 4.149868137002934e-06, "loss": 0.6026, "step": 1604 }, { "epoch": 3.4892633867898883, "grad_norm": 0.0617394745349884, "learning_rate": 4.138771633147856e-06, "loss": 0.5025, "step": 1605 }, { "epoch": 3.491437890731177, "grad_norm": 0.06516285240650177, "learning_rate": 4.127686112342268e-06, "loss": 0.4616, "step": 1606 }, { "epoch": 3.4936123946724655, "grad_norm": 0.08338207751512527, "learning_rate": 4.11661159535875e-06, "loss": 0.6076, "step": 1607 }, { "epoch": 3.495786898613754, "grad_norm": 0.07722815871238708, "learning_rate": 4.105548102949265e-06, "loss": 0.6141, "step": 1608 }, { "epoch": 3.4979614025550423, "grad_norm": 0.0740092396736145, "learning_rate": 4.0944956558451055e-06, "loss": 0.5238, "step": 1609 }, { "epoch": 3.5001359064963307, "grad_norm": 0.06997238099575043, "learning_rate": 4.083454274756881e-06, "loss": 0.5724, "step": 1610 }, { "epoch": 3.502310410437619, "grad_norm": 0.07041463255882263, "learning_rate": 4.0724239803744524e-06, "loss": 0.4798, "step": 1611 }, { "epoch": 3.5044849143789074, "grad_norm": 0.06757752597332001, "learning_rate": 4.061404793366914e-06, "loss": 0.4933, "step": 1612 }, { "epoch": 3.506659418320196, "grad_norm": 0.07441062480211258, "learning_rate": 4.05039673438255e-06, "loss": 0.5521, "step": 1613 }, { "epoch": 3.508833922261484, "grad_norm": 0.08457400649785995, "learning_rate": 4.039399824048777e-06, "loss": 0.6206, "step": 1614 }, { "epoch": 3.5110084262027725, "grad_norm": 0.0716620609164238, "learning_rate": 4.028414082972141e-06, "loss": 0.5603, "step": 1615 }, { "epoch": 3.513182930144061, "grad_norm": 0.07795149087905884, "learning_rate": 4.017439531738238e-06, "loss": 0.5924, "step": 1616 }, { "epoch": 3.5153574340853493, "grad_norm": 0.06564310938119888, "learning_rate": 4.00647619091171e-06, "loss": 0.4997, "step": 1617 }, { "epoch": 3.5175319380266377, "grad_norm": 0.07460669428110123, "learning_rate": 3.9955240810361926e-06, "loss": 0.5491, "step": 1618 }, { "epoch": 3.519706441967926, "grad_norm": 0.0668177604675293, "learning_rate": 3.9845832226342696e-06, "loss": 0.5085, "step": 1619 }, { "epoch": 3.5218809459092144, "grad_norm": 0.07337861508131027, "learning_rate": 3.973653636207437e-06, "loss": 0.5576, "step": 1620 }, { "epoch": 3.524055449850503, "grad_norm": 0.06678582727909088, "learning_rate": 3.962735342236082e-06, "loss": 0.5016, "step": 1621 }, { "epoch": 3.526229953791791, "grad_norm": 0.06612756848335266, "learning_rate": 3.9518283611794286e-06, "loss": 0.5991, "step": 1622 }, { "epoch": 3.5284044577330795, "grad_norm": 0.06388785690069199, "learning_rate": 3.940932713475489e-06, "loss": 0.48, "step": 1623 }, { "epoch": 3.530578961674368, "grad_norm": 0.07913524657487869, "learning_rate": 3.930048419541057e-06, "loss": 0.5862, "step": 1624 }, { "epoch": 3.5327534656156563, "grad_norm": 0.0750771164894104, "learning_rate": 3.919175499771635e-06, "loss": 0.5568, "step": 1625 }, { "epoch": 3.5349279695569447, "grad_norm": 0.07405316084623337, "learning_rate": 3.908313974541422e-06, "loss": 0.4458, "step": 1626 }, { "epoch": 3.5371024734982335, "grad_norm": 0.07131489366292953, "learning_rate": 3.897463864203266e-06, "loss": 0.563, "step": 1627 }, { "epoch": 3.5392769774395214, "grad_norm": 0.08221332728862762, "learning_rate": 3.886625189088617e-06, "loss": 0.4813, "step": 1628 }, { "epoch": 3.5414514813808102, "grad_norm": 0.07384412735700607, "learning_rate": 3.875797969507502e-06, "loss": 0.6229, "step": 1629 }, { "epoch": 3.543625985322098, "grad_norm": 0.06452514976263046, "learning_rate": 3.864982225748481e-06, "loss": 0.5414, "step": 1630 }, { "epoch": 3.545800489263387, "grad_norm": 0.07242099940776825, "learning_rate": 3.854177978078617e-06, "loss": 0.6115, "step": 1631 }, { "epoch": 3.547974993204675, "grad_norm": 0.07028739899396896, "learning_rate": 3.8433852467434175e-06, "loss": 0.6015, "step": 1632 }, { "epoch": 3.5501494971459637, "grad_norm": 0.07434529066085815, "learning_rate": 3.832604051966825e-06, "loss": 0.7877, "step": 1633 }, { "epoch": 3.552324001087252, "grad_norm": 0.07795484364032745, "learning_rate": 3.821834413951148e-06, "loss": 0.5229, "step": 1634 }, { "epoch": 3.5544985050285405, "grad_norm": 0.0649869292974472, "learning_rate": 3.8110763528770543e-06, "loss": 0.4506, "step": 1635 }, { "epoch": 3.556673008969829, "grad_norm": 0.08068155497312546, "learning_rate": 3.8003298889035146e-06, "loss": 0.5019, "step": 1636 }, { "epoch": 3.5588475129111172, "grad_norm": 0.07373400777578354, "learning_rate": 3.789595042167763e-06, "loss": 0.5502, "step": 1637 }, { "epoch": 3.5610220168524056, "grad_norm": 0.08254169672727585, "learning_rate": 3.7788718327852625e-06, "loss": 0.6376, "step": 1638 }, { "epoch": 3.563196520793694, "grad_norm": 0.07297603040933609, "learning_rate": 3.768160280849681e-06, "loss": 0.4586, "step": 1639 }, { "epoch": 3.5653710247349824, "grad_norm": 0.06847668439149857, "learning_rate": 3.7574604064328336e-06, "loss": 0.494, "step": 1640 }, { "epoch": 3.5675455286762707, "grad_norm": 0.07045603543519974, "learning_rate": 3.7467722295846596e-06, "loss": 0.5178, "step": 1641 }, { "epoch": 3.569720032617559, "grad_norm": 0.06430304050445557, "learning_rate": 3.7360957703331734e-06, "loss": 0.4414, "step": 1642 }, { "epoch": 3.5718945365588475, "grad_norm": 0.0768921747803688, "learning_rate": 3.725431048684428e-06, "loss": 0.5433, "step": 1643 }, { "epoch": 3.574069040500136, "grad_norm": 0.08268947154283524, "learning_rate": 3.7147780846224922e-06, "loss": 0.7086, "step": 1644 }, { "epoch": 3.5762435444414242, "grad_norm": 0.06697680056095123, "learning_rate": 3.704136898109403e-06, "loss": 0.482, "step": 1645 }, { "epoch": 3.5784180483827126, "grad_norm": 0.06842968612909317, "learning_rate": 3.693507509085115e-06, "loss": 0.5546, "step": 1646 }, { "epoch": 3.580592552324001, "grad_norm": 0.09143637120723724, "learning_rate": 3.6828899374674933e-06, "loss": 0.5008, "step": 1647 }, { "epoch": 3.5827670562652894, "grad_norm": 0.08130817115306854, "learning_rate": 3.6722842031522423e-06, "loss": 0.5707, "step": 1648 }, { "epoch": 3.5849415602065777, "grad_norm": 0.07450143247842789, "learning_rate": 3.661690326012897e-06, "loss": 0.5512, "step": 1649 }, { "epoch": 3.5871160641478665, "grad_norm": 0.07833284139633179, "learning_rate": 3.651108325900773e-06, "loss": 0.7612, "step": 1650 }, { "epoch": 3.5892905680891545, "grad_norm": 0.06853444129228592, "learning_rate": 3.640538222644925e-06, "loss": 0.5491, "step": 1651 }, { "epoch": 3.5914650720304433, "grad_norm": 0.07012240588665009, "learning_rate": 3.6299800360521133e-06, "loss": 0.6195, "step": 1652 }, { "epoch": 3.5936395759717312, "grad_norm": 0.08076727390289307, "learning_rate": 3.619433785906775e-06, "loss": 0.5506, "step": 1653 }, { "epoch": 3.59581407991302, "grad_norm": 0.057920973747968674, "learning_rate": 3.6088994919709795e-06, "loss": 0.4377, "step": 1654 }, { "epoch": 3.597988583854308, "grad_norm": 0.06937568634748459, "learning_rate": 3.5983771739843855e-06, "loss": 0.56, "step": 1655 }, { "epoch": 3.600163087795597, "grad_norm": 0.06845054775476456, "learning_rate": 3.587866851664219e-06, "loss": 0.6253, "step": 1656 }, { "epoch": 3.602337591736885, "grad_norm": 0.07248114794492722, "learning_rate": 3.5773685447052177e-06, "loss": 0.5382, "step": 1657 }, { "epoch": 3.6045120956781735, "grad_norm": 0.06525639444589615, "learning_rate": 3.566882272779614e-06, "loss": 0.4878, "step": 1658 }, { "epoch": 3.606686599619462, "grad_norm": 0.07298415899276733, "learning_rate": 3.556408055537087e-06, "loss": 0.5085, "step": 1659 }, { "epoch": 3.6088611035607503, "grad_norm": 0.07356373965740204, "learning_rate": 3.5459459126047226e-06, "loss": 0.4669, "step": 1660 }, { "epoch": 3.6110356075020387, "grad_norm": 0.08281216025352478, "learning_rate": 3.535495863586981e-06, "loss": 0.7012, "step": 1661 }, { "epoch": 3.613210111443327, "grad_norm": 0.07526984065771103, "learning_rate": 3.525057928065664e-06, "loss": 0.4667, "step": 1662 }, { "epoch": 3.6153846153846154, "grad_norm": 0.06773357093334198, "learning_rate": 3.514632125599876e-06, "loss": 0.4426, "step": 1663 }, { "epoch": 3.617559119325904, "grad_norm": 0.06305951625108719, "learning_rate": 3.5042184757259844e-06, "loss": 0.4835, "step": 1664 }, { "epoch": 3.619733623267192, "grad_norm": 0.06962303072214127, "learning_rate": 3.493816997957582e-06, "loss": 0.526, "step": 1665 }, { "epoch": 3.6219081272084805, "grad_norm": 0.06271009892225266, "learning_rate": 3.483427711785449e-06, "loss": 0.5395, "step": 1666 }, { "epoch": 3.624082631149769, "grad_norm": 0.06922087073326111, "learning_rate": 3.4730506366775306e-06, "loss": 0.5308, "step": 1667 }, { "epoch": 3.6262571350910573, "grad_norm": 0.07890098541975021, "learning_rate": 3.462685792078888e-06, "loss": 0.4658, "step": 1668 }, { "epoch": 3.6284316390323457, "grad_norm": 0.07264728844165802, "learning_rate": 3.452333197411657e-06, "loss": 0.6395, "step": 1669 }, { "epoch": 3.630606142973634, "grad_norm": 0.0841958299279213, "learning_rate": 3.4419928720750274e-06, "loss": 0.6548, "step": 1670 }, { "epoch": 3.6327806469149224, "grad_norm": 0.06342515349388123, "learning_rate": 3.4316648354451896e-06, "loss": 0.4721, "step": 1671 }, { "epoch": 3.634955150856211, "grad_norm": 0.06943533569574356, "learning_rate": 3.4213491068753135e-06, "loss": 0.5108, "step": 1672 }, { "epoch": 3.637129654797499, "grad_norm": 0.06724522262811661, "learning_rate": 3.4110457056955095e-06, "loss": 0.4769, "step": 1673 }, { "epoch": 3.6393041587387875, "grad_norm": 0.07475198060274124, "learning_rate": 3.4007546512127764e-06, "loss": 0.5348, "step": 1674 }, { "epoch": 3.6414786626800764, "grad_norm": 0.07332072407007217, "learning_rate": 3.3904759627109828e-06, "loss": 0.5632, "step": 1675 }, { "epoch": 3.6436531666213643, "grad_norm": 0.07312526553869247, "learning_rate": 3.380209659450827e-06, "loss": 0.4416, "step": 1676 }, { "epoch": 3.645827670562653, "grad_norm": 0.0672493577003479, "learning_rate": 3.369955760669802e-06, "loss": 0.5342, "step": 1677 }, { "epoch": 3.648002174503941, "grad_norm": 0.05725240334868431, "learning_rate": 3.359714285582146e-06, "loss": 0.4776, "step": 1678 }, { "epoch": 3.65017667844523, "grad_norm": 0.07789724320173264, "learning_rate": 3.3494852533788313e-06, "loss": 0.7188, "step": 1679 }, { "epoch": 3.6523511823865182, "grad_norm": 0.06502869725227356, "learning_rate": 3.339268683227499e-06, "loss": 0.4975, "step": 1680 }, { "epoch": 3.6545256863278066, "grad_norm": 0.07857832312583923, "learning_rate": 3.329064594272451e-06, "loss": 0.5907, "step": 1681 }, { "epoch": 3.656700190269095, "grad_norm": 0.07362460345029831, "learning_rate": 3.3188730056345974e-06, "loss": 0.5447, "step": 1682 }, { "epoch": 3.6588746942103834, "grad_norm": 0.0682707279920578, "learning_rate": 3.308693936411421e-06, "loss": 0.5348, "step": 1683 }, { "epoch": 3.6610491981516717, "grad_norm": 0.0751461461186409, "learning_rate": 3.2985274056769445e-06, "loss": 0.6185, "step": 1684 }, { "epoch": 3.66322370209296, "grad_norm": 0.0691172257065773, "learning_rate": 3.288373432481703e-06, "loss": 0.5284, "step": 1685 }, { "epoch": 3.6653982060342485, "grad_norm": 0.07042500376701355, "learning_rate": 3.2782320358526933e-06, "loss": 0.5179, "step": 1686 }, { "epoch": 3.667572709975537, "grad_norm": 0.07773671299219131, "learning_rate": 3.2681032347933537e-06, "loss": 0.6666, "step": 1687 }, { "epoch": 3.6697472139168252, "grad_norm": 0.06808001548051834, "learning_rate": 3.2579870482835117e-06, "loss": 0.6169, "step": 1688 }, { "epoch": 3.6719217178581136, "grad_norm": 0.08082466572523117, "learning_rate": 3.247883495279358e-06, "loss": 0.5433, "step": 1689 }, { "epoch": 3.674096221799402, "grad_norm": 0.07562743127346039, "learning_rate": 3.2377925947134137e-06, "loss": 0.5795, "step": 1690 }, { "epoch": 3.6762707257406904, "grad_norm": 0.06471718102693558, "learning_rate": 3.2277143654944944e-06, "loss": 0.4838, "step": 1691 }, { "epoch": 3.6784452296819787, "grad_norm": 0.07267529517412186, "learning_rate": 3.21764882650766e-06, "loss": 0.5506, "step": 1692 }, { "epoch": 3.680619733623267, "grad_norm": 0.07455439120531082, "learning_rate": 3.2075959966142055e-06, "loss": 0.51, "step": 1693 }, { "epoch": 3.6827942375645555, "grad_norm": 0.07243385910987854, "learning_rate": 3.197555894651596e-06, "loss": 0.7525, "step": 1694 }, { "epoch": 3.684968741505844, "grad_norm": 0.07046650350093842, "learning_rate": 3.1875285394334575e-06, "loss": 0.5229, "step": 1695 }, { "epoch": 3.6871432454471322, "grad_norm": 0.07451114803552628, "learning_rate": 3.17751394974953e-06, "loss": 0.5637, "step": 1696 }, { "epoch": 3.6893177493884206, "grad_norm": 0.07024112343788147, "learning_rate": 3.1675121443656266e-06, "loss": 0.5274, "step": 1697 }, { "epoch": 3.6914922533297094, "grad_norm": 0.07463346421718597, "learning_rate": 3.157523142023604e-06, "loss": 0.5544, "step": 1698 }, { "epoch": 3.6936667572709974, "grad_norm": 0.07062430679798126, "learning_rate": 3.1475469614413346e-06, "loss": 0.5495, "step": 1699 }, { "epoch": 3.695841261212286, "grad_norm": 0.06651204824447632, "learning_rate": 3.1375836213126653e-06, "loss": 0.494, "step": 1700 }, { "epoch": 3.698015765153574, "grad_norm": 0.07253233343362808, "learning_rate": 3.1276331403073733e-06, "loss": 0.5205, "step": 1701 }, { "epoch": 3.700190269094863, "grad_norm": 0.06712613999843597, "learning_rate": 3.117695537071149e-06, "loss": 0.4983, "step": 1702 }, { "epoch": 3.702364773036151, "grad_norm": 0.08063866943120956, "learning_rate": 3.107770830225543e-06, "loss": 0.5301, "step": 1703 }, { "epoch": 3.7045392769774397, "grad_norm": 0.07819189876317978, "learning_rate": 3.0978590383679476e-06, "loss": 0.5487, "step": 1704 }, { "epoch": 3.706713780918728, "grad_norm": 0.07367454469203949, "learning_rate": 3.087960180071553e-06, "loss": 0.4528, "step": 1705 }, { "epoch": 3.7088882848600164, "grad_norm": 0.07035141438245773, "learning_rate": 3.078074273885312e-06, "loss": 0.5687, "step": 1706 }, { "epoch": 3.711062788801305, "grad_norm": 0.06731324642896652, "learning_rate": 3.0682013383339026e-06, "loss": 0.5475, "step": 1707 }, { "epoch": 3.713237292742593, "grad_norm": 0.0657249391078949, "learning_rate": 3.0583413919177063e-06, "loss": 0.5508, "step": 1708 }, { "epoch": 3.7154117966838816, "grad_norm": 0.07525421679019928, "learning_rate": 3.048494453112765e-06, "loss": 0.5472, "step": 1709 }, { "epoch": 3.71758630062517, "grad_norm": 0.07167118787765503, "learning_rate": 3.0386605403707347e-06, "loss": 0.5643, "step": 1710 }, { "epoch": 3.7197608045664583, "grad_norm": 0.06669241935014725, "learning_rate": 3.0288396721188786e-06, "loss": 0.4601, "step": 1711 }, { "epoch": 3.7219353085077467, "grad_norm": 0.06917154043912888, "learning_rate": 3.0190318667600003e-06, "loss": 0.5071, "step": 1712 }, { "epoch": 3.724109812449035, "grad_norm": 0.07591681182384491, "learning_rate": 3.0092371426724397e-06, "loss": 0.5349, "step": 1713 }, { "epoch": 3.7262843163903234, "grad_norm": 0.06457411497831345, "learning_rate": 2.999455518210018e-06, "loss": 0.594, "step": 1714 }, { "epoch": 3.728458820331612, "grad_norm": 0.06983805447816849, "learning_rate": 2.9896870117020073e-06, "loss": 0.4973, "step": 1715 }, { "epoch": 3.7306333242729, "grad_norm": 0.08502628654241562, "learning_rate": 2.979931641453104e-06, "loss": 0.5753, "step": 1716 }, { "epoch": 3.7328078282141886, "grad_norm": 0.0713811069726944, "learning_rate": 2.970189425743383e-06, "loss": 0.5497, "step": 1717 }, { "epoch": 3.734982332155477, "grad_norm": 0.07074355334043503, "learning_rate": 2.9604603828282753e-06, "loss": 0.4959, "step": 1718 }, { "epoch": 3.7371568360967653, "grad_norm": 0.06658228486776352, "learning_rate": 2.95074453093853e-06, "loss": 0.4888, "step": 1719 }, { "epoch": 3.7393313400380537, "grad_norm": 0.061382051557302475, "learning_rate": 2.9410418882801682e-06, "loss": 0.4196, "step": 1720 }, { "epoch": 3.741505843979342, "grad_norm": 0.07730218768119812, "learning_rate": 2.9313524730344644e-06, "loss": 0.6339, "step": 1721 }, { "epoch": 3.7436803479206304, "grad_norm": 0.069411501288414, "learning_rate": 2.92167630335791e-06, "loss": 0.5303, "step": 1722 }, { "epoch": 3.7458548518619192, "grad_norm": 0.07142267376184464, "learning_rate": 2.9120133973821762e-06, "loss": 0.4916, "step": 1723 }, { "epoch": 3.748029355803207, "grad_norm": 0.06808903068304062, "learning_rate": 2.902363773214072e-06, "loss": 0.5114, "step": 1724 }, { "epoch": 3.750203859744496, "grad_norm": 0.08015533536672592, "learning_rate": 2.8927274489355296e-06, "loss": 0.5861, "step": 1725 }, { "epoch": 3.752378363685784, "grad_norm": 0.0726238563656807, "learning_rate": 2.883104442603547e-06, "loss": 0.5695, "step": 1726 }, { "epoch": 3.7545528676270727, "grad_norm": 0.07819824665784836, "learning_rate": 2.873494772250176e-06, "loss": 0.6058, "step": 1727 }, { "epoch": 3.756727371568361, "grad_norm": 0.06799522042274475, "learning_rate": 2.8638984558824777e-06, "loss": 0.5952, "step": 1728 }, { "epoch": 3.7589018755096495, "grad_norm": 0.0610821507871151, "learning_rate": 2.8543155114824873e-06, "loss": 0.4343, "step": 1729 }, { "epoch": 3.761076379450938, "grad_norm": 0.07584638893604279, "learning_rate": 2.844745957007178e-06, "loss": 0.4831, "step": 1730 }, { "epoch": 3.7632508833922262, "grad_norm": 0.0664713978767395, "learning_rate": 2.8351898103884413e-06, "loss": 0.4227, "step": 1731 }, { "epoch": 3.7654253873335146, "grad_norm": 0.06976176053285599, "learning_rate": 2.8256470895330447e-06, "loss": 0.5436, "step": 1732 }, { "epoch": 3.767599891274803, "grad_norm": 0.07358802855014801, "learning_rate": 2.8161178123225873e-06, "loss": 0.5634, "step": 1733 }, { "epoch": 3.7697743952160914, "grad_norm": 0.08484029769897461, "learning_rate": 2.8066019966134907e-06, "loss": 0.6033, "step": 1734 }, { "epoch": 3.7719488991573797, "grad_norm": 0.07865934073925018, "learning_rate": 2.797099660236937e-06, "loss": 0.6154, "step": 1735 }, { "epoch": 3.774123403098668, "grad_norm": 0.0672420933842659, "learning_rate": 2.7876108209988616e-06, "loss": 0.5501, "step": 1736 }, { "epoch": 3.7762979070399565, "grad_norm": 0.06630543619394302, "learning_rate": 2.778135496679908e-06, "loss": 0.4751, "step": 1737 }, { "epoch": 3.778472410981245, "grad_norm": 0.07948421686887741, "learning_rate": 2.768673705035384e-06, "loss": 0.4987, "step": 1738 }, { "epoch": 3.7806469149225332, "grad_norm": 0.06351299583911896, "learning_rate": 2.7592254637952533e-06, "loss": 0.6002, "step": 1739 }, { "epoch": 3.7828214188638216, "grad_norm": 0.06591910868883133, "learning_rate": 2.749790790664074e-06, "loss": 0.5531, "step": 1740 }, { "epoch": 3.78499592280511, "grad_norm": 0.06655463576316833, "learning_rate": 2.7403697033209907e-06, "loss": 0.6152, "step": 1741 }, { "epoch": 3.7871704267463984, "grad_norm": 0.07367121428251266, "learning_rate": 2.7309622194196905e-06, "loss": 0.5623, "step": 1742 }, { "epoch": 3.7893449306876867, "grad_norm": 0.06732293218374252, "learning_rate": 2.721568356588362e-06, "loss": 0.5196, "step": 1743 }, { "epoch": 3.791519434628975, "grad_norm": 0.06692995131015778, "learning_rate": 2.712188132429671e-06, "loss": 0.529, "step": 1744 }, { "epoch": 3.7936939385702635, "grad_norm": 0.07911719381809235, "learning_rate": 2.702821564520732e-06, "loss": 0.6498, "step": 1745 }, { "epoch": 3.7958684425115523, "grad_norm": 0.06963081657886505, "learning_rate": 2.6934686704130698e-06, "loss": 0.552, "step": 1746 }, { "epoch": 3.7980429464528402, "grad_norm": 0.0674765333533287, "learning_rate": 2.684129467632579e-06, "loss": 0.4721, "step": 1747 }, { "epoch": 3.800217450394129, "grad_norm": 0.0657779648900032, "learning_rate": 2.6748039736795086e-06, "loss": 0.4862, "step": 1748 }, { "epoch": 3.802391954335417, "grad_norm": 0.07589337229728699, "learning_rate": 2.6654922060284074e-06, "loss": 0.6502, "step": 1749 }, { "epoch": 3.804566458276706, "grad_norm": 0.09340006113052368, "learning_rate": 2.6561941821281145e-06, "loss": 0.6387, "step": 1750 }, { "epoch": 3.806740962217994, "grad_norm": 0.06813382357358932, "learning_rate": 2.6469099194017144e-06, "loss": 0.5154, "step": 1751 }, { "epoch": 3.8089154661592826, "grad_norm": 0.06691460311412811, "learning_rate": 2.637639435246497e-06, "loss": 0.48, "step": 1752 }, { "epoch": 3.811089970100571, "grad_norm": 0.07078904658555984, "learning_rate": 2.6283827470339375e-06, "loss": 0.4978, "step": 1753 }, { "epoch": 3.8132644740418593, "grad_norm": 0.07327672839164734, "learning_rate": 2.6191398721096605e-06, "loss": 0.6086, "step": 1754 }, { "epoch": 3.8154389779831477, "grad_norm": 0.06401863694190979, "learning_rate": 2.6099108277934105e-06, "loss": 0.477, "step": 1755 }, { "epoch": 3.817613481924436, "grad_norm": 0.06660480052232742, "learning_rate": 2.6006956313790056e-06, "loss": 0.5266, "step": 1756 }, { "epoch": 3.8197879858657244, "grad_norm": 0.07315203547477722, "learning_rate": 2.591494300134325e-06, "loss": 0.5963, "step": 1757 }, { "epoch": 3.821962489807013, "grad_norm": 0.06544604152441025, "learning_rate": 2.5823068513012593e-06, "loss": 0.5314, "step": 1758 }, { "epoch": 3.824136993748301, "grad_norm": 0.06898277997970581, "learning_rate": 2.5731333020956883e-06, "loss": 0.5801, "step": 1759 }, { "epoch": 3.8263114976895896, "grad_norm": 0.06796306371688843, "learning_rate": 2.5639736697074525e-06, "loss": 0.4562, "step": 1760 }, { "epoch": 3.828486001630878, "grad_norm": 0.07054129987955093, "learning_rate": 2.5548279713003e-06, "loss": 0.5773, "step": 1761 }, { "epoch": 3.8306605055721663, "grad_norm": 0.07102838903665543, "learning_rate": 2.545696224011884e-06, "loss": 0.5648, "step": 1762 }, { "epoch": 3.8328350095134547, "grad_norm": 0.07269541174173355, "learning_rate": 2.5365784449537033e-06, "loss": 0.5573, "step": 1763 }, { "epoch": 3.835009513454743, "grad_norm": 0.06210189312696457, "learning_rate": 2.527474651211089e-06, "loss": 0.4722, "step": 1764 }, { "epoch": 3.8371840173960314, "grad_norm": 0.06390105932950974, "learning_rate": 2.518384859843168e-06, "loss": 0.4492, "step": 1765 }, { "epoch": 3.83935852133732, "grad_norm": 0.07029172033071518, "learning_rate": 2.509309087882823e-06, "loss": 0.551, "step": 1766 }, { "epoch": 3.841533025278608, "grad_norm": 0.07960562407970428, "learning_rate": 2.5002473523366643e-06, "loss": 0.6309, "step": 1767 }, { "epoch": 3.8437075292198966, "grad_norm": 0.0710235983133316, "learning_rate": 2.4911996701850083e-06, "loss": 0.4686, "step": 1768 }, { "epoch": 3.8458820331611854, "grad_norm": 0.07544839382171631, "learning_rate": 2.482166058381836e-06, "loss": 0.5914, "step": 1769 }, { "epoch": 3.8480565371024733, "grad_norm": 0.06782782077789307, "learning_rate": 2.4731465338547556e-06, "loss": 0.5075, "step": 1770 }, { "epoch": 3.850231041043762, "grad_norm": 0.07701101154088974, "learning_rate": 2.4641411135049877e-06, "loss": 0.6276, "step": 1771 }, { "epoch": 3.85240554498505, "grad_norm": 0.07802244275808334, "learning_rate": 2.455149814207314e-06, "loss": 0.544, "step": 1772 }, { "epoch": 3.854580048926339, "grad_norm": 0.07010866701602936, "learning_rate": 2.4461726528100615e-06, "loss": 0.7571, "step": 1773 }, { "epoch": 3.856754552867627, "grad_norm": 0.0763280913233757, "learning_rate": 2.437209646135068e-06, "loss": 0.5449, "step": 1774 }, { "epoch": 3.8589290568089156, "grad_norm": 0.06945566833019257, "learning_rate": 2.428260810977641e-06, "loss": 0.4845, "step": 1775 }, { "epoch": 3.861103560750204, "grad_norm": 0.07453656941652298, "learning_rate": 2.4193261641065304e-06, "loss": 0.5424, "step": 1776 }, { "epoch": 3.8632780646914924, "grad_norm": 0.0743182897567749, "learning_rate": 2.4104057222639067e-06, "loss": 0.567, "step": 1777 }, { "epoch": 3.8654525686327807, "grad_norm": 0.06793923676013947, "learning_rate": 2.401499502165324e-06, "loss": 0.5874, "step": 1778 }, { "epoch": 3.867627072574069, "grad_norm": 0.06789164245128632, "learning_rate": 2.392607520499677e-06, "loss": 0.508, "step": 1779 }, { "epoch": 3.8698015765153575, "grad_norm": 0.07682343572378159, "learning_rate": 2.3837297939291893e-06, "loss": 0.5371, "step": 1780 }, { "epoch": 3.871976080456646, "grad_norm": 0.07278100401163101, "learning_rate": 2.3748663390893644e-06, "loss": 0.5716, "step": 1781 }, { "epoch": 3.8741505843979342, "grad_norm": 0.06296555697917938, "learning_rate": 2.3660171725889703e-06, "loss": 0.5043, "step": 1782 }, { "epoch": 3.8763250883392226, "grad_norm": 0.07118958979845047, "learning_rate": 2.3571823110099988e-06, "loss": 0.5332, "step": 1783 }, { "epoch": 3.878499592280511, "grad_norm": 0.07014200091362, "learning_rate": 2.3483617709076312e-06, "loss": 0.5111, "step": 1784 }, { "epoch": 3.8806740962217994, "grad_norm": 0.06942564994096756, "learning_rate": 2.339555568810221e-06, "loss": 0.544, "step": 1785 }, { "epoch": 3.8828486001630877, "grad_norm": 0.07621092349290848, "learning_rate": 2.330763721219246e-06, "loss": 0.5287, "step": 1786 }, { "epoch": 3.885023104104376, "grad_norm": 0.06831474602222443, "learning_rate": 2.32198624460929e-06, "loss": 0.477, "step": 1787 }, { "epoch": 3.8871976080456645, "grad_norm": 0.061214007437229156, "learning_rate": 2.3132231554280138e-06, "loss": 0.4988, "step": 1788 }, { "epoch": 3.889372111986953, "grad_norm": 0.08054604381322861, "learning_rate": 2.3044744700961065e-06, "loss": 0.62, "step": 1789 }, { "epoch": 3.8915466159282412, "grad_norm": 0.07515312731266022, "learning_rate": 2.2957402050072717e-06, "loss": 0.5849, "step": 1790 }, { "epoch": 3.8937211198695296, "grad_norm": 0.06406334042549133, "learning_rate": 2.287020376528193e-06, "loss": 0.4796, "step": 1791 }, { "epoch": 3.895895623810818, "grad_norm": 0.07906486839056015, "learning_rate": 2.2783150009985054e-06, "loss": 0.5931, "step": 1792 }, { "epoch": 3.8980701277521064, "grad_norm": 0.06431597471237183, "learning_rate": 2.2696240947307525e-06, "loss": 0.4808, "step": 1793 }, { "epoch": 3.900244631693395, "grad_norm": 0.06779953837394714, "learning_rate": 2.260947674010372e-06, "loss": 0.4272, "step": 1794 }, { "epoch": 3.902419135634683, "grad_norm": 0.06702446937561035, "learning_rate": 2.252285755095652e-06, "loss": 0.572, "step": 1795 }, { "epoch": 3.904593639575972, "grad_norm": 0.07164770364761353, "learning_rate": 2.2436383542177108e-06, "loss": 0.5218, "step": 1796 }, { "epoch": 3.90676814351726, "grad_norm": 0.07402224093675613, "learning_rate": 2.235005487580466e-06, "loss": 0.4927, "step": 1797 }, { "epoch": 3.9089426474585487, "grad_norm": 0.07265682518482208, "learning_rate": 2.22638717136059e-06, "loss": 0.5991, "step": 1798 }, { "epoch": 3.911117151399837, "grad_norm": 0.07680661231279373, "learning_rate": 2.2177834217074932e-06, "loss": 0.4775, "step": 1799 }, { "epoch": 3.9132916553411254, "grad_norm": 0.0685998946428299, "learning_rate": 2.209194254743295e-06, "loss": 0.542, "step": 1800 }, { "epoch": 3.915466159282414, "grad_norm": 0.06849497556686401, "learning_rate": 2.2006196865627905e-06, "loss": 0.5235, "step": 1801 }, { "epoch": 3.917640663223702, "grad_norm": 0.07466372847557068, "learning_rate": 2.192059733233408e-06, "loss": 0.5455, "step": 1802 }, { "epoch": 3.9198151671649906, "grad_norm": 0.07011424005031586, "learning_rate": 2.183514410795202e-06, "loss": 0.5182, "step": 1803 }, { "epoch": 3.921989671106279, "grad_norm": 0.06741403788328171, "learning_rate": 2.1749837352608005e-06, "loss": 0.5105, "step": 1804 }, { "epoch": 3.9241641750475673, "grad_norm": 0.0761314257979393, "learning_rate": 2.166467722615394e-06, "loss": 0.5384, "step": 1805 }, { "epoch": 3.9263386789888557, "grad_norm": 0.06282424181699753, "learning_rate": 2.1579663888166956e-06, "loss": 0.4929, "step": 1806 }, { "epoch": 3.928513182930144, "grad_norm": 0.0685649961233139, "learning_rate": 2.1494797497949036e-06, "loss": 0.534, "step": 1807 }, { "epoch": 3.9306876868714324, "grad_norm": 0.0830850601196289, "learning_rate": 2.1410078214526953e-06, "loss": 0.5967, "step": 1808 }, { "epoch": 3.932862190812721, "grad_norm": 0.07687776535749435, "learning_rate": 2.132550619665168e-06, "loss": 0.5368, "step": 1809 }, { "epoch": 3.935036694754009, "grad_norm": 0.07222627848386765, "learning_rate": 2.124108160279832e-06, "loss": 0.4286, "step": 1810 }, { "epoch": 3.9372111986952976, "grad_norm": 0.08053463697433472, "learning_rate": 2.1156804591165736e-06, "loss": 0.5677, "step": 1811 }, { "epoch": 3.939385702636586, "grad_norm": 0.06609556078910828, "learning_rate": 2.1072675319676184e-06, "loss": 0.5114, "step": 1812 }, { "epoch": 3.9415602065778743, "grad_norm": 0.07078705728054047, "learning_rate": 2.0988693945975102e-06, "loss": 0.519, "step": 1813 }, { "epoch": 3.9437347105191627, "grad_norm": 0.0681135281920433, "learning_rate": 2.090486062743081e-06, "loss": 0.5823, "step": 1814 }, { "epoch": 3.945909214460451, "grad_norm": 0.06471305340528488, "learning_rate": 2.0821175521134208e-06, "loss": 0.5476, "step": 1815 }, { "epoch": 3.9480837184017394, "grad_norm": 0.06947237253189087, "learning_rate": 2.0737638783898384e-06, "loss": 0.569, "step": 1816 }, { "epoch": 3.9502582223430283, "grad_norm": 0.09378840029239655, "learning_rate": 2.0654250572258528e-06, "loss": 0.5615, "step": 1817 }, { "epoch": 3.952432726284316, "grad_norm": 0.08408486098051071, "learning_rate": 2.05710110424714e-06, "loss": 0.6018, "step": 1818 }, { "epoch": 3.954607230225605, "grad_norm": 0.07226504385471344, "learning_rate": 2.048792035051521e-06, "loss": 0.5716, "step": 1819 }, { "epoch": 3.956781734166893, "grad_norm": 0.07047218829393387, "learning_rate": 2.0404978652089325e-06, "loss": 0.5019, "step": 1820 }, { "epoch": 3.9589562381081818, "grad_norm": 0.07000525295734406, "learning_rate": 2.0322186102613793e-06, "loss": 0.4871, "step": 1821 }, { "epoch": 3.9611307420494697, "grad_norm": 0.07984933257102966, "learning_rate": 2.0239542857229245e-06, "loss": 0.5927, "step": 1822 }, { "epoch": 3.9633052459907585, "grad_norm": 0.07771391421556473, "learning_rate": 2.0157049070796564e-06, "loss": 0.6755, "step": 1823 }, { "epoch": 3.965479749932047, "grad_norm": 0.0712956115603447, "learning_rate": 2.0074704897896556e-06, "loss": 0.5396, "step": 1824 }, { "epoch": 3.9676542538733353, "grad_norm": 0.07874945551156998, "learning_rate": 1.999251049282962e-06, "loss": 0.6836, "step": 1825 }, { "epoch": 3.9698287578146236, "grad_norm": 0.07228855788707733, "learning_rate": 1.9910466009615604e-06, "loss": 0.5425, "step": 1826 }, { "epoch": 3.972003261755912, "grad_norm": 0.07415413856506348, "learning_rate": 1.982857160199334e-06, "loss": 0.5871, "step": 1827 }, { "epoch": 3.9741777656972004, "grad_norm": 0.06711491197347641, "learning_rate": 1.9746827423420478e-06, "loss": 0.5216, "step": 1828 }, { "epoch": 3.9763522696384888, "grad_norm": 0.06881233304738998, "learning_rate": 1.966523362707321e-06, "loss": 0.542, "step": 1829 }, { "epoch": 3.978526773579777, "grad_norm": 0.0708330050110817, "learning_rate": 1.9583790365845823e-06, "loss": 0.5453, "step": 1830 }, { "epoch": 3.9807012775210655, "grad_norm": 0.07197567075490952, "learning_rate": 1.950249779235065e-06, "loss": 0.5874, "step": 1831 }, { "epoch": 3.982875781462354, "grad_norm": 0.073598213493824, "learning_rate": 1.9421356058917528e-06, "loss": 0.4704, "step": 1832 }, { "epoch": 3.9850502854036423, "grad_norm": 0.07013476639986038, "learning_rate": 1.9340365317593744e-06, "loss": 0.5471, "step": 1833 }, { "epoch": 3.9872247893449306, "grad_norm": 0.07178087532520294, "learning_rate": 1.9259525720143646e-06, "loss": 0.5096, "step": 1834 }, { "epoch": 3.989399293286219, "grad_norm": 0.09097673743963242, "learning_rate": 1.917883741804829e-06, "loss": 0.517, "step": 1835 }, { "epoch": 3.9915737972275074, "grad_norm": 0.07236514985561371, "learning_rate": 1.9098300562505266e-06, "loss": 0.5691, "step": 1836 }, { "epoch": 3.9937483011687958, "grad_norm": 0.07668840885162354, "learning_rate": 1.901791530442838e-06, "loss": 0.6515, "step": 1837 }, { "epoch": 3.995922805110084, "grad_norm": 0.07180533558130264, "learning_rate": 1.8937681794447404e-06, "loss": 0.4562, "step": 1838 }, { "epoch": 3.9980973090513725, "grad_norm": 0.0725080668926239, "learning_rate": 1.8857600182907676e-06, "loss": 0.5681, "step": 1839 }, { "epoch": 4.0, "grad_norm": 0.07456816732883453, "learning_rate": 1.877767061986997e-06, "loss": 0.4672, "step": 1840 }, { "epoch": 4.0, "eval_loss": 0.5494338274002075, "eval_runtime": 13.8638, "eval_samples_per_second": 5.338, "eval_steps_per_second": 5.338, "step": 1840 }, { "epoch": 4.002174503941289, "grad_norm": 0.06814319640398026, "learning_rate": 1.8697893255110088e-06, "loss": 0.5601, "step": 1841 }, { "epoch": 4.004349007882577, "grad_norm": 0.06645607203245163, "learning_rate": 1.8618268238118674e-06, "loss": 0.5667, "step": 1842 }, { "epoch": 4.006523511823866, "grad_norm": 0.06618447601795197, "learning_rate": 1.853879571810091e-06, "loss": 0.4843, "step": 1843 }, { "epoch": 4.0086980157651535, "grad_norm": 0.06761840730905533, "learning_rate": 1.8459475843976193e-06, "loss": 0.4997, "step": 1844 }, { "epoch": 4.010872519706442, "grad_norm": 0.0698138028383255, "learning_rate": 1.8380308764377841e-06, "loss": 0.5577, "step": 1845 }, { "epoch": 4.01304702364773, "grad_norm": 0.06386274844408035, "learning_rate": 1.8301294627652943e-06, "loss": 0.4672, "step": 1846 }, { "epoch": 4.015221527589019, "grad_norm": 0.09613420069217682, "learning_rate": 1.8222433581861986e-06, "loss": 0.5956, "step": 1847 }, { "epoch": 4.017396031530307, "grad_norm": 0.06942781060934067, "learning_rate": 1.814372577477851e-06, "loss": 0.4983, "step": 1848 }, { "epoch": 4.019570535471596, "grad_norm": 0.07094738632440567, "learning_rate": 1.8065171353889e-06, "loss": 0.489, "step": 1849 }, { "epoch": 4.021745039412884, "grad_norm": 0.06631860882043839, "learning_rate": 1.7986770466392445e-06, "loss": 0.5642, "step": 1850 }, { "epoch": 4.023919543354173, "grad_norm": 0.07038282603025436, "learning_rate": 1.7908523259200195e-06, "loss": 0.4959, "step": 1851 }, { "epoch": 4.0260940472954605, "grad_norm": 0.06894353777170181, "learning_rate": 1.7830429878935618e-06, "loss": 0.5011, "step": 1852 }, { "epoch": 4.028268551236749, "grad_norm": 0.0667705163359642, "learning_rate": 1.7752490471933769e-06, "loss": 0.4485, "step": 1853 }, { "epoch": 4.030443055178037, "grad_norm": 0.08019091188907623, "learning_rate": 1.767470518424129e-06, "loss": 0.6615, "step": 1854 }, { "epoch": 4.032617559119326, "grad_norm": 0.08734645694494247, "learning_rate": 1.75970741616159e-06, "loss": 0.6365, "step": 1855 }, { "epoch": 4.034792063060614, "grad_norm": 0.0614079087972641, "learning_rate": 1.7519597549526347e-06, "loss": 0.4914, "step": 1856 }, { "epoch": 4.036966567001903, "grad_norm": 0.07433779537677765, "learning_rate": 1.7442275493152039e-06, "loss": 0.5131, "step": 1857 }, { "epoch": 4.039141070943191, "grad_norm": 0.06433860212564468, "learning_rate": 1.7365108137382692e-06, "loss": 0.446, "step": 1858 }, { "epoch": 4.04131557488448, "grad_norm": 0.07831351459026337, "learning_rate": 1.7288095626818169e-06, "loss": 0.5007, "step": 1859 }, { "epoch": 4.0434900788257675, "grad_norm": 0.06625545769929886, "learning_rate": 1.7211238105768213e-06, "loss": 0.4865, "step": 1860 }, { "epoch": 4.045664582767056, "grad_norm": 0.08103837072849274, "learning_rate": 1.7134535718252142e-06, "loss": 0.6024, "step": 1861 }, { "epoch": 4.047839086708344, "grad_norm": 0.07211139053106308, "learning_rate": 1.7057988607998487e-06, "loss": 0.5114, "step": 1862 }, { "epoch": 4.050013590649633, "grad_norm": 0.08600890636444092, "learning_rate": 1.6981596918444953e-06, "loss": 0.5867, "step": 1863 }, { "epoch": 4.052188094590922, "grad_norm": 0.06723609566688538, "learning_rate": 1.6905360792737857e-06, "loss": 0.5504, "step": 1864 }, { "epoch": 4.05436259853221, "grad_norm": 0.06886590272188187, "learning_rate": 1.6829280373732126e-06, "loss": 0.538, "step": 1865 }, { "epoch": 4.056537102473499, "grad_norm": 0.06821097433567047, "learning_rate": 1.6753355803990912e-06, "loss": 0.5226, "step": 1866 }, { "epoch": 4.058711606414787, "grad_norm": 0.06816242635250092, "learning_rate": 1.6677587225785264e-06, "loss": 0.5037, "step": 1867 }, { "epoch": 4.060886110356075, "grad_norm": 0.07124412059783936, "learning_rate": 1.6601974781093943e-06, "loss": 0.5423, "step": 1868 }, { "epoch": 4.063060614297363, "grad_norm": 0.0751076489686966, "learning_rate": 1.6526518611603182e-06, "loss": 0.4986, "step": 1869 }, { "epoch": 4.065235118238652, "grad_norm": 0.0749116763472557, "learning_rate": 1.6451218858706374e-06, "loss": 0.5209, "step": 1870 }, { "epoch": 4.06740962217994, "grad_norm": 0.0729384794831276, "learning_rate": 1.6376075663503732e-06, "loss": 0.5356, "step": 1871 }, { "epoch": 4.069584126121229, "grad_norm": 0.07148271799087524, "learning_rate": 1.6301089166802232e-06, "loss": 0.541, "step": 1872 }, { "epoch": 4.071758630062517, "grad_norm": 0.06868808716535568, "learning_rate": 1.6226259509115083e-06, "loss": 0.6125, "step": 1873 }, { "epoch": 4.073933134003806, "grad_norm": 0.06676565110683441, "learning_rate": 1.6151586830661704e-06, "loss": 0.5028, "step": 1874 }, { "epoch": 4.076107637945094, "grad_norm": 0.0644635483622551, "learning_rate": 1.607707127136734e-06, "loss": 0.4954, "step": 1875 }, { "epoch": 4.078282141886382, "grad_norm": 0.06508057564496994, "learning_rate": 1.600271297086279e-06, "loss": 0.514, "step": 1876 }, { "epoch": 4.08045664582767, "grad_norm": 0.06991713494062424, "learning_rate": 1.5928512068484158e-06, "loss": 0.5257, "step": 1877 }, { "epoch": 4.082631149768959, "grad_norm": 0.06579417735338211, "learning_rate": 1.5854468703272663e-06, "loss": 0.5114, "step": 1878 }, { "epoch": 4.084805653710247, "grad_norm": 0.07120195031166077, "learning_rate": 1.5780583013974294e-06, "loss": 0.6072, "step": 1879 }, { "epoch": 4.086980157651536, "grad_norm": 0.07006710022687912, "learning_rate": 1.57068551390396e-06, "loss": 0.4737, "step": 1880 }, { "epoch": 4.089154661592824, "grad_norm": 0.07203473895788193, "learning_rate": 1.5633285216623384e-06, "loss": 0.6901, "step": 1881 }, { "epoch": 4.091329165534113, "grad_norm": 0.07875218987464905, "learning_rate": 1.5559873384584445e-06, "loss": 0.5215, "step": 1882 }, { "epoch": 4.093503669475401, "grad_norm": 0.06638798862695694, "learning_rate": 1.548661978048539e-06, "loss": 0.545, "step": 1883 }, { "epoch": 4.095678173416689, "grad_norm": 0.06834139674901962, "learning_rate": 1.5413524541592372e-06, "loss": 0.4953, "step": 1884 }, { "epoch": 4.097852677357977, "grad_norm": 0.07506418973207474, "learning_rate": 1.5340587804874662e-06, "loss": 0.5581, "step": 1885 }, { "epoch": 4.100027181299266, "grad_norm": 0.07798907905817032, "learning_rate": 1.5267809707004665e-06, "loss": 0.5574, "step": 1886 }, { "epoch": 4.102201685240554, "grad_norm": 0.07272151112556458, "learning_rate": 1.5195190384357405e-06, "loss": 0.5808, "step": 1887 }, { "epoch": 4.104376189181843, "grad_norm": 0.06265170127153397, "learning_rate": 1.5122729973010454e-06, "loss": 0.4413, "step": 1888 }, { "epoch": 4.106550693123132, "grad_norm": 0.0679401382803917, "learning_rate": 1.5050428608743606e-06, "loss": 0.6279, "step": 1889 }, { "epoch": 4.10872519706442, "grad_norm": 0.07039714604616165, "learning_rate": 1.4978286427038602e-06, "loss": 0.4881, "step": 1890 }, { "epoch": 4.1108997010057085, "grad_norm": 0.06845873594284058, "learning_rate": 1.4906303563078871e-06, "loss": 0.4997, "step": 1891 }, { "epoch": 4.113074204946996, "grad_norm": 0.07568681985139847, "learning_rate": 1.4834480151749365e-06, "loss": 0.7321, "step": 1892 }, { "epoch": 4.115248708888285, "grad_norm": 0.06754462420940399, "learning_rate": 1.4762816327636242e-06, "loss": 0.4583, "step": 1893 }, { "epoch": 4.117423212829573, "grad_norm": 0.06954129040241241, "learning_rate": 1.4691312225026554e-06, "loss": 0.4918, "step": 1894 }, { "epoch": 4.119597716770862, "grad_norm": 0.07597080618143082, "learning_rate": 1.4619967977908157e-06, "loss": 0.496, "step": 1895 }, { "epoch": 4.12177222071215, "grad_norm": 0.06444435566663742, "learning_rate": 1.454878371996924e-06, "loss": 0.5194, "step": 1896 }, { "epoch": 4.123946724653439, "grad_norm": 0.07335682213306427, "learning_rate": 1.4477759584598294e-06, "loss": 0.5142, "step": 1897 }, { "epoch": 4.126121228594727, "grad_norm": 0.06388718634843826, "learning_rate": 1.440689570488376e-06, "loss": 0.5322, "step": 1898 }, { "epoch": 4.1282957325360154, "grad_norm": 0.06969299912452698, "learning_rate": 1.4336192213613742e-06, "loss": 0.5629, "step": 1899 }, { "epoch": 4.130470236477303, "grad_norm": 0.06335947662591934, "learning_rate": 1.4265649243275782e-06, "loss": 0.512, "step": 1900 }, { "epoch": 4.132644740418592, "grad_norm": 0.07037252187728882, "learning_rate": 1.4195266926056694e-06, "loss": 0.5814, "step": 1901 }, { "epoch": 4.13481924435988, "grad_norm": 0.06880515068769455, "learning_rate": 1.4125045393842219e-06, "loss": 0.4584, "step": 1902 }, { "epoch": 4.136993748301169, "grad_norm": 0.07320540398359299, "learning_rate": 1.405498477821685e-06, "loss": 0.5326, "step": 1903 }, { "epoch": 4.139168252242457, "grad_norm": 0.06588432192802429, "learning_rate": 1.3985085210463479e-06, "loss": 0.5181, "step": 1904 }, { "epoch": 4.141342756183746, "grad_norm": 0.0709570124745369, "learning_rate": 1.3915346821563235e-06, "loss": 0.5264, "step": 1905 }, { "epoch": 4.143517260125034, "grad_norm": 0.06728381663560867, "learning_rate": 1.384576974219526e-06, "loss": 0.4719, "step": 1906 }, { "epoch": 4.1456917640663224, "grad_norm": 0.07169146090745926, "learning_rate": 1.3776354102736423e-06, "loss": 0.5003, "step": 1907 }, { "epoch": 4.14786626800761, "grad_norm": 0.06266964972019196, "learning_rate": 1.3707100033261035e-06, "loss": 0.4523, "step": 1908 }, { "epoch": 4.150040771948899, "grad_norm": 0.06978844106197357, "learning_rate": 1.3638007663540697e-06, "loss": 0.5555, "step": 1909 }, { "epoch": 4.152215275890187, "grad_norm": 0.061596207320690155, "learning_rate": 1.3569077123043973e-06, "loss": 0.4213, "step": 1910 }, { "epoch": 4.154389779831476, "grad_norm": 0.07783644646406174, "learning_rate": 1.3500308540936203e-06, "loss": 0.6093, "step": 1911 }, { "epoch": 4.156564283772765, "grad_norm": 0.09196025133132935, "learning_rate": 1.3431702046079276e-06, "loss": 0.6781, "step": 1912 }, { "epoch": 4.158738787714053, "grad_norm": 0.06989976763725281, "learning_rate": 1.336325776703128e-06, "loss": 0.5267, "step": 1913 }, { "epoch": 4.1609132916553415, "grad_norm": 0.09241233766078949, "learning_rate": 1.3294975832046353e-06, "loss": 0.5967, "step": 1914 }, { "epoch": 4.1630877955966294, "grad_norm": 0.07054842263460159, "learning_rate": 1.322685636907447e-06, "loss": 0.5188, "step": 1915 }, { "epoch": 4.165262299537918, "grad_norm": 0.07163341343402863, "learning_rate": 1.3158899505761147e-06, "loss": 0.5593, "step": 1916 }, { "epoch": 4.167436803479206, "grad_norm": 0.06760688126087189, "learning_rate": 1.3091105369447166e-06, "loss": 0.5123, "step": 1917 }, { "epoch": 4.169611307420495, "grad_norm": 0.06981354206800461, "learning_rate": 1.3023474087168453e-06, "loss": 0.4516, "step": 1918 }, { "epoch": 4.171785811361783, "grad_norm": 0.07176142185926437, "learning_rate": 1.2956005785655689e-06, "loss": 0.5262, "step": 1919 }, { "epoch": 4.173960315303072, "grad_norm": 0.0662393644452095, "learning_rate": 1.2888700591334225e-06, "loss": 0.5322, "step": 1920 }, { "epoch": 4.17613481924436, "grad_norm": 0.0860685184597969, "learning_rate": 1.282155863032377e-06, "loss": 0.7697, "step": 1921 }, { "epoch": 4.1783093231856485, "grad_norm": 0.06831642985343933, "learning_rate": 1.2754580028438102e-06, "loss": 0.4985, "step": 1922 }, { "epoch": 4.1804838271269364, "grad_norm": 0.07947570830583572, "learning_rate": 1.2687764911184908e-06, "loss": 0.5813, "step": 1923 }, { "epoch": 4.182658331068225, "grad_norm": 0.0603213757276535, "learning_rate": 1.2621113403765561e-06, "loss": 0.4629, "step": 1924 }, { "epoch": 4.184832835009513, "grad_norm": 0.0684674009680748, "learning_rate": 1.2554625631074846e-06, "loss": 0.4647, "step": 1925 }, { "epoch": 4.187007338950802, "grad_norm": 0.06759068369865417, "learning_rate": 1.2488301717700735e-06, "loss": 0.4942, "step": 1926 }, { "epoch": 4.18918184289209, "grad_norm": 0.07188436388969421, "learning_rate": 1.2422141787924136e-06, "loss": 0.4771, "step": 1927 }, { "epoch": 4.191356346833379, "grad_norm": 0.07443997263908386, "learning_rate": 1.2356145965718646e-06, "loss": 0.5795, "step": 1928 }, { "epoch": 4.193530850774667, "grad_norm": 0.07631922513246536, "learning_rate": 1.2290314374750423e-06, "loss": 0.6142, "step": 1929 }, { "epoch": 4.1957053547159555, "grad_norm": 0.06583473831415176, "learning_rate": 1.2224647138377854e-06, "loss": 0.524, "step": 1930 }, { "epoch": 4.1978798586572434, "grad_norm": 0.07295595854520798, "learning_rate": 1.21591443796513e-06, "loss": 0.5212, "step": 1931 }, { "epoch": 4.200054362598532, "grad_norm": 0.07355212420225143, "learning_rate": 1.209380622131301e-06, "loss": 0.4975, "step": 1932 }, { "epoch": 4.20222886653982, "grad_norm": 0.07224130630493164, "learning_rate": 1.202863278579669e-06, "loss": 0.6124, "step": 1933 }, { "epoch": 4.204403370481109, "grad_norm": 0.06853799521923065, "learning_rate": 1.1963624195227464e-06, "loss": 0.5083, "step": 1934 }, { "epoch": 4.206577874422397, "grad_norm": 0.06510897725820541, "learning_rate": 1.1898780571421554e-06, "loss": 0.4937, "step": 1935 }, { "epoch": 4.208752378363686, "grad_norm": 0.06679749488830566, "learning_rate": 1.183410203588601e-06, "loss": 0.4623, "step": 1936 }, { "epoch": 4.210926882304975, "grad_norm": 0.08363625407218933, "learning_rate": 1.1769588709818535e-06, "loss": 0.5029, "step": 1937 }, { "epoch": 4.2131013862462625, "grad_norm": 0.06147165969014168, "learning_rate": 1.1705240714107301e-06, "loss": 0.458, "step": 1938 }, { "epoch": 4.215275890187551, "grad_norm": 0.06801257282495499, "learning_rate": 1.1641058169330688e-06, "loss": 0.5042, "step": 1939 }, { "epoch": 4.217450394128839, "grad_norm": 0.0713082104921341, "learning_rate": 1.1577041195756954e-06, "loss": 0.4897, "step": 1940 }, { "epoch": 4.219624898070128, "grad_norm": 0.0625024288892746, "learning_rate": 1.1513189913344214e-06, "loss": 0.4717, "step": 1941 }, { "epoch": 4.221799402011416, "grad_norm": 0.07821156829595566, "learning_rate": 1.1449504441740022e-06, "loss": 0.7098, "step": 1942 }, { "epoch": 4.223973905952705, "grad_norm": 0.0768701359629631, "learning_rate": 1.138598490028121e-06, "loss": 0.5489, "step": 1943 }, { "epoch": 4.226148409893993, "grad_norm": 0.08276499062776566, "learning_rate": 1.132263140799381e-06, "loss": 0.4909, "step": 1944 }, { "epoch": 4.228322913835282, "grad_norm": 0.06984122842550278, "learning_rate": 1.1259444083592585e-06, "loss": 0.5314, "step": 1945 }, { "epoch": 4.2304974177765695, "grad_norm": 0.06368063390254974, "learning_rate": 1.1196423045480942e-06, "loss": 0.4914, "step": 1946 }, { "epoch": 4.232671921717858, "grad_norm": 0.07892414182424545, "learning_rate": 1.1133568411750729e-06, "loss": 0.6499, "step": 1947 }, { "epoch": 4.234846425659146, "grad_norm": 0.06923570483922958, "learning_rate": 1.1070880300181963e-06, "loss": 0.5284, "step": 1948 }, { "epoch": 4.237020929600435, "grad_norm": 0.06931766867637634, "learning_rate": 1.1008358828242595e-06, "loss": 0.569, "step": 1949 }, { "epoch": 4.239195433541723, "grad_norm": 0.07106120139360428, "learning_rate": 1.0946004113088381e-06, "loss": 0.5564, "step": 1950 }, { "epoch": 4.241369937483012, "grad_norm": 0.06361215561628342, "learning_rate": 1.088381627156253e-06, "loss": 0.445, "step": 1951 }, { "epoch": 4.2435444414243, "grad_norm": 0.07185090333223343, "learning_rate": 1.0821795420195592e-06, "loss": 0.6468, "step": 1952 }, { "epoch": 4.245718945365589, "grad_norm": 0.07368568331003189, "learning_rate": 1.0759941675205221e-06, "loss": 0.495, "step": 1953 }, { "epoch": 4.2478934493068765, "grad_norm": 0.06581132113933563, "learning_rate": 1.0698255152495895e-06, "loss": 0.6441, "step": 1954 }, { "epoch": 4.250067953248165, "grad_norm": 0.06936019659042358, "learning_rate": 1.0636735967658785e-06, "loss": 0.5955, "step": 1955 }, { "epoch": 4.252242457189453, "grad_norm": 0.06856555491685867, "learning_rate": 1.0575384235971463e-06, "loss": 0.4753, "step": 1956 }, { "epoch": 4.254416961130742, "grad_norm": 0.06685930490493774, "learning_rate": 1.0514200072397695e-06, "loss": 0.4758, "step": 1957 }, { "epoch": 4.256591465072031, "grad_norm": 0.08133382350206375, "learning_rate": 1.045318359158737e-06, "loss": 0.5222, "step": 1958 }, { "epoch": 4.258765969013319, "grad_norm": 0.08194360136985779, "learning_rate": 1.0392334907876022e-06, "loss": 0.6832, "step": 1959 }, { "epoch": 4.260940472954607, "grad_norm": 0.07147635519504547, "learning_rate": 1.033165413528483e-06, "loss": 0.6186, "step": 1960 }, { "epoch": 4.263114976895896, "grad_norm": 0.0699101909995079, "learning_rate": 1.0271141387520322e-06, "loss": 0.5217, "step": 1961 }, { "epoch": 4.265289480837184, "grad_norm": 0.06448724865913391, "learning_rate": 1.0210796777974196e-06, "loss": 0.4278, "step": 1962 }, { "epoch": 4.267463984778472, "grad_norm": 0.09461896121501923, "learning_rate": 1.0150620419723022e-06, "loss": 0.506, "step": 1963 }, { "epoch": 4.269638488719761, "grad_norm": 0.0815594494342804, "learning_rate": 1.009061242552818e-06, "loss": 0.5856, "step": 1964 }, { "epoch": 4.271812992661049, "grad_norm": 0.08406014740467072, "learning_rate": 1.0030772907835484e-06, "loss": 0.7076, "step": 1965 }, { "epoch": 4.273987496602338, "grad_norm": 0.06767473369836807, "learning_rate": 9.971101978775056e-07, "loss": 0.4883, "step": 1966 }, { "epoch": 4.276162000543626, "grad_norm": 0.06697352975606918, "learning_rate": 9.911599750161183e-07, "loss": 0.5135, "step": 1967 }, { "epoch": 4.278336504484915, "grad_norm": 0.07337699830532074, "learning_rate": 9.852266333491955e-07, "loss": 0.5832, "step": 1968 }, { "epoch": 4.280511008426203, "grad_norm": 0.06913413852453232, "learning_rate": 9.793101839949148e-07, "loss": 0.497, "step": 1969 }, { "epoch": 4.282685512367491, "grad_norm": 0.07249415665864944, "learning_rate": 9.734106380398022e-07, "loss": 0.4305, "step": 1970 }, { "epoch": 4.284860016308779, "grad_norm": 0.06467798352241516, "learning_rate": 9.675280065387117e-07, "loss": 0.506, "step": 1971 }, { "epoch": 4.287034520250068, "grad_norm": 0.07246431708335876, "learning_rate": 9.616623005147952e-07, "loss": 0.5587, "step": 1972 }, { "epoch": 4.289209024191356, "grad_norm": 0.07415547221899033, "learning_rate": 9.558135309594941e-07, "loss": 0.6007, "step": 1973 }, { "epoch": 4.291383528132645, "grad_norm": 0.07107709348201752, "learning_rate": 9.499817088325103e-07, "loss": 0.558, "step": 1974 }, { "epoch": 4.293558032073933, "grad_norm": 0.0705399364233017, "learning_rate": 9.441668450617924e-07, "loss": 0.545, "step": 1975 }, { "epoch": 4.295732536015222, "grad_norm": 0.07318824529647827, "learning_rate": 9.383689505435112e-07, "loss": 0.5828, "step": 1976 }, { "epoch": 4.29790703995651, "grad_norm": 0.07265083491802216, "learning_rate": 9.325880361420336e-07, "loss": 0.5302, "step": 1977 }, { "epoch": 4.300081543897798, "grad_norm": 0.07368820160627365, "learning_rate": 9.26824112689918e-07, "loss": 0.5033, "step": 1978 }, { "epoch": 4.302256047839086, "grad_norm": 0.06669095903635025, "learning_rate": 9.210771909878769e-07, "loss": 0.5036, "step": 1979 }, { "epoch": 4.304430551780375, "grad_norm": 0.06157700717449188, "learning_rate": 9.153472818047627e-07, "loss": 0.5461, "step": 1980 }, { "epoch": 4.306605055721663, "grad_norm": 0.060206010937690735, "learning_rate": 9.0963439587756e-07, "loss": 0.4659, "step": 1981 }, { "epoch": 4.308779559662952, "grad_norm": 0.08186089247465134, "learning_rate": 9.039385439113435e-07, "loss": 0.6162, "step": 1982 }, { "epoch": 4.310954063604241, "grad_norm": 0.06879046559333801, "learning_rate": 8.982597365792711e-07, "loss": 0.5037, "step": 1983 }, { "epoch": 4.313128567545529, "grad_norm": 0.07082629203796387, "learning_rate": 8.925979845225641e-07, "loss": 0.4884, "step": 1984 }, { "epoch": 4.3153030714868175, "grad_norm": 0.07803993672132492, "learning_rate": 8.869532983504859e-07, "loss": 0.5432, "step": 1985 }, { "epoch": 4.317477575428105, "grad_norm": 0.06588912010192871, "learning_rate": 8.813256886403165e-07, "loss": 0.5262, "step": 1986 }, { "epoch": 4.319652079369394, "grad_norm": 0.06609542667865753, "learning_rate": 8.757151659373408e-07, "loss": 0.535, "step": 1987 }, { "epoch": 4.321826583310682, "grad_norm": 0.07395874708890915, "learning_rate": 8.701217407548245e-07, "loss": 0.5925, "step": 1988 }, { "epoch": 4.324001087251971, "grad_norm": 0.06963888555765152, "learning_rate": 8.645454235739903e-07, "loss": 0.5796, "step": 1989 }, { "epoch": 4.326175591193259, "grad_norm": 0.0705757588148117, "learning_rate": 8.58986224844014e-07, "loss": 0.5032, "step": 1990 }, { "epoch": 4.328350095134548, "grad_norm": 0.07324569672346115, "learning_rate": 8.534441549819849e-07, "loss": 0.4928, "step": 1991 }, { "epoch": 4.330524599075836, "grad_norm": 0.06855554133653641, "learning_rate": 8.479192243728962e-07, "loss": 0.5318, "step": 1992 }, { "epoch": 4.3326991030171245, "grad_norm": 0.06802328675985336, "learning_rate": 8.424114433696296e-07, "loss": 0.4993, "step": 1993 }, { "epoch": 4.334873606958412, "grad_norm": 0.06989164650440216, "learning_rate": 8.369208222929248e-07, "loss": 0.5864, "step": 1994 }, { "epoch": 4.337048110899701, "grad_norm": 0.06718356907367706, "learning_rate": 8.31447371431372e-07, "loss": 0.5387, "step": 1995 }, { "epoch": 4.339222614840989, "grad_norm": 0.06938335299491882, "learning_rate": 8.259911010413846e-07, "loss": 0.4848, "step": 1996 }, { "epoch": 4.341397118782278, "grad_norm": 0.07358980178833008, "learning_rate": 8.205520213471808e-07, "loss": 0.4676, "step": 1997 }, { "epoch": 4.343571622723566, "grad_norm": 0.0745391845703125, "learning_rate": 8.151301425407699e-07, "loss": 0.4489, "step": 1998 }, { "epoch": 4.345746126664855, "grad_norm": 0.07397934049367905, "learning_rate": 8.097254747819272e-07, "loss": 0.6033, "step": 1999 }, { "epoch": 4.347920630606143, "grad_norm": 0.06908058375120163, "learning_rate": 8.043380281981739e-07, "loss": 0.6221, "step": 2000 }, { "epoch": 4.3500951345474315, "grad_norm": 0.07788637280464172, "learning_rate": 7.9896781288477e-07, "loss": 0.4631, "step": 2001 }, { "epoch": 4.352269638488719, "grad_norm": 0.0669802576303482, "learning_rate": 7.936148389046772e-07, "loss": 0.5119, "step": 2002 }, { "epoch": 4.354444142430008, "grad_norm": 0.07455005496740341, "learning_rate": 7.882791162885505e-07, "loss": 0.5543, "step": 2003 }, { "epoch": 4.356618646371296, "grad_norm": 0.06506702303886414, "learning_rate": 7.829606550347313e-07, "loss": 0.4767, "step": 2004 }, { "epoch": 4.358793150312585, "grad_norm": 0.0729403868317604, "learning_rate": 7.776594651091995e-07, "loss": 0.5132, "step": 2005 }, { "epoch": 4.360967654253873, "grad_norm": 0.07717705518007278, "learning_rate": 7.723755564455771e-07, "loss": 0.5808, "step": 2006 }, { "epoch": 4.363142158195162, "grad_norm": 0.08073990046977997, "learning_rate": 7.671089389451059e-07, "loss": 0.6565, "step": 2007 }, { "epoch": 4.3653166621364505, "grad_norm": 0.06128135323524475, "learning_rate": 7.618596224766283e-07, "loss": 0.3988, "step": 2008 }, { "epoch": 4.3674911660777385, "grad_norm": 0.0795668289065361, "learning_rate": 7.566276168765585e-07, "loss": 0.536, "step": 2009 }, { "epoch": 4.369665670019027, "grad_norm": 0.07770153880119324, "learning_rate": 7.514129319488839e-07, "loss": 0.4908, "step": 2010 }, { "epoch": 4.371840173960315, "grad_norm": 0.07897308468818665, "learning_rate": 7.462155774651258e-07, "loss": 0.5445, "step": 2011 }, { "epoch": 4.374014677901604, "grad_norm": 0.06489072740077972, "learning_rate": 7.41035563164334e-07, "loss": 0.4931, "step": 2012 }, { "epoch": 4.376189181842892, "grad_norm": 0.07399582862854004, "learning_rate": 7.358728987530728e-07, "loss": 0.6113, "step": 2013 }, { "epoch": 4.378363685784181, "grad_norm": 0.06987323611974716, "learning_rate": 7.30727593905386e-07, "loss": 0.4726, "step": 2014 }, { "epoch": 4.380538189725469, "grad_norm": 0.07739045470952988, "learning_rate": 7.255996582627878e-07, "loss": 0.5777, "step": 2015 }, { "epoch": 4.3827126936667575, "grad_norm": 0.06581728160381317, "learning_rate": 7.204891014342552e-07, "loss": 0.4841, "step": 2016 }, { "epoch": 4.3848871976080455, "grad_norm": 0.08110526204109192, "learning_rate": 7.15395932996188e-07, "loss": 0.5832, "step": 2017 }, { "epoch": 4.387061701549334, "grad_norm": 0.07114668935537338, "learning_rate": 7.103201624924105e-07, "loss": 0.512, "step": 2018 }, { "epoch": 4.389236205490622, "grad_norm": 0.07365507632493973, "learning_rate": 7.052617994341449e-07, "loss": 0.5595, "step": 2019 }, { "epoch": 4.391410709431911, "grad_norm": 0.07155247777700424, "learning_rate": 7.002208532999933e-07, "loss": 0.5394, "step": 2020 }, { "epoch": 4.393585213373199, "grad_norm": 0.06850498914718628, "learning_rate": 6.951973335359164e-07, "loss": 0.4869, "step": 2021 }, { "epoch": 4.395759717314488, "grad_norm": 0.08481892198324203, "learning_rate": 6.901912495552332e-07, "loss": 0.6875, "step": 2022 }, { "epoch": 4.397934221255776, "grad_norm": 0.07639505714178085, "learning_rate": 6.852026107385756e-07, "loss": 0.6507, "step": 2023 }, { "epoch": 4.4001087251970645, "grad_norm": 0.07097730785608292, "learning_rate": 6.802314264338994e-07, "loss": 0.534, "step": 2024 }, { "epoch": 4.4022832291383525, "grad_norm": 0.06911318004131317, "learning_rate": 6.752777059564431e-07, "loss": 0.6052, "step": 2025 }, { "epoch": 4.404457733079641, "grad_norm": 0.06838548183441162, "learning_rate": 6.703414585887224e-07, "loss": 0.6303, "step": 2026 }, { "epoch": 4.406632237020929, "grad_norm": 0.0710616335272789, "learning_rate": 6.654226935805197e-07, "loss": 0.484, "step": 2027 }, { "epoch": 4.408806740962218, "grad_norm": 0.07024000585079193, "learning_rate": 6.605214201488485e-07, "loss": 0.5073, "step": 2028 }, { "epoch": 4.410981244903507, "grad_norm": 0.06726878136396408, "learning_rate": 6.556376474779469e-07, "loss": 0.5646, "step": 2029 }, { "epoch": 4.413155748844795, "grad_norm": 0.06882317364215851, "learning_rate": 6.507713847192643e-07, "loss": 0.4906, "step": 2030 }, { "epoch": 4.415330252786083, "grad_norm": 0.0777633935213089, "learning_rate": 6.459226409914332e-07, "loss": 0.6308, "step": 2031 }, { "epoch": 4.4175047567273715, "grad_norm": 0.06918834149837494, "learning_rate": 6.410914253802636e-07, "loss": 0.4799, "step": 2032 }, { "epoch": 4.41967926066866, "grad_norm": 0.07858038693666458, "learning_rate": 6.362777469387182e-07, "loss": 0.7899, "step": 2033 }, { "epoch": 4.421853764609948, "grad_norm": 0.06900496780872345, "learning_rate": 6.314816146868951e-07, "loss": 0.6013, "step": 2034 }, { "epoch": 4.424028268551237, "grad_norm": 0.0603947639465332, "learning_rate": 6.267030376120154e-07, "loss": 0.4777, "step": 2035 }, { "epoch": 4.426202772492525, "grad_norm": 0.07641428709030151, "learning_rate": 6.219420246684094e-07, "loss": 0.5478, "step": 2036 }, { "epoch": 4.428377276433814, "grad_norm": 0.06903336942195892, "learning_rate": 6.171985847774864e-07, "loss": 0.5309, "step": 2037 }, { "epoch": 4.430551780375102, "grad_norm": 0.0790371373295784, "learning_rate": 6.124727268277309e-07, "loss": 0.7011, "step": 2038 }, { "epoch": 4.432726284316391, "grad_norm": 0.07202243059873581, "learning_rate": 6.077644596746834e-07, "loss": 0.4531, "step": 2039 }, { "epoch": 4.4349007882576785, "grad_norm": 0.0744062215089798, "learning_rate": 6.030737921409169e-07, "loss": 0.756, "step": 2040 }, { "epoch": 4.437075292198967, "grad_norm": 0.06340514868497849, "learning_rate": 5.984007330160291e-07, "loss": 0.4688, "step": 2041 }, { "epoch": 4.439249796140255, "grad_norm": 0.07829028367996216, "learning_rate": 5.937452910566221e-07, "loss": 0.5874, "step": 2042 }, { "epoch": 4.441424300081544, "grad_norm": 0.06800806522369385, "learning_rate": 5.891074749862857e-07, "loss": 0.5048, "step": 2043 }, { "epoch": 4.443598804022832, "grad_norm": 0.06820278614759445, "learning_rate": 5.844872934955781e-07, "loss": 0.4637, "step": 2044 }, { "epoch": 4.445773307964121, "grad_norm": 0.07489970326423645, "learning_rate": 5.798847552420184e-07, "loss": 0.5633, "step": 2045 }, { "epoch": 4.447947811905409, "grad_norm": 0.07076244056224823, "learning_rate": 5.752998688500611e-07, "loss": 0.4922, "step": 2046 }, { "epoch": 4.450122315846698, "grad_norm": 0.06336143612861633, "learning_rate": 5.707326429110871e-07, "loss": 0.4616, "step": 2047 }, { "epoch": 4.4522968197879855, "grad_norm": 0.0720994621515274, "learning_rate": 5.661830859833817e-07, "loss": 0.4677, "step": 2048 }, { "epoch": 4.454471323729274, "grad_norm": 0.06567011028528214, "learning_rate": 5.616512065921187e-07, "loss": 0.5201, "step": 2049 }, { "epoch": 4.456645827670562, "grad_norm": 0.07548706978559494, "learning_rate": 5.571370132293552e-07, "loss": 0.5042, "step": 2050 }, { "epoch": 4.458820331611851, "grad_norm": 0.07709793746471405, "learning_rate": 5.526405143539992e-07, "loss": 0.6704, "step": 2051 }, { "epoch": 4.460994835553139, "grad_norm": 0.0783550888299942, "learning_rate": 5.481617183918053e-07, "loss": 0.5324, "step": 2052 }, { "epoch": 4.463169339494428, "grad_norm": 0.06674841791391373, "learning_rate": 5.437006337353556e-07, "loss": 0.5197, "step": 2053 }, { "epoch": 4.465343843435717, "grad_norm": 0.06913979351520538, "learning_rate": 5.392572687440423e-07, "loss": 0.5662, "step": 2054 }, { "epoch": 4.467518347377005, "grad_norm": 0.07597506791353226, "learning_rate": 5.348316317440549e-07, "loss": 0.5207, "step": 2055 }, { "epoch": 4.469692851318293, "grad_norm": 0.08883388340473175, "learning_rate": 5.304237310283655e-07, "loss": 0.5472, "step": 2056 }, { "epoch": 4.471867355259581, "grad_norm": 0.07169458270072937, "learning_rate": 5.26033574856708e-07, "loss": 0.5635, "step": 2057 }, { "epoch": 4.47404185920087, "grad_norm": 0.0716870129108429, "learning_rate": 5.216611714555631e-07, "loss": 0.6265, "step": 2058 }, { "epoch": 4.476216363142158, "grad_norm": 0.07178358733654022, "learning_rate": 5.173065290181544e-07, "loss": 0.7098, "step": 2059 }, { "epoch": 4.478390867083447, "grad_norm": 0.08498340100049973, "learning_rate": 5.129696557044173e-07, "loss": 0.5691, "step": 2060 }, { "epoch": 4.480565371024735, "grad_norm": 0.0688110813498497, "learning_rate": 5.086505596409885e-07, "loss": 0.4541, "step": 2061 }, { "epoch": 4.482739874966024, "grad_norm": 0.0645025447010994, "learning_rate": 5.043492489211999e-07, "loss": 0.4846, "step": 2062 }, { "epoch": 4.484914378907312, "grad_norm": 0.06932462751865387, "learning_rate": 5.000657316050505e-07, "loss": 0.4986, "step": 2063 }, { "epoch": 4.4870888828486, "grad_norm": 0.06804119050502777, "learning_rate": 4.958000157192023e-07, "loss": 0.4611, "step": 2064 }, { "epoch": 4.489263386789888, "grad_norm": 0.06467209756374359, "learning_rate": 4.915521092569553e-07, "loss": 0.5282, "step": 2065 }, { "epoch": 4.491437890731177, "grad_norm": 0.06818696111440659, "learning_rate": 4.873220201782414e-07, "loss": 0.5614, "step": 2066 }, { "epoch": 4.493612394672465, "grad_norm": 0.06696145236492157, "learning_rate": 4.831097564095999e-07, "loss": 0.4718, "step": 2067 }, { "epoch": 4.495786898613754, "grad_norm": 0.0750451534986496, "learning_rate": 4.789153258441737e-07, "loss": 0.5496, "step": 2068 }, { "epoch": 4.497961402555042, "grad_norm": 0.07344862073659897, "learning_rate": 4.747387363416878e-07, "loss": 0.5525, "step": 2069 }, { "epoch": 4.500135906496331, "grad_norm": 0.07685412466526031, "learning_rate": 4.7057999572843516e-07, "loss": 0.4923, "step": 2070 }, { "epoch": 4.502310410437619, "grad_norm": 0.07314995676279068, "learning_rate": 4.6643911179726107e-07, "loss": 0.5738, "step": 2071 }, { "epoch": 4.504484914378907, "grad_norm": 0.07131587713956833, "learning_rate": 4.623160923075498e-07, "loss": 0.6023, "step": 2072 }, { "epoch": 4.506659418320195, "grad_norm": 0.07520224153995514, "learning_rate": 4.582109449852168e-07, "loss": 0.5416, "step": 2073 }, { "epoch": 4.508833922261484, "grad_norm": 0.07269252091646194, "learning_rate": 4.5412367752268094e-07, "loss": 0.59, "step": 2074 }, { "epoch": 4.511008426202773, "grad_norm": 0.07278648018836975, "learning_rate": 4.50054297578858e-07, "loss": 0.5641, "step": 2075 }, { "epoch": 4.513182930144061, "grad_norm": 0.0712626576423645, "learning_rate": 4.4600281277914715e-07, "loss": 0.6812, "step": 2076 }, { "epoch": 4.515357434085349, "grad_norm": 0.076114721596241, "learning_rate": 4.4196923071541334e-07, "loss": 0.5703, "step": 2077 }, { "epoch": 4.517531938026638, "grad_norm": 0.07070983201265335, "learning_rate": 4.3795355894597494e-07, "loss": 0.4503, "step": 2078 }, { "epoch": 4.5197064419679265, "grad_norm": 0.0671825259923935, "learning_rate": 4.3395580499559276e-07, "loss": 0.4964, "step": 2079 }, { "epoch": 4.521880945909214, "grad_norm": 0.06670922785997391, "learning_rate": 4.2997597635544563e-07, "loss": 0.4539, "step": 2080 }, { "epoch": 4.524055449850503, "grad_norm": 0.06983844935894012, "learning_rate": 4.2601408048312585e-07, "loss": 0.5306, "step": 2081 }, { "epoch": 4.526229953791791, "grad_norm": 0.06778093427419662, "learning_rate": 4.2207012480262486e-07, "loss": 0.5243, "step": 2082 }, { "epoch": 4.52840445773308, "grad_norm": 0.0751190334558487, "learning_rate": 4.181441167043154e-07, "loss": 0.5211, "step": 2083 }, { "epoch": 4.530578961674368, "grad_norm": 0.06628070026636124, "learning_rate": 4.1423606354493716e-07, "loss": 0.4655, "step": 2084 }, { "epoch": 4.532753465615657, "grad_norm": 0.07510156184434891, "learning_rate": 4.103459726475889e-07, "loss": 0.5706, "step": 2085 }, { "epoch": 4.534927969556945, "grad_norm": 0.07054702192544937, "learning_rate": 4.0647385130170637e-07, "loss": 0.5498, "step": 2086 }, { "epoch": 4.5371024734982335, "grad_norm": 0.06781338155269623, "learning_rate": 4.026197067630555e-07, "loss": 0.5409, "step": 2087 }, { "epoch": 4.539276977439521, "grad_norm": 0.07024931907653809, "learning_rate": 3.9878354625371927e-07, "loss": 0.5743, "step": 2088 }, { "epoch": 4.54145148138081, "grad_norm": 0.06503729522228241, "learning_rate": 3.949653769620765e-07, "loss": 0.5449, "step": 2089 }, { "epoch": 4.543625985322098, "grad_norm": 0.07308085262775421, "learning_rate": 3.9116520604279285e-07, "loss": 0.4927, "step": 2090 }, { "epoch": 4.545800489263387, "grad_norm": 0.07071816176176071, "learning_rate": 3.8738304061681107e-07, "loss": 0.4774, "step": 2091 }, { "epoch": 4.547974993204675, "grad_norm": 0.08317436277866364, "learning_rate": 3.836188877713354e-07, "loss": 0.6557, "step": 2092 }, { "epoch": 4.550149497145964, "grad_norm": 0.06253214180469513, "learning_rate": 3.7987275455981467e-07, "loss": 0.5006, "step": 2093 }, { "epoch": 4.552324001087252, "grad_norm": 0.07177116721868515, "learning_rate": 3.761446480019315e-07, "loss": 0.6719, "step": 2094 }, { "epoch": 4.5544985050285405, "grad_norm": 0.07539907842874527, "learning_rate": 3.7243457508358784e-07, "loss": 0.5587, "step": 2095 }, { "epoch": 4.556673008969828, "grad_norm": 0.0661344826221466, "learning_rate": 3.6874254275689913e-07, "loss": 0.4681, "step": 2096 }, { "epoch": 4.558847512911117, "grad_norm": 0.07326968759298325, "learning_rate": 3.650685579401692e-07, "loss": 0.587, "step": 2097 }, { "epoch": 4.561022016852405, "grad_norm": 0.06711960583925247, "learning_rate": 3.6141262751788643e-07, "loss": 0.5023, "step": 2098 }, { "epoch": 4.563196520793694, "grad_norm": 0.06824001669883728, "learning_rate": 3.5777475834070985e-07, "loss": 0.4789, "step": 2099 }, { "epoch": 4.565371024734983, "grad_norm": 0.07779508084058762, "learning_rate": 3.541549572254488e-07, "loss": 0.5249, "step": 2100 }, { "epoch": 4.567545528676271, "grad_norm": 0.07398110628128052, "learning_rate": 3.505532309550619e-07, "loss": 0.4807, "step": 2101 }, { "epoch": 4.569720032617559, "grad_norm": 0.07046978175640106, "learning_rate": 3.4696958627863596e-07, "loss": 0.5628, "step": 2102 }, { "epoch": 4.5718945365588475, "grad_norm": 0.06408274173736572, "learning_rate": 3.43404029911375e-07, "loss": 0.482, "step": 2103 }, { "epoch": 4.574069040500136, "grad_norm": 0.07577970623970032, "learning_rate": 3.398565685345878e-07, "loss": 0.5352, "step": 2104 }, { "epoch": 4.576243544441424, "grad_norm": 0.07048167288303375, "learning_rate": 3.3632720879567594e-07, "loss": 0.502, "step": 2105 }, { "epoch": 4.578418048382713, "grad_norm": 0.07222486287355423, "learning_rate": 3.328159573081258e-07, "loss": 0.5791, "step": 2106 }, { "epoch": 4.580592552324001, "grad_norm": 0.0730973407626152, "learning_rate": 3.2932282065148533e-07, "loss": 0.6022, "step": 2107 }, { "epoch": 4.58276705626529, "grad_norm": 0.07026448845863342, "learning_rate": 3.2584780537136206e-07, "loss": 0.509, "step": 2108 }, { "epoch": 4.584941560206578, "grad_norm": 0.07138234376907349, "learning_rate": 3.223909179794027e-07, "loss": 0.4889, "step": 2109 }, { "epoch": 4.5871160641478665, "grad_norm": 0.07017572969198227, "learning_rate": 3.1895216495329116e-07, "loss": 0.5161, "step": 2110 }, { "epoch": 4.5892905680891545, "grad_norm": 0.09332267194986343, "learning_rate": 3.155315527367264e-07, "loss": 0.5711, "step": 2111 }, { "epoch": 4.591465072030443, "grad_norm": 0.06764230132102966, "learning_rate": 3.1212908773941344e-07, "loss": 0.4698, "step": 2112 }, { "epoch": 4.593639575971731, "grad_norm": 0.08266621828079224, "learning_rate": 3.087447763370544e-07, "loss": 0.587, "step": 2113 }, { "epoch": 4.59581407991302, "grad_norm": 0.06525610387325287, "learning_rate": 3.053786248713331e-07, "loss": 0.4705, "step": 2114 }, { "epoch": 4.597988583854308, "grad_norm": 0.05913282185792923, "learning_rate": 3.020306396499062e-07, "loss": 0.4822, "step": 2115 }, { "epoch": 4.600163087795597, "grad_norm": 0.06352110207080841, "learning_rate": 2.9870082694638644e-07, "loss": 0.5093, "step": 2116 }, { "epoch": 4.602337591736885, "grad_norm": 0.07105749845504761, "learning_rate": 2.953891930003372e-07, "loss": 0.5437, "step": 2117 }, { "epoch": 4.6045120956781735, "grad_norm": 0.0710003525018692, "learning_rate": 2.920957440172556e-07, "loss": 0.5006, "step": 2118 }, { "epoch": 4.6066865996194615, "grad_norm": 0.06965562701225281, "learning_rate": 2.888204861685628e-07, "loss": 0.5304, "step": 2119 }, { "epoch": 4.60886110356075, "grad_norm": 0.07342911511659622, "learning_rate": 2.8556342559159513e-07, "loss": 0.6032, "step": 2120 }, { "epoch": 4.611035607502038, "grad_norm": 0.07039739191532135, "learning_rate": 2.82324568389587e-07, "loss": 0.5426, "step": 2121 }, { "epoch": 4.613210111443327, "grad_norm": 0.06918011605739594, "learning_rate": 2.791039206316637e-07, "loss": 0.5254, "step": 2122 }, { "epoch": 4.615384615384615, "grad_norm": 0.08256839215755463, "learning_rate": 2.759014883528288e-07, "loss": 0.7587, "step": 2123 }, { "epoch": 4.617559119325904, "grad_norm": 0.07751629501581192, "learning_rate": 2.727172775539522e-07, "loss": 0.5908, "step": 2124 }, { "epoch": 4.619733623267193, "grad_norm": 0.07983749359846115, "learning_rate": 2.6955129420176193e-07, "loss": 0.6437, "step": 2125 }, { "epoch": 4.6219081272084805, "grad_norm": 0.06842846423387527, "learning_rate": 2.6640354422882706e-07, "loss": 0.4997, "step": 2126 }, { "epoch": 4.6240826311497685, "grad_norm": 0.08924654126167297, "learning_rate": 2.6327403353355264e-07, "loss": 0.5527, "step": 2127 }, { "epoch": 4.626257135091057, "grad_norm": 0.07044955343008041, "learning_rate": 2.601627679801633e-07, "loss": 0.5598, "step": 2128 }, { "epoch": 4.628431639032346, "grad_norm": 0.07956954836845398, "learning_rate": 2.570697533986999e-07, "loss": 0.5219, "step": 2129 }, { "epoch": 4.630606142973634, "grad_norm": 0.07262980192899704, "learning_rate": 2.539949955849985e-07, "loss": 0.5346, "step": 2130 }, { "epoch": 4.632780646914923, "grad_norm": 0.07692865282297134, "learning_rate": 2.5093850030068813e-07, "loss": 0.5181, "step": 2131 }, { "epoch": 4.634955150856211, "grad_norm": 0.07324913889169693, "learning_rate": 2.47900273273175e-07, "loss": 0.541, "step": 2132 }, { "epoch": 4.6371296547975, "grad_norm": 0.07266723364591599, "learning_rate": 2.44880320195634e-07, "loss": 0.5035, "step": 2133 }, { "epoch": 4.6393041587387875, "grad_norm": 0.07445555925369263, "learning_rate": 2.4187864672699843e-07, "loss": 0.5845, "step": 2134 }, { "epoch": 4.641478662680076, "grad_norm": 0.06808460503816605, "learning_rate": 2.3889525849194573e-07, "loss": 0.477, "step": 2135 }, { "epoch": 4.643653166621364, "grad_norm": 0.07705479115247726, "learning_rate": 2.3593016108089172e-07, "loss": 0.5507, "step": 2136 }, { "epoch": 4.645827670562653, "grad_norm": 0.070450060069561, "learning_rate": 2.3298336004997756e-07, "loss": 0.555, "step": 2137 }, { "epoch": 4.648002174503941, "grad_norm": 0.07029717415571213, "learning_rate": 2.3005486092106066e-07, "loss": 0.6549, "step": 2138 }, { "epoch": 4.65017667844523, "grad_norm": 0.06391594558954239, "learning_rate": 2.271446691817014e-07, "loss": 0.5033, "step": 2139 }, { "epoch": 4.652351182386518, "grad_norm": 0.08751088380813599, "learning_rate": 2.2425279028515658e-07, "loss": 0.6931, "step": 2140 }, { "epoch": 4.654525686327807, "grad_norm": 0.07637713849544525, "learning_rate": 2.2137922965036473e-07, "loss": 0.4797, "step": 2141 }, { "epoch": 4.6567001902690945, "grad_norm": 0.08041494339704514, "learning_rate": 2.1852399266194312e-07, "loss": 0.7861, "step": 2142 }, { "epoch": 4.658874694210383, "grad_norm": 0.07553854584693909, "learning_rate": 2.1568708467017197e-07, "loss": 0.5596, "step": 2143 }, { "epoch": 4.661049198151671, "grad_norm": 0.07027629762887955, "learning_rate": 2.1286851099098339e-07, "loss": 0.4992, "step": 2144 }, { "epoch": 4.66322370209296, "grad_norm": 0.06982608139514923, "learning_rate": 2.1006827690595478e-07, "loss": 0.5171, "step": 2145 }, { "epoch": 4.665398206034249, "grad_norm": 0.07245378196239471, "learning_rate": 2.0728638766229993e-07, "loss": 0.4473, "step": 2146 }, { "epoch": 4.667572709975537, "grad_norm": 0.06311751157045364, "learning_rate": 2.0452284847285343e-07, "loss": 0.4723, "step": 2147 }, { "epoch": 4.669747213916825, "grad_norm": 0.07150807231664658, "learning_rate": 2.0177766451607073e-07, "loss": 0.561, "step": 2148 }, { "epoch": 4.671921717858114, "grad_norm": 0.07484742254018784, "learning_rate": 1.990508409360048e-07, "loss": 0.5071, "step": 2149 }, { "epoch": 4.674096221799402, "grad_norm": 0.07232362776994705, "learning_rate": 1.9634238284230945e-07, "loss": 0.5623, "step": 2150 }, { "epoch": 4.67627072574069, "grad_norm": 0.0930938646197319, "learning_rate": 1.9365229531022267e-07, "loss": 0.5944, "step": 2151 }, { "epoch": 4.678445229681979, "grad_norm": 0.08295295387506485, "learning_rate": 1.909805833805589e-07, "loss": 0.6624, "step": 2152 }, { "epoch": 4.680619733623267, "grad_norm": 0.06264510005712509, "learning_rate": 1.8832725205969904e-07, "loss": 0.4351, "step": 2153 }, { "epoch": 4.682794237564556, "grad_norm": 0.09110794216394424, "learning_rate": 1.8569230631958258e-07, "loss": 0.5838, "step": 2154 }, { "epoch": 4.684968741505844, "grad_norm": 0.06513982266187668, "learning_rate": 1.830757510976966e-07, "loss": 0.4692, "step": 2155 }, { "epoch": 4.687143245447133, "grad_norm": 0.08374425768852234, "learning_rate": 1.8047759129706577e-07, "loss": 0.5113, "step": 2156 }, { "epoch": 4.689317749388421, "grad_norm": 0.06577417254447937, "learning_rate": 1.7789783178624898e-07, "loss": 0.5238, "step": 2157 }, { "epoch": 4.691492253329709, "grad_norm": 0.08540230244398117, "learning_rate": 1.7533647739932046e-07, "loss": 0.5851, "step": 2158 }, { "epoch": 4.693666757270997, "grad_norm": 0.06824812293052673, "learning_rate": 1.7279353293586765e-07, "loss": 0.5022, "step": 2159 }, { "epoch": 4.695841261212286, "grad_norm": 0.06775882095098495, "learning_rate": 1.7026900316098217e-07, "loss": 0.4328, "step": 2160 }, { "epoch": 4.698015765153574, "grad_norm": 0.06423519551753998, "learning_rate": 1.6776289280524992e-07, "loss": 0.4183, "step": 2161 }, { "epoch": 4.700190269094863, "grad_norm": 0.06971113383769989, "learning_rate": 1.6527520656473784e-07, "loss": 0.5075, "step": 2162 }, { "epoch": 4.702364773036151, "grad_norm": 0.07357890903949738, "learning_rate": 1.6280594910099257e-07, "loss": 0.5807, "step": 2163 }, { "epoch": 4.70453927697744, "grad_norm": 0.06896945834159851, "learning_rate": 1.6035512504102624e-07, "loss": 0.5634, "step": 2164 }, { "epoch": 4.706713780918728, "grad_norm": 0.06138351187109947, "learning_rate": 1.5792273897730858e-07, "loss": 0.4898, "step": 2165 }, { "epoch": 4.708888284860016, "grad_norm": 0.07171276211738586, "learning_rate": 1.5550879546776364e-07, "loss": 0.6321, "step": 2166 }, { "epoch": 4.711062788801304, "grad_norm": 0.07973835617303848, "learning_rate": 1.531132990357509e-07, "loss": 0.5588, "step": 2167 }, { "epoch": 4.713237292742593, "grad_norm": 0.06782406568527222, "learning_rate": 1.5073625417006855e-07, "loss": 0.5066, "step": 2168 }, { "epoch": 4.715411796683881, "grad_norm": 0.0664784163236618, "learning_rate": 1.4837766532493469e-07, "loss": 0.5394, "step": 2169 }, { "epoch": 4.71758630062517, "grad_norm": 0.07184542715549469, "learning_rate": 1.4603753691998735e-07, "loss": 0.526, "step": 2170 }, { "epoch": 4.719760804566459, "grad_norm": 0.07462891936302185, "learning_rate": 1.4371587334026992e-07, "loss": 0.6412, "step": 2171 }, { "epoch": 4.721935308507747, "grad_norm": 0.07026626169681549, "learning_rate": 1.414126789362269e-07, "loss": 0.5849, "step": 2172 }, { "epoch": 4.724109812449035, "grad_norm": 0.08897142857313156, "learning_rate": 1.391279580236926e-07, "loss": 0.5905, "step": 2173 }, { "epoch": 4.726284316390323, "grad_norm": 0.07412701100111008, "learning_rate": 1.368617148838869e-07, "loss": 0.5757, "step": 2174 }, { "epoch": 4.728458820331612, "grad_norm": 0.06492888927459717, "learning_rate": 1.3461395376340502e-07, "loss": 0.5188, "step": 2175 }, { "epoch": 4.7306333242729, "grad_norm": 0.06708363443613052, "learning_rate": 1.323846788742078e-07, "loss": 0.4958, "step": 2176 }, { "epoch": 4.732807828214189, "grad_norm": 0.07983957231044769, "learning_rate": 1.3017389439361928e-07, "loss": 0.5156, "step": 2177 }, { "epoch": 4.734982332155477, "grad_norm": 0.07370737195014954, "learning_rate": 1.2798160446431006e-07, "loss": 0.4995, "step": 2178 }, { "epoch": 4.737156836096766, "grad_norm": 0.06829570978879929, "learning_rate": 1.2580781319429858e-07, "loss": 0.5571, "step": 2179 }, { "epoch": 4.739331340038054, "grad_norm": 0.0692853182554245, "learning_rate": 1.2365252465694088e-07, "loss": 0.5346, "step": 2180 }, { "epoch": 4.7415058439793425, "grad_norm": 0.06629079580307007, "learning_rate": 1.215157428909175e-07, "loss": 0.5423, "step": 2181 }, { "epoch": 4.74368034792063, "grad_norm": 0.06775284558534622, "learning_rate": 1.193974719002311e-07, "loss": 0.5046, "step": 2182 }, { "epoch": 4.745854851861919, "grad_norm": 0.07466073334217072, "learning_rate": 1.172977156541999e-07, "loss": 0.5888, "step": 2183 }, { "epoch": 4.748029355803207, "grad_norm": 0.07126619666814804, "learning_rate": 1.1521647808744873e-07, "loss": 0.4912, "step": 2184 }, { "epoch": 4.750203859744496, "grad_norm": 0.0688086599111557, "learning_rate": 1.13153763099898e-07, "loss": 0.4766, "step": 2185 }, { "epoch": 4.752378363685784, "grad_norm": 0.07677243649959564, "learning_rate": 1.1110957455676252e-07, "loss": 0.5837, "step": 2186 }, { "epoch": 4.754552867627073, "grad_norm": 0.07109079509973526, "learning_rate": 1.0908391628854042e-07, "loss": 0.5943, "step": 2187 }, { "epoch": 4.756727371568361, "grad_norm": 0.07110583037137985, "learning_rate": 1.0707679209100652e-07, "loss": 0.5458, "step": 2188 }, { "epoch": 4.7589018755096495, "grad_norm": 0.07421605288982391, "learning_rate": 1.0508820572520672e-07, "loss": 0.5281, "step": 2189 }, { "epoch": 4.761076379450937, "grad_norm": 0.07433706521987915, "learning_rate": 1.0311816091744698e-07, "loss": 0.5928, "step": 2190 }, { "epoch": 4.763250883392226, "grad_norm": 0.072684146463871, "learning_rate": 1.011666613592932e-07, "loss": 0.5064, "step": 2191 }, { "epoch": 4.765425387333514, "grad_norm": 0.08320503681898117, "learning_rate": 9.923371070755805e-08, "loss": 0.6043, "step": 2192 }, { "epoch": 4.767599891274803, "grad_norm": 0.07756945490837097, "learning_rate": 9.731931258429638e-08, "loss": 0.5865, "step": 2193 }, { "epoch": 4.769774395216091, "grad_norm": 0.06509677320718765, "learning_rate": 9.54234705767998e-08, "loss": 0.558, "step": 2194 }, { "epoch": 4.77194889915738, "grad_norm": 0.06622958183288574, "learning_rate": 9.354618823758654e-08, "loss": 0.5393, "step": 2195 }, { "epoch": 4.774123403098669, "grad_norm": 0.06812475621700287, "learning_rate": 9.168746908439718e-08, "loss": 0.4872, "step": 2196 }, { "epoch": 4.7762979070399565, "grad_norm": 0.07115953415632248, "learning_rate": 8.984731660019008e-08, "loss": 0.5189, "step": 2197 }, { "epoch": 4.778472410981244, "grad_norm": 0.06483960896730423, "learning_rate": 8.802573423313032e-08, "loss": 0.4798, "step": 2198 }, { "epoch": 4.780646914922533, "grad_norm": 0.06403125077486038, "learning_rate": 8.622272539658416e-08, "loss": 0.5207, "step": 2199 }, { "epoch": 4.782821418863822, "grad_norm": 0.07612393796443939, "learning_rate": 8.443829346911792e-08, "loss": 0.5502, "step": 2200 }, { "epoch": 4.78499592280511, "grad_norm": 0.0718282163143158, "learning_rate": 8.267244179448464e-08, "loss": 0.6272, "step": 2201 }, { "epoch": 4.787170426746399, "grad_norm": 0.06716926395893097, "learning_rate": 8.092517368162078e-08, "loss": 0.5798, "step": 2202 }, { "epoch": 4.789344930687687, "grad_norm": 0.06599205732345581, "learning_rate": 7.919649240464177e-08, "loss": 0.496, "step": 2203 }, { "epoch": 4.791519434628976, "grad_norm": 0.07206237316131592, "learning_rate": 7.748640120283201e-08, "loss": 0.5236, "step": 2204 }, { "epoch": 4.7936939385702635, "grad_norm": 0.08089233189821243, "learning_rate": 7.579490328064265e-08, "loss": 0.5696, "step": 2205 }, { "epoch": 4.795868442511552, "grad_norm": 0.07823186367750168, "learning_rate": 7.412200180768269e-08, "loss": 0.6681, "step": 2206 }, { "epoch": 4.79804294645284, "grad_norm": 0.0736357569694519, "learning_rate": 7.246769991871683e-08, "loss": 0.4911, "step": 2207 }, { "epoch": 4.800217450394129, "grad_norm": 0.0648379698395729, "learning_rate": 7.083200071365204e-08, "loss": 0.5198, "step": 2208 }, { "epoch": 4.802391954335417, "grad_norm": 0.06974142789840698, "learning_rate": 6.921490725754098e-08, "loss": 0.5426, "step": 2209 }, { "epoch": 4.804566458276706, "grad_norm": 0.07555797696113586, "learning_rate": 6.761642258056977e-08, "loss": 0.6071, "step": 2210 }, { "epoch": 4.806740962217994, "grad_norm": 0.08195941895246506, "learning_rate": 6.603654967805684e-08, "loss": 0.5579, "step": 2211 }, { "epoch": 4.808915466159283, "grad_norm": 0.07414057850837708, "learning_rate": 6.447529151044296e-08, "loss": 0.5734, "step": 2212 }, { "epoch": 4.8110899701005705, "grad_norm": 0.06250805407762527, "learning_rate": 6.293265100328682e-08, "loss": 0.4772, "step": 2213 }, { "epoch": 4.813264474041859, "grad_norm": 0.06746966391801834, "learning_rate": 6.140863104726391e-08, "loss": 0.5142, "step": 2214 }, { "epoch": 4.815438977983147, "grad_norm": 0.06834477186203003, "learning_rate": 5.990323449815316e-08, "loss": 0.4558, "step": 2215 }, { "epoch": 4.817613481924436, "grad_norm": 0.07878415286540985, "learning_rate": 5.8416464176840323e-08, "loss": 0.5438, "step": 2216 }, { "epoch": 4.819787985865725, "grad_norm": 0.07121751457452774, "learning_rate": 5.694832286930685e-08, "loss": 0.5189, "step": 2217 }, { "epoch": 4.821962489807013, "grad_norm": 0.0768832266330719, "learning_rate": 5.5498813326624325e-08, "loss": 0.6198, "step": 2218 }, { "epoch": 4.824136993748301, "grad_norm": 0.06909306347370148, "learning_rate": 5.406793826495449e-08, "loss": 0.4569, "step": 2219 }, { "epoch": 4.82631149768959, "grad_norm": 0.0662676990032196, "learning_rate": 5.265570036553813e-08, "loss": 0.4769, "step": 2220 }, { "epoch": 4.828486001630878, "grad_norm": 0.08105573803186417, "learning_rate": 5.126210227469508e-08, "loss": 0.5226, "step": 2221 }, { "epoch": 4.830660505572166, "grad_norm": 0.0637318566441536, "learning_rate": 4.988714660381422e-08, "loss": 0.4955, "step": 2222 }, { "epoch": 4.832835009513455, "grad_norm": 0.07083940505981445, "learning_rate": 4.85308359293557e-08, "loss": 0.5306, "step": 2223 }, { "epoch": 4.835009513454743, "grad_norm": 0.07919806241989136, "learning_rate": 4.7193172792837635e-08, "loss": 0.5124, "step": 2224 }, { "epoch": 4.837184017396032, "grad_norm": 0.08247973769903183, "learning_rate": 4.58741597008372e-08, "loss": 0.6108, "step": 2225 }, { "epoch": 4.83935852133732, "grad_norm": 0.07369326800107956, "learning_rate": 4.457379912498394e-08, "loss": 0.4636, "step": 2226 }, { "epoch": 4.841533025278609, "grad_norm": 0.07369400560855865, "learning_rate": 4.329209350195651e-08, "loss": 0.4994, "step": 2227 }, { "epoch": 4.843707529219897, "grad_norm": 0.07321297377347946, "learning_rate": 4.202904523347484e-08, "loss": 0.5465, "step": 2228 }, { "epoch": 4.845882033161185, "grad_norm": 0.06843553483486176, "learning_rate": 4.078465668629905e-08, "loss": 0.5108, "step": 2229 }, { "epoch": 4.848056537102473, "grad_norm": 0.0723588615655899, "learning_rate": 3.955893019222501e-08, "loss": 0.5083, "step": 2230 }, { "epoch": 4.850231041043762, "grad_norm": 0.07396338135004044, "learning_rate": 3.835186804807656e-08, "loss": 0.5018, "step": 2231 }, { "epoch": 4.85240554498505, "grad_norm": 0.07323028892278671, "learning_rate": 3.716347251570551e-08, "loss": 0.5536, "step": 2232 }, { "epoch": 4.854580048926339, "grad_norm": 0.07941214740276337, "learning_rate": 3.599374582198278e-08, "loss": 0.5812, "step": 2233 }, { "epoch": 4.856754552867627, "grad_norm": 0.0677742138504982, "learning_rate": 3.484269015879838e-08, "loss": 0.5765, "step": 2234 }, { "epoch": 4.858929056808916, "grad_norm": 0.07814174145460129, "learning_rate": 3.371030768305583e-08, "loss": 0.5617, "step": 2235 }, { "epoch": 4.861103560750204, "grad_norm": 0.0678529441356659, "learning_rate": 3.259660051666669e-08, "loss": 0.4754, "step": 2236 }, { "epoch": 4.863278064691492, "grad_norm": 0.07551296055316925, "learning_rate": 3.150157074655047e-08, "loss": 0.5042, "step": 2237 }, { "epoch": 4.86545256863278, "grad_norm": 0.06703916192054749, "learning_rate": 3.042522042462359e-08, "loss": 0.5478, "step": 2238 }, { "epoch": 4.867627072574069, "grad_norm": 0.07536730170249939, "learning_rate": 2.93675515678038e-08, "loss": 0.5533, "step": 2239 }, { "epoch": 4.869801576515357, "grad_norm": 0.07237864285707474, "learning_rate": 2.8328566158002392e-08, "loss": 0.4817, "step": 2240 }, { "epoch": 4.871976080456646, "grad_norm": 0.08149593323469162, "learning_rate": 2.7308266142119788e-08, "loss": 0.5154, "step": 2241 }, { "epoch": 4.874150584397935, "grad_norm": 0.07380037754774094, "learning_rate": 2.6306653432041086e-08, "loss": 0.5855, "step": 2242 }, { "epoch": 4.876325088339223, "grad_norm": 0.06916176527738571, "learning_rate": 2.5323729904637162e-08, "loss": 0.4885, "step": 2243 }, { "epoch": 4.8784995922805106, "grad_norm": 0.06733262538909912, "learning_rate": 2.4359497401758026e-08, "loss": 0.4972, "step": 2244 }, { "epoch": 4.880674096221799, "grad_norm": 0.0801735371351242, "learning_rate": 2.3413957730226144e-08, "loss": 0.516, "step": 2245 }, { "epoch": 4.882848600163088, "grad_norm": 0.07888341695070267, "learning_rate": 2.2487112661840894e-08, "loss": 0.7006, "step": 2246 }, { "epoch": 4.885023104104376, "grad_norm": 0.06968782842159271, "learning_rate": 2.1578963933367448e-08, "loss": 0.4865, "step": 2247 }, { "epoch": 4.887197608045665, "grad_norm": 0.082059845328331, "learning_rate": 2.0689513246540115e-08, "loss": 0.6364, "step": 2248 }, { "epoch": 4.889372111986953, "grad_norm": 0.07065799832344055, "learning_rate": 1.9818762268052348e-08, "loss": 0.5413, "step": 2249 }, { "epoch": 4.891546615928242, "grad_norm": 0.06795171648263931, "learning_rate": 1.896671262955896e-08, "loss": 0.5308, "step": 2250 }, { "epoch": 4.89372111986953, "grad_norm": 0.07398146390914917, "learning_rate": 1.8133365927672785e-08, "loss": 0.5167, "step": 2251 }, { "epoch": 4.895895623810818, "grad_norm": 0.0759168341755867, "learning_rate": 1.731872372395693e-08, "loss": 0.5713, "step": 2252 }, { "epoch": 4.898070127752106, "grad_norm": 0.07015284895896912, "learning_rate": 1.6522787544926977e-08, "loss": 0.5704, "step": 2253 }, { "epoch": 4.900244631693395, "grad_norm": 0.07504396885633469, "learning_rate": 1.574555888204765e-08, "loss": 0.5738, "step": 2254 }, { "epoch": 4.902419135634683, "grad_norm": 0.07114327698945999, "learning_rate": 1.498703919172506e-08, "loss": 0.5041, "step": 2255 }, { "epoch": 4.904593639575972, "grad_norm": 0.06816327571868896, "learning_rate": 1.4247229895311132e-08, "loss": 0.5334, "step": 2256 }, { "epoch": 4.90676814351726, "grad_norm": 0.06439364701509476, "learning_rate": 1.3526132379095835e-08, "loss": 0.5083, "step": 2257 }, { "epoch": 4.908942647458549, "grad_norm": 0.11078181117773056, "learning_rate": 1.282374799430497e-08, "loss": 0.7005, "step": 2258 }, { "epoch": 4.911117151399837, "grad_norm": 0.07011247426271439, "learning_rate": 1.2140078057101269e-08, "loss": 0.5563, "step": 2259 }, { "epoch": 4.913291655341125, "grad_norm": 0.06760277599096298, "learning_rate": 1.147512384857663e-08, "loss": 0.5056, "step": 2260 }, { "epoch": 4.915466159282413, "grad_norm": 0.07695885002613068, "learning_rate": 1.0828886614754342e-08, "loss": 0.5674, "step": 2261 }, { "epoch": 4.917640663223702, "grad_norm": 0.06357279419898987, "learning_rate": 1.0201367566585741e-08, "loss": 0.4784, "step": 2262 }, { "epoch": 4.91981516716499, "grad_norm": 0.070188969373703, "learning_rate": 9.59256787994578e-09, "loss": 0.528, "step": 2263 }, { "epoch": 4.921989671106279, "grad_norm": 0.07261460274457932, "learning_rate": 9.002488695631918e-09, "loss": 0.528, "step": 2264 }, { "epoch": 4.924164175047567, "grad_norm": 0.06748499721288681, "learning_rate": 8.431131119361891e-09, "loss": 0.5607, "step": 2265 }, { "epoch": 4.926338678988856, "grad_norm": 0.06439201533794403, "learning_rate": 7.878496221773724e-09, "loss": 0.5117, "step": 2266 }, { "epoch": 4.9285131829301445, "grad_norm": 0.06791197508573532, "learning_rate": 7.344585038420171e-09, "loss": 0.4925, "step": 2267 }, { "epoch": 4.930687686871432, "grad_norm": 0.06292358785867691, "learning_rate": 6.82939856977094e-09, "loss": 0.5062, "step": 2268 }, { "epoch": 4.93286219081272, "grad_norm": 0.0688910260796547, "learning_rate": 6.3329377812060276e-09, "loss": 0.4842, "step": 2269 }, { "epoch": 4.935036694754009, "grad_norm": 0.08897803723812103, "learning_rate": 5.855203603017945e-09, "loss": 0.5618, "step": 2270 }, { "epoch": 4.937211198695298, "grad_norm": 0.08232980966567993, "learning_rate": 5.396196930407272e-09, "loss": 0.6367, "step": 2271 }, { "epoch": 4.939385702636586, "grad_norm": 0.07337010651826859, "learning_rate": 4.955918623483769e-09, "loss": 0.5163, "step": 2272 }, { "epoch": 4.941560206577875, "grad_norm": 0.07455073297023773, "learning_rate": 4.534369507259717e-09, "loss": 0.6039, "step": 2273 }, { "epoch": 4.943734710519163, "grad_norm": 0.07806413620710373, "learning_rate": 4.1315503716554685e-09, "loss": 0.6523, "step": 2274 }, { "epoch": 4.9459092144604515, "grad_norm": 0.06529662758111954, "learning_rate": 3.747461971492783e-09, "loss": 0.5635, "step": 2275 }, { "epoch": 4.948083718401739, "grad_norm": 0.06418381631374359, "learning_rate": 3.382105026494831e-09, "loss": 0.4515, "step": 2276 }, { "epoch": 4.950258222343028, "grad_norm": 0.07691580802202225, "learning_rate": 3.0354802212839705e-09, "loss": 0.5926, "step": 2277 }, { "epoch": 4.952432726284316, "grad_norm": 0.07175649702548981, "learning_rate": 2.7075882053828605e-09, "loss": 0.4844, "step": 2278 }, { "epoch": 4.954607230225605, "grad_norm": 0.07054652273654938, "learning_rate": 2.398429593212237e-09, "loss": 0.4473, "step": 2279 }, { "epoch": 4.956781734166893, "grad_norm": 0.07644648849964142, "learning_rate": 2.108004964086474e-09, "loss": 0.6114, "step": 2280 }, { "epoch": 4.958956238108182, "grad_norm": 0.06783640384674072, "learning_rate": 1.836314862219135e-09, "loss": 0.5363, "step": 2281 }, { "epoch": 4.96113074204947, "grad_norm": 0.0700225904583931, "learning_rate": 1.58335979671409e-09, "loss": 0.5463, "step": 2282 }, { "epoch": 4.9633052459907585, "grad_norm": 0.06670036911964417, "learning_rate": 1.3491402415710674e-09, "loss": 0.5188, "step": 2283 }, { "epoch": 4.965479749932046, "grad_norm": 0.07620628923177719, "learning_rate": 1.1336566356834334e-09, "loss": 0.5723, "step": 2284 }, { "epoch": 4.967654253873335, "grad_norm": 0.07078179717063904, "learning_rate": 9.369093828326403e-10, "loss": 0.5484, "step": 2285 }, { "epoch": 4.969828757814623, "grad_norm": 0.07602770626544952, "learning_rate": 7.588988516937789e-10, "loss": 0.538, "step": 2286 }, { "epoch": 4.972003261755912, "grad_norm": 0.07233285903930664, "learning_rate": 5.996253758322468e-10, "loss": 0.6317, "step": 2287 }, { "epoch": 4.9741777656972, "grad_norm": 0.07114741951227188, "learning_rate": 4.590892537015279e-10, "loss": 0.6553, "step": 2288 }, { "epoch": 4.976352269638489, "grad_norm": 0.07282888144254684, "learning_rate": 3.3729074864541354e-10, "loss": 0.5303, "step": 2289 }, { "epoch": 4.978526773579777, "grad_norm": 0.07105815410614014, "learning_rate": 2.3423008889467134e-10, "loss": 0.4715, "step": 2290 }, { "epoch": 4.9807012775210655, "grad_norm": 0.0798407644033432, "learning_rate": 1.49907467570376e-10, "loss": 0.7339, "step": 2291 }, { "epoch": 4.982875781462354, "grad_norm": 0.07527700811624527, "learning_rate": 8.432304268057856e-11, "loss": 0.6021, "step": 2292 }, { "epoch": 4.985050285403642, "grad_norm": 0.06803774833679199, "learning_rate": 3.747693711919631e-11, "loss": 0.5128, "step": 2293 }, { "epoch": 4.987224789344931, "grad_norm": 0.06823109835386276, "learning_rate": 9.36923866934336e-12, "loss": 0.4631, "step": 2294 }, { "epoch": 4.989399293286219, "grad_norm": 0.06771142035722733, "learning_rate": 0.0, "loss": 0.502, "step": 2295 }, { "epoch": 4.989399293286219, "eval_loss": 0.5490781664848328, "eval_runtime": 12.7093, "eval_samples_per_second": 5.823, "eval_steps_per_second": 5.823, "step": 2295 }, { "epoch": 4.989399293286219, "step": 2295, "total_flos": 1.750819795768443e+17, "train_loss": 0.5716329248772207, "train_runtime": 9487.0263, "train_samples_per_second": 1.939, "train_steps_per_second": 0.242 } ], "logging_steps": 1, "max_steps": 2295, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.750819795768443e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }