{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9731084776663628, "eval_steps": 137, "global_step": 1644, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.07610916346311569, "learning_rate": 2e-05, "loss": 1.795, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.8087825775146484, "eval_runtime": 75.9539, "eval_samples_per_second": 65.829, "eval_steps_per_second": 16.457, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.0771929994225502, "learning_rate": 4e-05, "loss": 1.7825, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.08941341191530228, "learning_rate": 6e-05, "loss": 1.7737, "step": 3 }, { "epoch": 0.01, "grad_norm": 0.08335491269826889, "learning_rate": 8e-05, "loss": 1.8004, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.08835520595312119, "learning_rate": 0.0001, "loss": 1.8495, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.08816578984260559, "learning_rate": 0.00012, "loss": 1.7758, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.09536299854516983, "learning_rate": 0.00014, "loss": 1.8001, "step": 7 }, { "epoch": 0.01, "grad_norm": 0.07634323835372925, "learning_rate": 0.00016, "loss": 1.7022, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.06886536628007889, "learning_rate": 0.00018, "loss": 1.8428, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.07389801740646362, "learning_rate": 0.0002, "loss": 1.7598, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.06829163432121277, "learning_rate": 0.00019999981517295864, "loss": 1.7479, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.060045819729566574, "learning_rate": 0.0001999992606925178, "loss": 1.7454, "step": 12 }, { "epoch": 0.02, "grad_norm": 0.08187604695558548, "learning_rate": 0.0001999983365607271, "loss": 1.7679, "step": 13 }, { "epoch": 0.03, "grad_norm": 0.05995490401983261, "learning_rate": 0.00019999704278100263, "loss": 1.7599, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.055336710065603256, "learning_rate": 0.00019999537935812698, "loss": 1.8244, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.0541992112994194, "learning_rate": 0.00019999334629824895, "loss": 1.7756, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.05088195204734802, "learning_rate": 0.00019999094360888392, "loss": 1.7352, "step": 17 }, { "epoch": 0.03, "grad_norm": 0.05157861113548279, "learning_rate": 0.00019998817129891346, "loss": 1.7634, "step": 18 }, { "epoch": 0.03, "grad_norm": 0.055710840970277786, "learning_rate": 0.00019998502937858557, "loss": 1.7802, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.055150121450424194, "learning_rate": 0.00019998151785951448, "loss": 1.7445, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.0526655912399292, "learning_rate": 0.0001999776367546806, "loss": 1.6634, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.04809674620628357, "learning_rate": 0.00019997338607843075, "loss": 1.7277, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.049412671476602554, "learning_rate": 0.00019996876584647754, "loss": 1.7357, "step": 23 }, { "epoch": 0.04, "grad_norm": 0.04948608949780464, "learning_rate": 0.00019996377607589997, "loss": 1.7323, "step": 24 }, { "epoch": 0.05, "grad_norm": 0.050225820392370224, "learning_rate": 0.00019995841678514294, "loss": 1.7273, "step": 25 }, { "epoch": 0.05, "grad_norm": 0.05085042864084244, "learning_rate": 0.00019995268799401718, "loss": 1.7564, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.04916631057858467, "learning_rate": 0.00019994658972369948, "loss": 1.7439, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.04791415110230446, "learning_rate": 0.00019994012199673234, "loss": 1.6813, "step": 28 }, { "epoch": 0.05, "grad_norm": 0.04975065216422081, "learning_rate": 0.00019993328483702393, "loss": 1.691, "step": 29 }, { "epoch": 0.05, "grad_norm": 0.055913638323545456, "learning_rate": 0.00019992607826984816, "loss": 1.7242, "step": 30 }, { "epoch": 0.06, "grad_norm": 0.045829374343156815, "learning_rate": 0.00019991850232184435, "loss": 1.7334, "step": 31 }, { "epoch": 0.06, "grad_norm": 0.053105831146240234, "learning_rate": 0.00019991055702101734, "loss": 1.7214, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.04539350047707558, "learning_rate": 0.00019990224239673722, "loss": 1.7698, "step": 33 }, { "epoch": 0.06, "grad_norm": 0.046983517706394196, "learning_rate": 0.00019989355847973932, "loss": 1.6887, "step": 34 }, { "epoch": 0.06, "grad_norm": 0.0471692830324173, "learning_rate": 0.00019988450530212414, "loss": 1.7571, "step": 35 }, { "epoch": 0.07, "grad_norm": 0.046874694526195526, "learning_rate": 0.00019987508289735716, "loss": 1.7558, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.04474163055419922, "learning_rate": 0.00019986529130026857, "loss": 1.7465, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.044651810079813004, "learning_rate": 0.00019985513054705348, "loss": 1.6983, "step": 38 }, { "epoch": 0.07, "grad_norm": 0.04951983690261841, "learning_rate": 0.00019984460067527153, "loss": 1.761, "step": 39 }, { "epoch": 0.07, "grad_norm": 0.04424133151769638, "learning_rate": 0.00019983370172384682, "loss": 1.6383, "step": 40 }, { "epoch": 0.07, "grad_norm": 0.052418872714042664, "learning_rate": 0.00019982243373306772, "loss": 1.779, "step": 41 }, { "epoch": 0.08, "grad_norm": 0.04530750587582588, "learning_rate": 0.0001998107967445869, "loss": 1.6942, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.04790988191962242, "learning_rate": 0.0001997987908014209, "loss": 1.7053, "step": 43 }, { "epoch": 0.08, "grad_norm": 0.04889607056975365, "learning_rate": 0.0001997864159479502, "loss": 1.7275, "step": 44 }, { "epoch": 0.08, "grad_norm": 0.04314807429909706, "learning_rate": 0.00019977367222991893, "loss": 1.7393, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.04405505582690239, "learning_rate": 0.00019976055969443479, "loss": 1.7306, "step": 46 }, { "epoch": 0.09, "grad_norm": 0.04656574875116348, "learning_rate": 0.00019974707838996882, "loss": 1.7686, "step": 47 }, { "epoch": 0.09, "grad_norm": 0.04246290400624275, "learning_rate": 0.00019973322836635518, "loss": 1.7209, "step": 48 }, { "epoch": 0.09, "grad_norm": 0.05493748560547829, "learning_rate": 0.00019971900967479106, "loss": 1.7155, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.0450466088950634, "learning_rate": 0.0001997044223678364, "loss": 1.6604, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.08634985238313675, "learning_rate": 0.00019968946649941382, "loss": 1.7321, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.04310084879398346, "learning_rate": 0.00019967414212480831, "loss": 1.7281, "step": 52 }, { "epoch": 0.1, "grad_norm": 0.04666193947196007, "learning_rate": 0.000199658449300667, "loss": 1.6787, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.04957772046327591, "learning_rate": 0.00019964238808499907, "loss": 1.6919, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.0421697273850441, "learning_rate": 0.00019962595853717548, "loss": 1.7245, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.04654068127274513, "learning_rate": 0.0001996091607179287, "loss": 1.7123, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.04076274484395981, "learning_rate": 0.00019959199468935258, "loss": 1.7066, "step": 57 }, { "epoch": 0.11, "grad_norm": 0.04215634986758232, "learning_rate": 0.00019957446051490198, "loss": 1.7748, "step": 58 }, { "epoch": 0.11, "grad_norm": 0.04252045601606369, "learning_rate": 0.0001995565582593928, "loss": 1.7396, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.04455842077732086, "learning_rate": 0.00019953828798900135, "loss": 1.7236, "step": 60 }, { "epoch": 0.11, "grad_norm": 0.044083647429943085, "learning_rate": 0.0001995196497712645, "loss": 1.7416, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.04511955380439758, "learning_rate": 0.00019950064367507916, "loss": 1.7481, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.0424315445125103, "learning_rate": 0.00019948126977070217, "loss": 1.7712, "step": 63 }, { "epoch": 0.12, "grad_norm": 0.04309271275997162, "learning_rate": 0.00019946152812974993, "loss": 1.6927, "step": 64 }, { "epoch": 0.12, "grad_norm": 0.042915165424346924, "learning_rate": 0.00019944141882519817, "loss": 1.7465, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.05950941890478134, "learning_rate": 0.00019942094193138186, "loss": 1.7035, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.042048510164022446, "learning_rate": 0.0001994000975239946, "loss": 1.7521, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.041577938944101334, "learning_rate": 0.00019937888568008862, "loss": 1.7439, "step": 68 }, { "epoch": 0.13, "grad_norm": 0.04538682475686073, "learning_rate": 0.00019935730647807436, "loss": 1.7528, "step": 69 }, { "epoch": 0.13, "grad_norm": 0.04102981090545654, "learning_rate": 0.00019933535999772025, "loss": 1.6828, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.04318905994296074, "learning_rate": 0.00019931304632015228, "loss": 1.7532, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.043007493019104004, "learning_rate": 0.00019929036552785397, "loss": 1.7353, "step": 72 }, { "epoch": 0.13, "grad_norm": 0.04308176040649414, "learning_rate": 0.00019926731770466568, "loss": 1.6882, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.04227353632450104, "learning_rate": 0.00019924390293578472, "loss": 1.7302, "step": 74 }, { "epoch": 0.14, "grad_norm": 0.0429629310965538, "learning_rate": 0.0001992201213077647, "loss": 1.6822, "step": 75 }, { "epoch": 0.14, "grad_norm": 0.042203355580568314, "learning_rate": 0.00019919597290851538, "loss": 1.7601, "step": 76 }, { "epoch": 0.14, "grad_norm": 0.04265713319182396, "learning_rate": 0.00019917145782730232, "loss": 1.7725, "step": 77 }, { "epoch": 0.14, "grad_norm": 0.04848012328147888, "learning_rate": 0.00019914657615474653, "loss": 1.7587, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.042650256305933, "learning_rate": 0.00019912132798282408, "loss": 1.7422, "step": 79 }, { "epoch": 0.15, "grad_norm": 0.04107372462749481, "learning_rate": 0.00019909571340486593, "loss": 1.7059, "step": 80 }, { "epoch": 0.15, "grad_norm": 0.04788720980286598, "learning_rate": 0.00019906973251555734, "loss": 1.7205, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.041231803596019745, "learning_rate": 0.0001990433854109378, "loss": 1.7277, "step": 82 }, { "epoch": 0.15, "grad_norm": 0.04246293380856514, "learning_rate": 0.0001990166721884004, "loss": 1.7739, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.04331424832344055, "learning_rate": 0.00019898959294669167, "loss": 1.6913, "step": 84 }, { "epoch": 0.15, "grad_norm": 0.04720227047801018, "learning_rate": 0.00019896214778591115, "loss": 1.7079, "step": 85 }, { "epoch": 0.16, "grad_norm": 0.05255519971251488, "learning_rate": 0.00019893433680751103, "loss": 1.7182, "step": 86 }, { "epoch": 0.16, "grad_norm": 0.042392294853925705, "learning_rate": 0.00019890616011429568, "loss": 1.778, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.043008286505937576, "learning_rate": 0.0001988776178104214, "loss": 1.7518, "step": 88 }, { "epoch": 0.16, "grad_norm": 0.044135116040706635, "learning_rate": 0.00019884871000139595, "loss": 1.7534, "step": 89 }, { "epoch": 0.16, "grad_norm": 0.041827455163002014, "learning_rate": 0.00019881943679407832, "loss": 1.7291, "step": 90 }, { "epoch": 0.17, "grad_norm": 0.05515114963054657, "learning_rate": 0.00019878979829667803, "loss": 1.7471, "step": 91 }, { "epoch": 0.17, "grad_norm": 0.040826503187417984, "learning_rate": 0.00019875979461875503, "loss": 1.6408, "step": 92 }, { "epoch": 0.17, "grad_norm": 0.04585504159331322, "learning_rate": 0.00019872942587121915, "loss": 1.6874, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.04665527120232582, "learning_rate": 0.00019869869216632968, "loss": 1.6968, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.046703219413757324, "learning_rate": 0.000198667593617695, "loss": 1.7401, "step": 95 }, { "epoch": 0.18, "grad_norm": 0.04115475341677666, "learning_rate": 0.00019863613034027224, "loss": 1.7227, "step": 96 }, { "epoch": 0.18, "grad_norm": 0.04217168688774109, "learning_rate": 0.00019860430245036663, "loss": 1.7268, "step": 97 }, { "epoch": 0.18, "grad_norm": 0.044889383018016815, "learning_rate": 0.00019857211006563125, "loss": 1.7006, "step": 98 }, { "epoch": 0.18, "grad_norm": 0.04161443933844566, "learning_rate": 0.00019853955330506663, "loss": 1.7266, "step": 99 }, { "epoch": 0.18, "grad_norm": 0.042708829045295715, "learning_rate": 0.00019850663228902012, "loss": 1.7314, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.046648308634757996, "learning_rate": 0.00019847334713918557, "loss": 1.7362, "step": 101 }, { "epoch": 0.19, "grad_norm": 0.04414999857544899, "learning_rate": 0.00019843969797860294, "loss": 1.7065, "step": 102 }, { "epoch": 0.19, "grad_norm": 0.04574083164334297, "learning_rate": 0.00019840568493165772, "loss": 1.7333, "step": 103 }, { "epoch": 0.19, "grad_norm": 0.041924796998500824, "learning_rate": 0.0001983713081240805, "loss": 1.6517, "step": 104 }, { "epoch": 0.19, "grad_norm": 0.04238827899098396, "learning_rate": 0.00019833656768294662, "loss": 1.776, "step": 105 }, { "epoch": 0.19, "grad_norm": 0.04292167350649834, "learning_rate": 0.00019830146373667548, "loss": 1.6601, "step": 106 }, { "epoch": 0.2, "grad_norm": 0.0433412566781044, "learning_rate": 0.00019826599641503025, "loss": 1.6841, "step": 107 }, { "epoch": 0.2, "grad_norm": 0.04201202839612961, "learning_rate": 0.00019823016584911735, "loss": 1.764, "step": 108 }, { "epoch": 0.2, "grad_norm": 0.04234587028622627, "learning_rate": 0.00019819397217138595, "loss": 1.7243, "step": 109 }, { "epoch": 0.2, "grad_norm": 0.04268571734428406, "learning_rate": 0.0001981574155156274, "loss": 1.7656, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.041506245732307434, "learning_rate": 0.00019812049601697492, "loss": 1.6636, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.04152766987681389, "learning_rate": 0.00019808321381190294, "loss": 1.7478, "step": 112 }, { "epoch": 0.21, "grad_norm": 0.041750356554985046, "learning_rate": 0.00019804556903822663, "loss": 1.7518, "step": 113 }, { "epoch": 0.21, "grad_norm": 0.04935223609209061, "learning_rate": 0.00019800756183510144, "loss": 1.7673, "step": 114 }, { "epoch": 0.21, "grad_norm": 0.042300984263420105, "learning_rate": 0.00019796919234302255, "loss": 1.7753, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.04224342852830887, "learning_rate": 0.00019793046070382437, "loss": 1.7226, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.044274065643548965, "learning_rate": 0.00019789136706067998, "loss": 1.7065, "step": 117 }, { "epoch": 0.22, "grad_norm": 0.04910755529999733, "learning_rate": 0.00019785191155810062, "loss": 1.6387, "step": 118 }, { "epoch": 0.22, "grad_norm": 0.04774147644639015, "learning_rate": 0.00019781209434193515, "loss": 1.7297, "step": 119 }, { "epoch": 0.22, "grad_norm": 0.04416586086153984, "learning_rate": 0.00019777191555936957, "loss": 1.8096, "step": 120 }, { "epoch": 0.22, "grad_norm": 0.04406105354428291, "learning_rate": 0.00019773137535892635, "loss": 1.7629, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.043473679572343826, "learning_rate": 0.00019769047389046402, "loss": 1.6979, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.04570621997117996, "learning_rate": 0.00019764921130517653, "loss": 1.7123, "step": 123 }, { "epoch": 0.23, "grad_norm": 0.04326749965548515, "learning_rate": 0.00019760758775559274, "loss": 1.716, "step": 124 }, { "epoch": 0.23, "grad_norm": 0.04397182539105415, "learning_rate": 0.00019756560339557572, "loss": 1.73, "step": 125 }, { "epoch": 0.23, "grad_norm": 0.04468885809183121, "learning_rate": 0.00019752325838032244, "loss": 1.7136, "step": 126 }, { "epoch": 0.23, "grad_norm": 0.04554520919919014, "learning_rate": 0.00019748055286636295, "loss": 1.7448, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.04646708443760872, "learning_rate": 0.00019743748701155995, "loss": 1.6956, "step": 128 }, { "epoch": 0.24, "grad_norm": 0.042717937380075455, "learning_rate": 0.00019739406097510812, "loss": 1.7245, "step": 129 }, { "epoch": 0.24, "grad_norm": 0.04367038235068321, "learning_rate": 0.00019735027491753353, "loss": 1.7102, "step": 130 }, { "epoch": 0.24, "grad_norm": 0.04296841099858284, "learning_rate": 0.0001973061290006932, "loss": 1.7163, "step": 131 }, { "epoch": 0.24, "grad_norm": 0.043665811419487, "learning_rate": 0.00019726162338777424, "loss": 1.7172, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.046134624630212784, "learning_rate": 0.00019721675824329354, "loss": 1.7327, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.04857848584651947, "learning_rate": 0.00019717153373309692, "loss": 1.6647, "step": 134 }, { "epoch": 0.25, "grad_norm": 0.047723885625600815, "learning_rate": 0.00019712595002435861, "loss": 1.7422, "step": 135 }, { "epoch": 0.25, "grad_norm": 0.04413154348731041, "learning_rate": 0.00019708000728558064, "loss": 1.6943, "step": 136 }, { "epoch": 0.25, "grad_norm": 0.043105412274599075, "learning_rate": 0.00019703370568659225, "loss": 1.7519, "step": 137 }, { "epoch": 0.25, "eval_loss": 1.7284438610076904, "eval_runtime": 76.3963, "eval_samples_per_second": 65.448, "eval_steps_per_second": 16.362, "step": 137 }, { "epoch": 0.25, "grad_norm": 0.04300757125020027, "learning_rate": 0.00019698704539854918, "loss": 1.7341, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.043961744755506516, "learning_rate": 0.00019694002659393305, "loss": 1.777, "step": 139 }, { "epoch": 0.26, "grad_norm": 0.04376057907938957, "learning_rate": 0.00019689264944655084, "loss": 1.7403, "step": 140 }, { "epoch": 0.26, "grad_norm": 0.04482461139559746, "learning_rate": 0.00019684491413153411, "loss": 1.6852, "step": 141 }, { "epoch": 0.26, "grad_norm": 0.045192863792181015, "learning_rate": 0.0001967968208253384, "loss": 1.7494, "step": 142 }, { "epoch": 0.26, "grad_norm": 0.04361759498715401, "learning_rate": 0.00019674836970574254, "loss": 1.7331, "step": 143 }, { "epoch": 0.26, "grad_norm": 0.04294734448194504, "learning_rate": 0.0001966995609518481, "loss": 1.6375, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.04528161138296127, "learning_rate": 0.00019665039474407863, "loss": 1.746, "step": 145 }, { "epoch": 0.27, "grad_norm": 0.04510699212551117, "learning_rate": 0.00019660087126417906, "loss": 1.7053, "step": 146 }, { "epoch": 0.27, "grad_norm": 0.042807720601558685, "learning_rate": 0.00019655099069521486, "loss": 1.6748, "step": 147 }, { "epoch": 0.27, "grad_norm": 0.04657953232526779, "learning_rate": 0.00019650075322157168, "loss": 1.684, "step": 148 }, { "epoch": 0.27, "grad_norm": 0.04593012481927872, "learning_rate": 0.00019645015902895437, "loss": 1.7076, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.04362139105796814, "learning_rate": 0.0001963992083043864, "loss": 1.6773, "step": 150 }, { "epoch": 0.28, "grad_norm": 0.04773354157805443, "learning_rate": 0.00019634790123620926, "loss": 1.7107, "step": 151 }, { "epoch": 0.28, "grad_norm": 0.05423569679260254, "learning_rate": 0.00019629623801408155, "loss": 1.7052, "step": 152 }, { "epoch": 0.28, "grad_norm": 0.043550509959459305, "learning_rate": 0.00019624421882897855, "loss": 1.7151, "step": 153 }, { "epoch": 0.28, "grad_norm": 0.04896851256489754, "learning_rate": 0.00019619184387319123, "loss": 1.6611, "step": 154 }, { "epoch": 0.28, "grad_norm": 0.04392845928668976, "learning_rate": 0.00019613911334032583, "loss": 1.738, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.04582325741648674, "learning_rate": 0.00019608602742530283, "loss": 1.6885, "step": 156 }, { "epoch": 0.29, "grad_norm": 0.045696284621953964, "learning_rate": 0.00019603258632435656, "loss": 1.7365, "step": 157 }, { "epoch": 0.29, "grad_norm": 0.043873440474271774, "learning_rate": 0.00019597879023503417, "loss": 1.8094, "step": 158 }, { "epoch": 0.29, "grad_norm": 0.05078018456697464, "learning_rate": 0.00019592463935619517, "loss": 1.7341, "step": 159 }, { "epoch": 0.29, "grad_norm": 0.042483873665332794, "learning_rate": 0.00019587013388801047, "loss": 1.7351, "step": 160 }, { "epoch": 0.29, "grad_norm": 0.045154914259910583, "learning_rate": 0.00019581527403196168, "loss": 1.6645, "step": 161 }, { "epoch": 0.3, "grad_norm": 0.04563280567526817, "learning_rate": 0.0001957600599908406, "loss": 1.7069, "step": 162 }, { "epoch": 0.3, "grad_norm": 0.0451313816010952, "learning_rate": 0.00019570449196874815, "loss": 1.7392, "step": 163 }, { "epoch": 0.3, "grad_norm": 0.04682654142379761, "learning_rate": 0.0001956485701710938, "loss": 1.6987, "step": 164 }, { "epoch": 0.3, "grad_norm": 0.04211273416876793, "learning_rate": 0.00019559229480459474, "loss": 1.6973, "step": 165 }, { "epoch": 0.3, "grad_norm": 0.04460490494966507, "learning_rate": 0.00019553566607727517, "loss": 1.7233, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.044608812779188156, "learning_rate": 0.00019547868419846548, "loss": 1.7371, "step": 167 }, { "epoch": 0.31, "grad_norm": 0.04518236592411995, "learning_rate": 0.00019542134937880154, "loss": 1.7257, "step": 168 }, { "epoch": 0.31, "grad_norm": 0.04374237731099129, "learning_rate": 0.00019536366183022384, "loss": 1.7136, "step": 169 }, { "epoch": 0.31, "grad_norm": 0.04429790750145912, "learning_rate": 0.00019530562176597673, "loss": 1.7216, "step": 170 }, { "epoch": 0.31, "grad_norm": 0.04807354509830475, "learning_rate": 0.0001952472294006077, "loss": 1.6568, "step": 171 }, { "epoch": 0.31, "grad_norm": 0.04785493016242981, "learning_rate": 0.00019518848494996655, "loss": 1.7272, "step": 172 }, { "epoch": 0.32, "grad_norm": 0.04472104460000992, "learning_rate": 0.0001951293886312045, "loss": 1.7283, "step": 173 }, { "epoch": 0.32, "grad_norm": 0.04852326214313507, "learning_rate": 0.00019506994066277348, "loss": 1.6968, "step": 174 }, { "epoch": 0.32, "grad_norm": 0.04624422639608383, "learning_rate": 0.0001950101412644254, "loss": 1.758, "step": 175 }, { "epoch": 0.32, "grad_norm": 0.044666189700365067, "learning_rate": 0.00019494999065721108, "loss": 1.6933, "step": 176 }, { "epoch": 0.32, "grad_norm": 0.05367857217788696, "learning_rate": 0.0001948894890634798, "loss": 1.7328, "step": 177 }, { "epoch": 0.32, "grad_norm": 0.046923939138650894, "learning_rate": 0.0001948286367068781, "loss": 1.7367, "step": 178 }, { "epoch": 0.33, "grad_norm": 0.04480034112930298, "learning_rate": 0.00019476743381234926, "loss": 1.7677, "step": 179 }, { "epoch": 0.33, "grad_norm": 0.045380428433418274, "learning_rate": 0.00019470588060613222, "loss": 1.7439, "step": 180 }, { "epoch": 0.33, "grad_norm": 0.04550057277083397, "learning_rate": 0.00019464397731576094, "loss": 1.6895, "step": 181 }, { "epoch": 0.33, "grad_norm": 0.049537234008312225, "learning_rate": 0.00019458172417006347, "loss": 1.7274, "step": 182 }, { "epoch": 0.33, "grad_norm": 0.04696514084935188, "learning_rate": 0.0001945191213991611, "loss": 1.7121, "step": 183 }, { "epoch": 0.34, "grad_norm": 0.04783783480525017, "learning_rate": 0.00019445616923446755, "loss": 1.6942, "step": 184 }, { "epoch": 0.34, "grad_norm": 0.04514686018228531, "learning_rate": 0.00019439286790868802, "loss": 1.7219, "step": 185 }, { "epoch": 0.34, "grad_norm": 0.045743513852357864, "learning_rate": 0.00019432921765581847, "loss": 1.76, "step": 186 }, { "epoch": 0.34, "grad_norm": 0.04406295716762543, "learning_rate": 0.00019426521871114468, "loss": 1.7531, "step": 187 }, { "epoch": 0.34, "grad_norm": 0.04445353150367737, "learning_rate": 0.00019420087131124131, "loss": 1.7742, "step": 188 }, { "epoch": 0.34, "grad_norm": 0.04396241530776024, "learning_rate": 0.0001941361756939712, "loss": 1.7701, "step": 189 }, { "epoch": 0.35, "grad_norm": 0.04415050894021988, "learning_rate": 0.0001940711320984843, "loss": 1.7062, "step": 190 }, { "epoch": 0.35, "grad_norm": 0.04672138765454292, "learning_rate": 0.00019400574076521693, "loss": 1.754, "step": 191 }, { "epoch": 0.35, "grad_norm": 0.04417939484119415, "learning_rate": 0.00019394000193589088, "loss": 1.7357, "step": 192 }, { "epoch": 0.35, "grad_norm": 0.04567494988441467, "learning_rate": 0.00019387391585351234, "loss": 1.752, "step": 193 }, { "epoch": 0.35, "grad_norm": 0.045080311596393585, "learning_rate": 0.00019380748276237123, "loss": 1.736, "step": 194 }, { "epoch": 0.36, "grad_norm": 0.04506627842783928, "learning_rate": 0.0001937407029080402, "loss": 1.6726, "step": 195 }, { "epoch": 0.36, "grad_norm": 0.04523961618542671, "learning_rate": 0.0001936735765373737, "loss": 1.7621, "step": 196 }, { "epoch": 0.36, "grad_norm": 0.04326867312192917, "learning_rate": 0.00019360610389850712, "loss": 1.7341, "step": 197 }, { "epoch": 0.36, "grad_norm": 0.05188523977994919, "learning_rate": 0.00019353828524085577, "loss": 1.7277, "step": 198 }, { "epoch": 0.36, "grad_norm": 0.04654062166810036, "learning_rate": 0.00019347012081511415, "loss": 1.6845, "step": 199 }, { "epoch": 0.36, "grad_norm": 0.044841405004262924, "learning_rate": 0.0001934016108732548, "loss": 1.6611, "step": 200 }, { "epoch": 0.37, "grad_norm": 0.0941338911652565, "learning_rate": 0.00019333275566852756, "loss": 1.6978, "step": 201 }, { "epoch": 0.37, "grad_norm": 0.05048836022615433, "learning_rate": 0.00019326355545545845, "loss": 1.7056, "step": 202 }, { "epoch": 0.37, "grad_norm": 0.046358656138181686, "learning_rate": 0.00019319401048984892, "loss": 1.649, "step": 203 }, { "epoch": 0.37, "grad_norm": 0.04557095095515251, "learning_rate": 0.00019312412102877473, "loss": 1.6793, "step": 204 }, { "epoch": 0.37, "grad_norm": 0.04551040008664131, "learning_rate": 0.0001930538873305852, "loss": 1.7339, "step": 205 }, { "epoch": 0.38, "grad_norm": 0.044258005917072296, "learning_rate": 0.000192983309654902, "loss": 1.6627, "step": 206 }, { "epoch": 0.38, "grad_norm": 0.0485963337123394, "learning_rate": 0.00019291238826261843, "loss": 1.715, "step": 207 }, { "epoch": 0.38, "grad_norm": 0.047103844583034515, "learning_rate": 0.00019284112341589832, "loss": 1.6855, "step": 208 }, { "epoch": 0.38, "grad_norm": 0.045252177864313126, "learning_rate": 0.000192769515378175, "loss": 1.7557, "step": 209 }, { "epoch": 0.38, "grad_norm": 0.049794841557741165, "learning_rate": 0.00019269756441415062, "loss": 1.7116, "step": 210 }, { "epoch": 0.38, "grad_norm": 0.04380947723984718, "learning_rate": 0.00019262527078979478, "loss": 1.7663, "step": 211 }, { "epoch": 0.39, "grad_norm": 0.046488065272569656, "learning_rate": 0.00019255263477234381, "loss": 1.6724, "step": 212 }, { "epoch": 0.39, "grad_norm": 0.0422043539583683, "learning_rate": 0.00019247965663029976, "loss": 1.7345, "step": 213 }, { "epoch": 0.39, "grad_norm": 0.05002991482615471, "learning_rate": 0.0001924063366334293, "loss": 1.7468, "step": 214 }, { "epoch": 0.39, "grad_norm": 0.04376322776079178, "learning_rate": 0.0001923326750527628, "loss": 1.7748, "step": 215 }, { "epoch": 0.39, "grad_norm": 0.04664807394146919, "learning_rate": 0.00019225867216059325, "loss": 1.7156, "step": 216 }, { "epoch": 0.4, "grad_norm": 0.047952812165021896, "learning_rate": 0.0001921843282304754, "loss": 1.7247, "step": 217 }, { "epoch": 0.4, "grad_norm": 0.045118216425180435, "learning_rate": 0.00019210964353722464, "loss": 1.7354, "step": 218 }, { "epoch": 0.4, "grad_norm": 0.054903436452150345, "learning_rate": 0.00019203461835691594, "loss": 1.7241, "step": 219 }, { "epoch": 0.4, "grad_norm": 0.04747498407959938, "learning_rate": 0.000191959252966883, "loss": 1.7498, "step": 220 }, { "epoch": 0.4, "grad_norm": 0.04605628177523613, "learning_rate": 0.000191883547645717, "loss": 1.6889, "step": 221 }, { "epoch": 0.4, "grad_norm": 0.04835960268974304, "learning_rate": 0.00019180750267326578, "loss": 1.715, "step": 222 }, { "epoch": 0.41, "grad_norm": 0.04828386381268501, "learning_rate": 0.00019173111833063273, "loss": 1.6931, "step": 223 }, { "epoch": 0.41, "grad_norm": 0.04604095220565796, "learning_rate": 0.0001916543949001756, "loss": 1.6717, "step": 224 }, { "epoch": 0.41, "grad_norm": 0.049674633890390396, "learning_rate": 0.00019157733266550575, "loss": 1.7746, "step": 225 }, { "epoch": 0.41, "grad_norm": 0.04439341649413109, "learning_rate": 0.00019149993191148687, "loss": 1.6925, "step": 226 }, { "epoch": 0.41, "grad_norm": 0.04741811007261276, "learning_rate": 0.00019142219292423395, "loss": 1.7219, "step": 227 }, { "epoch": 0.42, "grad_norm": 0.049409981817007065, "learning_rate": 0.00019134411599111242, "loss": 1.7306, "step": 228 }, { "epoch": 0.42, "grad_norm": 0.04618163779377937, "learning_rate": 0.00019126570140073676, "loss": 1.7271, "step": 229 }, { "epoch": 0.42, "grad_norm": 0.04557076469063759, "learning_rate": 0.0001911869494429698, "loss": 1.7188, "step": 230 }, { "epoch": 0.42, "grad_norm": 0.04645569249987602, "learning_rate": 0.0001911078604089213, "loss": 1.7191, "step": 231 }, { "epoch": 0.42, "grad_norm": 0.04584849998354912, "learning_rate": 0.0001910284345909471, "loss": 1.7592, "step": 232 }, { "epoch": 0.42, "grad_norm": 0.045582644641399384, "learning_rate": 0.000190948672282648, "loss": 1.6902, "step": 233 }, { "epoch": 0.43, "grad_norm": 0.04627401754260063, "learning_rate": 0.00019086857377886865, "loss": 1.6937, "step": 234 }, { "epoch": 0.43, "grad_norm": 0.04470285400748253, "learning_rate": 0.00019078813937569643, "loss": 1.6977, "step": 235 }, { "epoch": 0.43, "grad_norm": 0.05287547782063484, "learning_rate": 0.00019070736937046035, "loss": 1.7539, "step": 236 }, { "epoch": 0.43, "grad_norm": 0.04990493878722191, "learning_rate": 0.00019062626406173006, "loss": 1.7469, "step": 237 }, { "epoch": 0.43, "grad_norm": 0.048645589500665665, "learning_rate": 0.00019054482374931467, "loss": 1.7037, "step": 238 }, { "epoch": 0.44, "grad_norm": 0.04730357602238655, "learning_rate": 0.0001904630487342616, "loss": 1.7388, "step": 239 }, { "epoch": 0.44, "grad_norm": 0.04754168912768364, "learning_rate": 0.00019038093931885553, "loss": 1.7805, "step": 240 }, { "epoch": 0.44, "grad_norm": 0.04760801047086716, "learning_rate": 0.00019029849580661727, "loss": 1.7383, "step": 241 }, { "epoch": 0.44, "grad_norm": 0.048467203974723816, "learning_rate": 0.0001902157185023026, "loss": 1.7078, "step": 242 }, { "epoch": 0.44, "grad_norm": 0.0522041916847229, "learning_rate": 0.00019013260771190126, "loss": 1.7052, "step": 243 }, { "epoch": 0.44, "grad_norm": 0.0501788929104805, "learning_rate": 0.00019004916374263563, "loss": 1.7818, "step": 244 }, { "epoch": 0.45, "grad_norm": 0.04538620635867119, "learning_rate": 0.00018996538690295979, "loss": 1.6589, "step": 245 }, { "epoch": 0.45, "grad_norm": 0.04511679336428642, "learning_rate": 0.00018988127750255824, "loss": 1.7179, "step": 246 }, { "epoch": 0.45, "grad_norm": 0.04756203666329384, "learning_rate": 0.0001897968358523448, "loss": 1.7333, "step": 247 }, { "epoch": 0.45, "grad_norm": 0.05278336629271507, "learning_rate": 0.00018971206226446147, "loss": 1.7431, "step": 248 }, { "epoch": 0.45, "grad_norm": 0.05926801264286041, "learning_rate": 0.00018962695705227728, "loss": 1.7768, "step": 249 }, { "epoch": 0.46, "grad_norm": 0.049290940165519714, "learning_rate": 0.00018954152053038712, "loss": 1.7119, "step": 250 }, { "epoch": 0.46, "grad_norm": 0.04777907952666283, "learning_rate": 0.0001894557530146106, "loss": 1.7559, "step": 251 }, { "epoch": 0.46, "grad_norm": 0.04726920276880264, "learning_rate": 0.00018936965482199084, "loss": 1.7861, "step": 252 }, { "epoch": 0.46, "grad_norm": 0.04677857458591461, "learning_rate": 0.0001892832262707933, "loss": 1.7039, "step": 253 }, { "epoch": 0.46, "grad_norm": 0.04724700003862381, "learning_rate": 0.00018919646768050468, "loss": 1.6704, "step": 254 }, { "epoch": 0.46, "grad_norm": 0.04969072341918945, "learning_rate": 0.00018910937937183166, "loss": 1.7168, "step": 255 }, { "epoch": 0.47, "grad_norm": 0.04533353075385094, "learning_rate": 0.0001890219616666997, "loss": 1.6751, "step": 256 }, { "epoch": 0.47, "grad_norm": 0.04647386819124222, "learning_rate": 0.0001889342148882519, "loss": 1.7146, "step": 257 }, { "epoch": 0.47, "grad_norm": 0.047208696603775024, "learning_rate": 0.00018884613936084784, "loss": 1.7378, "step": 258 }, { "epoch": 0.47, "grad_norm": 0.04841624200344086, "learning_rate": 0.0001887577354100623, "loss": 1.7128, "step": 259 }, { "epoch": 0.47, "grad_norm": 0.05073019117116928, "learning_rate": 0.00018866900336268408, "loss": 1.7206, "step": 260 }, { "epoch": 0.48, "grad_norm": 0.051456011831760406, "learning_rate": 0.00018857994354671482, "loss": 1.755, "step": 261 }, { "epoch": 0.48, "grad_norm": 0.04637736827135086, "learning_rate": 0.0001884905562913678, "loss": 1.7395, "step": 262 }, { "epoch": 0.48, "grad_norm": 0.061346374452114105, "learning_rate": 0.00018840084192706658, "loss": 1.674, "step": 263 }, { "epoch": 0.48, "grad_norm": 0.04413258284330368, "learning_rate": 0.00018831080078544402, "loss": 1.7288, "step": 264 }, { "epoch": 0.48, "grad_norm": 0.0531301349401474, "learning_rate": 0.0001882204331993409, "loss": 1.7625, "step": 265 }, { "epoch": 0.48, "grad_norm": 0.05146196484565735, "learning_rate": 0.00018812973950280468, "loss": 1.6815, "step": 266 }, { "epoch": 0.49, "grad_norm": 0.047678787261247635, "learning_rate": 0.0001880387200310883, "loss": 1.7278, "step": 267 }, { "epoch": 0.49, "grad_norm": 0.0556582510471344, "learning_rate": 0.0001879473751206489, "loss": 1.74, "step": 268 }, { "epoch": 0.49, "grad_norm": 0.047515787184238434, "learning_rate": 0.00018785570510914678, "loss": 1.7207, "step": 269 }, { "epoch": 0.49, "grad_norm": 0.04592055827379227, "learning_rate": 0.0001877637103354438, "loss": 1.6589, "step": 270 }, { "epoch": 0.49, "grad_norm": 0.04531411454081535, "learning_rate": 0.0001876713911396024, "loss": 1.706, "step": 271 }, { "epoch": 0.5, "grad_norm": 0.04682420939207077, "learning_rate": 0.0001875787478628843, "loss": 1.7297, "step": 272 }, { "epoch": 0.5, "grad_norm": 0.04545978829264641, "learning_rate": 0.00018748578084774913, "loss": 1.6572, "step": 273 }, { "epoch": 0.5, "grad_norm": 0.04849430173635483, "learning_rate": 0.00018739249043785324, "loss": 1.7442, "step": 274 }, { "epoch": 0.5, "eval_loss": 1.726025938987732, "eval_runtime": 76.0967, "eval_samples_per_second": 65.706, "eval_steps_per_second": 16.426, "step": 274 }, { "epoch": 0.5, "grad_norm": 0.04745488613843918, "learning_rate": 0.00018729887697804847, "loss": 1.7398, "step": 275 }, { "epoch": 0.5, "grad_norm": 0.05489857494831085, "learning_rate": 0.00018720494081438078, "loss": 1.701, "step": 276 }, { "epoch": 0.51, "grad_norm": 0.04818108305335045, "learning_rate": 0.00018711068229408903, "loss": 1.7068, "step": 277 }, { "epoch": 0.51, "grad_norm": 0.04530555009841919, "learning_rate": 0.0001870161017656037, "loss": 1.6966, "step": 278 }, { "epoch": 0.51, "grad_norm": 0.045606572180986404, "learning_rate": 0.00018692119957854558, "loss": 1.7086, "step": 279 }, { "epoch": 0.51, "grad_norm": 0.04626869410276413, "learning_rate": 0.00018682597608372445, "loss": 1.6981, "step": 280 }, { "epoch": 0.51, "grad_norm": 0.04752146080136299, "learning_rate": 0.0001867304316331379, "loss": 1.692, "step": 281 }, { "epoch": 0.51, "grad_norm": 0.046230729669332504, "learning_rate": 0.0001866345665799698, "loss": 1.7338, "step": 282 }, { "epoch": 0.52, "grad_norm": 0.04928119108080864, "learning_rate": 0.00018653838127858933, "loss": 1.738, "step": 283 }, { "epoch": 0.52, "grad_norm": 0.04641352593898773, "learning_rate": 0.00018644187608454936, "loss": 1.6792, "step": 284 }, { "epoch": 0.52, "grad_norm": 0.04860611632466316, "learning_rate": 0.00018634505135458525, "loss": 1.663, "step": 285 }, { "epoch": 0.52, "grad_norm": 0.046515002846717834, "learning_rate": 0.00018624790744661355, "loss": 1.7327, "step": 286 }, { "epoch": 0.52, "grad_norm": 0.04668186604976654, "learning_rate": 0.00018615044471973074, "loss": 1.6987, "step": 287 }, { "epoch": 0.53, "grad_norm": 0.047913163900375366, "learning_rate": 0.00018605266353421176, "loss": 1.7953, "step": 288 }, { "epoch": 0.53, "grad_norm": 0.04924839362502098, "learning_rate": 0.00018595456425150872, "loss": 1.7891, "step": 289 }, { "epoch": 0.53, "grad_norm": 0.049241986125707626, "learning_rate": 0.00018585614723424962, "loss": 1.7451, "step": 290 }, { "epoch": 0.53, "grad_norm": 0.05132036283612251, "learning_rate": 0.00018575741284623703, "loss": 1.7598, "step": 291 }, { "epoch": 0.53, "grad_norm": 0.04659922048449516, "learning_rate": 0.00018565836145244662, "loss": 1.7331, "step": 292 }, { "epoch": 0.53, "grad_norm": 0.0466977022588253, "learning_rate": 0.0001855589934190259, "loss": 1.7171, "step": 293 }, { "epoch": 0.54, "grad_norm": 0.049368374049663544, "learning_rate": 0.00018545930911329287, "loss": 1.6929, "step": 294 }, { "epoch": 0.54, "grad_norm": 0.04552480950951576, "learning_rate": 0.00018535930890373466, "loss": 1.753, "step": 295 }, { "epoch": 0.54, "grad_norm": 0.04755065590143204, "learning_rate": 0.00018525899316000608, "loss": 1.7472, "step": 296 }, { "epoch": 0.54, "grad_norm": 0.050540413707494736, "learning_rate": 0.0001851583622529284, "loss": 1.7585, "step": 297 }, { "epoch": 0.54, "grad_norm": 0.04644971713423729, "learning_rate": 0.00018505741655448792, "loss": 1.7531, "step": 298 }, { "epoch": 0.55, "grad_norm": 0.05085503309965134, "learning_rate": 0.00018495615643783446, "loss": 1.6954, "step": 299 }, { "epoch": 0.55, "grad_norm": 0.0480993427336216, "learning_rate": 0.0001848545822772802, "loss": 1.6976, "step": 300 }, { "epoch": 0.55, "grad_norm": 0.0487300269305706, "learning_rate": 0.00018475269444829818, "loss": 1.7642, "step": 301 }, { "epoch": 0.55, "grad_norm": 0.04805615171790123, "learning_rate": 0.0001846504933275209, "loss": 1.6666, "step": 302 }, { "epoch": 0.55, "grad_norm": 0.045554857701063156, "learning_rate": 0.00018454797929273902, "loss": 1.7259, "step": 303 }, { "epoch": 0.55, "grad_norm": 0.04570743814110756, "learning_rate": 0.00018444515272289982, "loss": 1.7067, "step": 304 }, { "epoch": 0.56, "grad_norm": 0.047652073204517365, "learning_rate": 0.00018434201399810594, "loss": 1.8147, "step": 305 }, { "epoch": 0.56, "grad_norm": 0.046781569719314575, "learning_rate": 0.00018423856349961384, "loss": 1.7509, "step": 306 }, { "epoch": 0.56, "grad_norm": 0.04698612168431282, "learning_rate": 0.00018413480160983254, "loss": 1.7074, "step": 307 }, { "epoch": 0.56, "grad_norm": 0.04796341061592102, "learning_rate": 0.0001840307287123221, "loss": 1.7444, "step": 308 }, { "epoch": 0.56, "grad_norm": 0.047553375363349915, "learning_rate": 0.00018392634519179225, "loss": 1.7103, "step": 309 }, { "epoch": 0.57, "grad_norm": 0.046323925256729126, "learning_rate": 0.00018382165143410092, "loss": 1.716, "step": 310 }, { "epoch": 0.57, "grad_norm": 0.04571986570954323, "learning_rate": 0.00018371664782625287, "loss": 1.7035, "step": 311 }, { "epoch": 0.57, "grad_norm": 0.05170504003763199, "learning_rate": 0.0001836113347563982, "loss": 1.7151, "step": 312 }, { "epoch": 0.57, "grad_norm": 0.047869808971881866, "learning_rate": 0.000183505712613831, "loss": 1.7223, "step": 313 }, { "epoch": 0.57, "grad_norm": 0.0482964813709259, "learning_rate": 0.0001833997817889878, "loss": 1.6805, "step": 314 }, { "epoch": 0.57, "grad_norm": 0.0486602708697319, "learning_rate": 0.00018329354267344625, "loss": 1.7303, "step": 315 }, { "epoch": 0.58, "grad_norm": 0.046554964035749435, "learning_rate": 0.00018318699565992357, "loss": 1.7745, "step": 316 }, { "epoch": 0.58, "grad_norm": 0.047917045652866364, "learning_rate": 0.00018308014114227513, "loss": 1.718, "step": 317 }, { "epoch": 0.58, "grad_norm": 0.0479004867374897, "learning_rate": 0.00018297297951549304, "loss": 1.7707, "step": 318 }, { "epoch": 0.58, "grad_norm": 0.04681101068854332, "learning_rate": 0.0001828655111757046, "loss": 1.7646, "step": 319 }, { "epoch": 0.58, "grad_norm": 0.05201521888375282, "learning_rate": 0.00018275773652017097, "loss": 1.7479, "step": 320 }, { "epoch": 0.59, "grad_norm": 0.04852493852376938, "learning_rate": 0.00018264965594728548, "loss": 1.7463, "step": 321 }, { "epoch": 0.59, "grad_norm": 0.046121757477521896, "learning_rate": 0.00018254126985657246, "loss": 1.7444, "step": 322 }, { "epoch": 0.59, "grad_norm": 0.05163992941379547, "learning_rate": 0.00018243257864868548, "loss": 1.7134, "step": 323 }, { "epoch": 0.59, "grad_norm": 0.06267976760864258, "learning_rate": 0.00018232358272540604, "loss": 1.6712, "step": 324 }, { "epoch": 0.59, "grad_norm": 0.04854287579655647, "learning_rate": 0.00018221428248964202, "loss": 1.6932, "step": 325 }, { "epoch": 0.59, "grad_norm": 0.046650100499391556, "learning_rate": 0.00018210467834542615, "loss": 1.768, "step": 326 }, { "epoch": 0.6, "grad_norm": 0.04779491573572159, "learning_rate": 0.00018199477069791474, "loss": 1.7109, "step": 327 }, { "epoch": 0.6, "grad_norm": 0.05170130729675293, "learning_rate": 0.0001818845599533858, "loss": 1.6926, "step": 328 }, { "epoch": 0.6, "grad_norm": 0.04867775738239288, "learning_rate": 0.00018177404651923787, "loss": 1.6908, "step": 329 }, { "epoch": 0.6, "grad_norm": 0.04707460105419159, "learning_rate": 0.00018166323080398835, "loss": 1.7461, "step": 330 }, { "epoch": 0.6, "grad_norm": 0.048908475786447525, "learning_rate": 0.00018155211321727212, "loss": 1.7214, "step": 331 }, { "epoch": 0.61, "grad_norm": 0.04802173003554344, "learning_rate": 0.00018144069416983985, "loss": 1.7528, "step": 332 }, { "epoch": 0.61, "grad_norm": 0.04747573658823967, "learning_rate": 0.00018132897407355657, "loss": 1.6726, "step": 333 }, { "epoch": 0.61, "grad_norm": 0.049620069563388824, "learning_rate": 0.00018121695334140017, "loss": 1.7215, "step": 334 }, { "epoch": 0.61, "grad_norm": 0.047733817249536514, "learning_rate": 0.00018110463238745988, "loss": 1.7538, "step": 335 }, { "epoch": 0.61, "grad_norm": 0.04856455698609352, "learning_rate": 0.00018099201162693476, "loss": 1.6833, "step": 336 }, { "epoch": 0.61, "grad_norm": 0.04885758087038994, "learning_rate": 0.00018087909147613193, "loss": 1.7141, "step": 337 }, { "epoch": 0.62, "grad_norm": 0.047947369515895844, "learning_rate": 0.0001807658723524654, "loss": 1.733, "step": 338 }, { "epoch": 0.62, "grad_norm": 0.0499010868370533, "learning_rate": 0.0001806523546744543, "loss": 1.6825, "step": 339 }, { "epoch": 0.62, "grad_norm": 0.048193834722042084, "learning_rate": 0.0001805385388617213, "loss": 1.7282, "step": 340 }, { "epoch": 0.62, "grad_norm": 0.05272866412997246, "learning_rate": 0.00018042442533499123, "loss": 1.7599, "step": 341 }, { "epoch": 0.62, "grad_norm": 0.047657158225774765, "learning_rate": 0.00018031001451608943, "loss": 1.7292, "step": 342 }, { "epoch": 0.63, "grad_norm": 0.0498197004199028, "learning_rate": 0.00018019530682794014, "loss": 1.7417, "step": 343 }, { "epoch": 0.63, "grad_norm": 0.04958554729819298, "learning_rate": 0.00018008030269456505, "loss": 1.7274, "step": 344 }, { "epoch": 0.63, "grad_norm": 0.04730832576751709, "learning_rate": 0.00017996500254108152, "loss": 1.778, "step": 345 }, { "epoch": 0.63, "grad_norm": 0.050828639417886734, "learning_rate": 0.0001798494067937014, "loss": 1.7285, "step": 346 }, { "epoch": 0.63, "grad_norm": 0.046292368322610855, "learning_rate": 0.00017973351587972905, "loss": 1.7334, "step": 347 }, { "epoch": 0.63, "grad_norm": 0.04758565500378609, "learning_rate": 0.00017961733022755992, "loss": 1.6814, "step": 348 }, { "epoch": 0.64, "grad_norm": 0.050507742911577225, "learning_rate": 0.00017950085026667903, "loss": 1.6949, "step": 349 }, { "epoch": 0.64, "grad_norm": 0.04801836982369423, "learning_rate": 0.00017938407642765938, "loss": 1.6594, "step": 350 }, { "epoch": 0.64, "grad_norm": 0.04616666957736015, "learning_rate": 0.00017926700914216016, "loss": 1.6969, "step": 351 }, { "epoch": 0.64, "grad_norm": 0.048213839530944824, "learning_rate": 0.00017914964884292544, "loss": 1.6908, "step": 352 }, { "epoch": 0.64, "grad_norm": 0.04909725859761238, "learning_rate": 0.00017903199596378227, "loss": 1.7213, "step": 353 }, { "epoch": 0.65, "grad_norm": 0.050252340734004974, "learning_rate": 0.00017891405093963938, "loss": 1.7094, "step": 354 }, { "epoch": 0.65, "grad_norm": 0.05401075631380081, "learning_rate": 0.00017879581420648534, "loss": 1.7163, "step": 355 }, { "epoch": 0.65, "grad_norm": 0.05027545616030693, "learning_rate": 0.00017867728620138708, "loss": 1.7362, "step": 356 }, { "epoch": 0.65, "grad_norm": 0.047479428350925446, "learning_rate": 0.00017855846736248822, "loss": 1.6785, "step": 357 }, { "epoch": 0.65, "grad_norm": 0.05026884377002716, "learning_rate": 0.0001784393581290074, "loss": 1.7221, "step": 358 }, { "epoch": 0.65, "grad_norm": 0.04901432618498802, "learning_rate": 0.00017831995894123683, "loss": 1.6401, "step": 359 }, { "epoch": 0.66, "grad_norm": 0.04764765873551369, "learning_rate": 0.00017820027024054044, "loss": 1.7361, "step": 360 }, { "epoch": 0.66, "grad_norm": 0.046871528029441833, "learning_rate": 0.0001780802924693524, "loss": 1.7986, "step": 361 }, { "epoch": 0.66, "grad_norm": 0.05453401803970337, "learning_rate": 0.00017796002607117545, "loss": 1.7447, "step": 362 }, { "epoch": 0.66, "grad_norm": 0.04958674684166908, "learning_rate": 0.00017783947149057925, "loss": 1.7091, "step": 363 }, { "epoch": 0.66, "grad_norm": 0.053141675889492035, "learning_rate": 0.0001777186291731987, "loss": 1.6866, "step": 364 }, { "epoch": 0.67, "grad_norm": 0.047340743243694305, "learning_rate": 0.00017759749956573238, "loss": 1.7191, "step": 365 }, { "epoch": 0.67, "grad_norm": 0.051203418523073196, "learning_rate": 0.00017747608311594087, "loss": 1.7238, "step": 366 }, { "epoch": 0.67, "grad_norm": 0.047188933938741684, "learning_rate": 0.00017735438027264495, "loss": 1.762, "step": 367 }, { "epoch": 0.67, "grad_norm": 0.056479763239622116, "learning_rate": 0.00017723239148572422, "loss": 1.6587, "step": 368 }, { "epoch": 0.67, "grad_norm": 0.04922572523355484, "learning_rate": 0.00017711011720611514, "loss": 1.6988, "step": 369 }, { "epoch": 0.67, "grad_norm": 0.046839334070682526, "learning_rate": 0.00017698755788580963, "loss": 1.7092, "step": 370 }, { "epoch": 0.68, "grad_norm": 0.0491393506526947, "learning_rate": 0.0001768647139778532, "loss": 1.7313, "step": 371 }, { "epoch": 0.68, "grad_norm": 0.04811710864305496, "learning_rate": 0.0001767415859363434, "loss": 1.8071, "step": 372 }, { "epoch": 0.68, "grad_norm": 0.04601633548736572, "learning_rate": 0.00017661817421642804, "loss": 1.7594, "step": 373 }, { "epoch": 0.68, "grad_norm": 0.05098440870642662, "learning_rate": 0.00017649447927430362, "loss": 1.6524, "step": 374 }, { "epoch": 0.68, "grad_norm": 0.04978582262992859, "learning_rate": 0.00017637050156721346, "loss": 1.7448, "step": 375 }, { "epoch": 0.69, "grad_norm": 0.05097389221191406, "learning_rate": 0.00017624624155344626, "loss": 1.7362, "step": 376 }, { "epoch": 0.69, "grad_norm": 0.05258944630622864, "learning_rate": 0.00017612169969233424, "loss": 1.7033, "step": 377 }, { "epoch": 0.69, "grad_norm": 0.05384654179215431, "learning_rate": 0.0001759968764442515, "loss": 1.6349, "step": 378 }, { "epoch": 0.69, "grad_norm": 0.047803860157728195, "learning_rate": 0.00017587177227061226, "loss": 1.6655, "step": 379 }, { "epoch": 0.69, "grad_norm": 0.04812454432249069, "learning_rate": 0.00017574638763386916, "loss": 1.7064, "step": 380 }, { "epoch": 0.69, "grad_norm": 0.04860275238752365, "learning_rate": 0.00017562072299751163, "loss": 1.6648, "step": 381 }, { "epoch": 0.7, "grad_norm": 0.049836620688438416, "learning_rate": 0.00017549477882606418, "loss": 1.6957, "step": 382 }, { "epoch": 0.7, "grad_norm": 0.05114325135946274, "learning_rate": 0.00017536855558508458, "loss": 1.6257, "step": 383 }, { "epoch": 0.7, "grad_norm": 0.054609425365924835, "learning_rate": 0.00017524205374116214, "loss": 1.6854, "step": 384 }, { "epoch": 0.7, "grad_norm": 0.04757620766758919, "learning_rate": 0.00017511527376191618, "loss": 1.7425, "step": 385 }, { "epoch": 0.7, "grad_norm": 0.05384545028209686, "learning_rate": 0.00017498821611599397, "loss": 1.712, "step": 386 }, { "epoch": 0.71, "grad_norm": 0.04726232588291168, "learning_rate": 0.00017486088127306932, "loss": 1.701, "step": 387 }, { "epoch": 0.71, "grad_norm": 0.04885297268629074, "learning_rate": 0.0001747332697038407, "loss": 1.7227, "step": 388 }, { "epoch": 0.71, "grad_norm": 0.04793693870306015, "learning_rate": 0.00017460538188002946, "loss": 1.7058, "step": 389 }, { "epoch": 0.71, "grad_norm": 0.04942973330616951, "learning_rate": 0.0001744772182743782, "loss": 1.7443, "step": 390 }, { "epoch": 0.71, "grad_norm": 0.05246872082352638, "learning_rate": 0.00017434877936064886, "loss": 1.6807, "step": 391 }, { "epoch": 0.71, "grad_norm": 0.04894121363759041, "learning_rate": 0.0001742200656136212, "loss": 1.7963, "step": 392 }, { "epoch": 0.72, "grad_norm": 0.05082324892282486, "learning_rate": 0.00017409107750909078, "loss": 1.7024, "step": 393 }, { "epoch": 0.72, "grad_norm": 0.04718152433633804, "learning_rate": 0.00017396181552386741, "loss": 1.711, "step": 394 }, { "epoch": 0.72, "grad_norm": 0.05174902826547623, "learning_rate": 0.00017383228013577331, "loss": 1.7362, "step": 395 }, { "epoch": 0.72, "grad_norm": 0.048003047704696655, "learning_rate": 0.0001737024718236413, "loss": 1.6944, "step": 396 }, { "epoch": 0.72, "grad_norm": 0.0462164506316185, "learning_rate": 0.00017357239106731317, "loss": 1.7297, "step": 397 }, { "epoch": 0.73, "grad_norm": 0.04808316007256508, "learning_rate": 0.0001734420383476377, "loss": 1.6971, "step": 398 }, { "epoch": 0.73, "grad_norm": 0.05553476884961128, "learning_rate": 0.00017331141414646904, "loss": 1.7262, "step": 399 }, { "epoch": 0.73, "grad_norm": 0.046341411769390106, "learning_rate": 0.00017318051894666487, "loss": 1.7135, "step": 400 }, { "epoch": 0.73, "grad_norm": 0.048155754804611206, "learning_rate": 0.00017304935323208466, "loss": 1.7377, "step": 401 }, { "epoch": 0.73, "grad_norm": 0.05066389963030815, "learning_rate": 0.00017291791748758785, "loss": 1.6516, "step": 402 }, { "epoch": 0.73, "grad_norm": 0.05046610161662102, "learning_rate": 0.000172786212199032, "loss": 1.7536, "step": 403 }, { "epoch": 0.74, "grad_norm": 0.0542440302670002, "learning_rate": 0.00017265423785327107, "loss": 1.7857, "step": 404 }, { "epoch": 0.74, "grad_norm": 0.04833053797483444, "learning_rate": 0.0001725219949381537, "loss": 1.7594, "step": 405 }, { "epoch": 0.74, "grad_norm": 0.047335654497146606, "learning_rate": 0.00017238948394252115, "loss": 1.7495, "step": 406 }, { "epoch": 0.74, "grad_norm": 0.04961543157696724, "learning_rate": 0.00017225670535620576, "loss": 1.7201, "step": 407 }, { "epoch": 0.74, "grad_norm": 0.04761854186654091, "learning_rate": 0.00017212365967002893, "loss": 1.7522, "step": 408 }, { "epoch": 0.75, "grad_norm": 0.05010442063212395, "learning_rate": 0.0001719903473757996, "loss": 1.7535, "step": 409 }, { "epoch": 0.75, "grad_norm": 0.049323149025440216, "learning_rate": 0.000171856768966312, "loss": 1.6984, "step": 410 }, { "epoch": 0.75, "grad_norm": 0.08661342412233353, "learning_rate": 0.0001717229249353442, "loss": 1.7182, "step": 411 }, { "epoch": 0.75, "eval_loss": 1.724851131439209, "eval_runtime": 76.3068, "eval_samples_per_second": 65.525, "eval_steps_per_second": 16.381, "step": 411 }, { "epoch": 0.75, "grad_norm": 0.05118868127465248, "learning_rate": 0.00017158881577765612, "loss": 1.683, "step": 412 }, { "epoch": 0.75, "grad_norm": 0.053089968860149384, "learning_rate": 0.00017145444198898776, "loss": 1.7162, "step": 413 }, { "epoch": 0.75, "grad_norm": 0.05191902816295624, "learning_rate": 0.0001713198040660573, "loss": 1.7223, "step": 414 }, { "epoch": 0.76, "grad_norm": 0.05995416268706322, "learning_rate": 0.00017118490250655932, "loss": 1.7148, "step": 415 }, { "epoch": 0.76, "grad_norm": 0.04749016463756561, "learning_rate": 0.00017104973780916294, "loss": 1.7364, "step": 416 }, { "epoch": 0.76, "grad_norm": 0.047870930284261703, "learning_rate": 0.00017091431047351, "loss": 1.7607, "step": 417 }, { "epoch": 0.76, "grad_norm": 0.04802364483475685, "learning_rate": 0.00017077862100021318, "loss": 1.6957, "step": 418 }, { "epoch": 0.76, "grad_norm": 0.04796374961733818, "learning_rate": 0.00017064266989085412, "loss": 1.6972, "step": 419 }, { "epoch": 0.77, "grad_norm": 0.048874564468860626, "learning_rate": 0.00017050645764798164, "loss": 1.736, "step": 420 }, { "epoch": 0.77, "grad_norm": 0.052477337419986725, "learning_rate": 0.00017036998477510992, "loss": 1.7447, "step": 421 }, { "epoch": 0.77, "grad_norm": 0.049993280321359634, "learning_rate": 0.00017023325177671647, "loss": 1.7635, "step": 422 }, { "epoch": 0.77, "grad_norm": 0.09700744599103928, "learning_rate": 0.00017009625915824037, "loss": 1.7402, "step": 423 }, { "epoch": 0.77, "grad_norm": 0.048865802586078644, "learning_rate": 0.0001699590074260805, "loss": 1.7229, "step": 424 }, { "epoch": 0.77, "grad_norm": 0.04994821920990944, "learning_rate": 0.00016982149708759343, "loss": 1.672, "step": 425 }, { "epoch": 0.78, "grad_norm": 0.05008814111351967, "learning_rate": 0.00016968372865109176, "loss": 1.7338, "step": 426 }, { "epoch": 0.78, "grad_norm": 0.04830687865614891, "learning_rate": 0.00016954570262584214, "loss": 1.7177, "step": 427 }, { "epoch": 0.78, "grad_norm": 0.04781452193856239, "learning_rate": 0.0001694074195220634, "loss": 1.7628, "step": 428 }, { "epoch": 0.78, "grad_norm": 0.04739667847752571, "learning_rate": 0.00016926887985092468, "loss": 1.7107, "step": 429 }, { "epoch": 0.78, "grad_norm": 0.0481286458671093, "learning_rate": 0.00016913008412454357, "loss": 1.7646, "step": 430 }, { "epoch": 0.79, "grad_norm": 0.06283537298440933, "learning_rate": 0.0001689910328559841, "loss": 1.6896, "step": 431 }, { "epoch": 0.79, "grad_norm": 0.04944480583071709, "learning_rate": 0.00016885172655925495, "loss": 1.6931, "step": 432 }, { "epoch": 0.79, "grad_norm": 0.05051645264029503, "learning_rate": 0.00016871216574930754, "loss": 1.7752, "step": 433 }, { "epoch": 0.79, "grad_norm": 0.05406402051448822, "learning_rate": 0.0001685723509420341, "loss": 1.7203, "step": 434 }, { "epoch": 0.79, "grad_norm": 0.0995137020945549, "learning_rate": 0.00016843228265426584, "loss": 1.6454, "step": 435 }, { "epoch": 0.79, "grad_norm": 0.05356389284133911, "learning_rate": 0.00016829196140377085, "loss": 1.7327, "step": 436 }, { "epoch": 0.8, "grad_norm": 0.04902141913771629, "learning_rate": 0.0001681513877092523, "loss": 1.7262, "step": 437 }, { "epoch": 0.8, "grad_norm": 0.047820378094911575, "learning_rate": 0.00016801056209034672, "loss": 1.7294, "step": 438 }, { "epoch": 0.8, "grad_norm": 0.048359643667936325, "learning_rate": 0.00016786948506762164, "loss": 1.6959, "step": 439 }, { "epoch": 0.8, "grad_norm": 0.04830753803253174, "learning_rate": 0.00016772815716257412, "loss": 1.7714, "step": 440 }, { "epoch": 0.8, "grad_norm": 0.05318046733736992, "learning_rate": 0.0001675865788976285, "loss": 1.7325, "step": 441 }, { "epoch": 0.81, "grad_norm": 0.04992082715034485, "learning_rate": 0.0001674447507961346, "loss": 1.7866, "step": 442 }, { "epoch": 0.81, "grad_norm": 0.05253741890192032, "learning_rate": 0.0001673026733823658, "loss": 1.7273, "step": 443 }, { "epoch": 0.81, "grad_norm": 0.05121272802352905, "learning_rate": 0.00016716034718151706, "loss": 1.7063, "step": 444 }, { "epoch": 0.81, "grad_norm": 0.04715156927704811, "learning_rate": 0.000167017772719703, "loss": 1.7575, "step": 445 }, { "epoch": 0.81, "grad_norm": 0.05717930197715759, "learning_rate": 0.00016687495052395595, "loss": 1.7835, "step": 446 }, { "epoch": 0.81, "grad_norm": 0.04992460459470749, "learning_rate": 0.00016673188112222394, "loss": 1.7218, "step": 447 }, { "epoch": 0.82, "grad_norm": 0.0481155663728714, "learning_rate": 0.0001665885650433689, "loss": 1.7269, "step": 448 }, { "epoch": 0.82, "grad_norm": 0.0485762394964695, "learning_rate": 0.00016644500281716456, "loss": 1.6857, "step": 449 }, { "epoch": 0.82, "grad_norm": 0.04729575663805008, "learning_rate": 0.00016630119497429457, "loss": 1.7208, "step": 450 }, { "epoch": 0.82, "grad_norm": 0.051819782704114914, "learning_rate": 0.00016615714204635043, "loss": 1.7117, "step": 451 }, { "epoch": 0.82, "grad_norm": 0.052782051265239716, "learning_rate": 0.0001660128445658297, "loss": 1.7811, "step": 452 }, { "epoch": 0.83, "grad_norm": 0.05251288414001465, "learning_rate": 0.00016586830306613393, "loss": 1.7517, "step": 453 }, { "epoch": 0.83, "grad_norm": 0.047806352376937866, "learning_rate": 0.00016572351808156666, "loss": 1.7132, "step": 454 }, { "epoch": 0.83, "grad_norm": 0.05114049091935158, "learning_rate": 0.0001655784901473315, "loss": 1.7729, "step": 455 }, { "epoch": 0.83, "grad_norm": 0.04811178147792816, "learning_rate": 0.00016543321979953007, "loss": 1.7855, "step": 456 }, { "epoch": 0.83, "grad_norm": 0.05107167363166809, "learning_rate": 0.00016528770757516027, "loss": 1.7331, "step": 457 }, { "epoch": 0.84, "grad_norm": 0.04712466895580292, "learning_rate": 0.00016514195401211388, "loss": 1.7048, "step": 458 }, { "epoch": 0.84, "grad_norm": 0.05438878387212753, "learning_rate": 0.0001649959596491749, "loss": 1.753, "step": 459 }, { "epoch": 0.84, "grad_norm": 0.04884348064661026, "learning_rate": 0.00016484972502601753, "loss": 1.6734, "step": 460 }, { "epoch": 0.84, "grad_norm": 0.0536276139318943, "learning_rate": 0.00016470325068320392, "loss": 1.711, "step": 461 }, { "epoch": 0.84, "grad_norm": 0.05346493422985077, "learning_rate": 0.00016455653716218252, "loss": 1.7366, "step": 462 }, { "epoch": 0.84, "grad_norm": 0.05044522508978844, "learning_rate": 0.0001644095850052858, "loss": 1.7269, "step": 463 }, { "epoch": 0.85, "grad_norm": 0.05273488536477089, "learning_rate": 0.00016426239475572852, "loss": 1.7586, "step": 464 }, { "epoch": 0.85, "grad_norm": 0.053452517837285995, "learning_rate": 0.0001641149669576053, "loss": 1.7379, "step": 465 }, { "epoch": 0.85, "grad_norm": 0.047611016780138016, "learning_rate": 0.00016396730215588915, "loss": 1.7471, "step": 466 }, { "epoch": 0.85, "grad_norm": 0.05317235738039017, "learning_rate": 0.00016381940089642893, "loss": 1.6925, "step": 467 }, { "epoch": 0.85, "grad_norm": 0.049223560839891434, "learning_rate": 0.00016367126372594774, "loss": 1.7229, "step": 468 }, { "epoch": 0.86, "grad_norm": 0.047821756452322006, "learning_rate": 0.0001635228911920407, "loss": 1.7484, "step": 469 }, { "epoch": 0.86, "grad_norm": 0.05013042315840721, "learning_rate": 0.00016337428384317288, "loss": 1.7435, "step": 470 }, { "epoch": 0.86, "grad_norm": 0.04820725694298744, "learning_rate": 0.00016322544222867742, "loss": 1.7594, "step": 471 }, { "epoch": 0.86, "grad_norm": 0.04791193827986717, "learning_rate": 0.00016307636689875347, "loss": 1.644, "step": 472 }, { "epoch": 0.86, "grad_norm": 0.04905365779995918, "learning_rate": 0.00016292705840446404, "loss": 1.7144, "step": 473 }, { "epoch": 0.86, "grad_norm": 0.04875028133392334, "learning_rate": 0.00016277751729773407, "loss": 1.712, "step": 474 }, { "epoch": 0.87, "grad_norm": 0.05170164629817009, "learning_rate": 0.0001626277441313484, "loss": 1.7367, "step": 475 }, { "epoch": 0.87, "grad_norm": 0.05205371975898743, "learning_rate": 0.00016247773945894962, "loss": 1.689, "step": 476 }, { "epoch": 0.87, "grad_norm": 0.0485403798520565, "learning_rate": 0.00016232750383503617, "loss": 1.706, "step": 477 }, { "epoch": 0.87, "grad_norm": 0.0538201630115509, "learning_rate": 0.0001621770378149601, "loss": 1.7284, "step": 478 }, { "epoch": 0.87, "grad_norm": 0.04828377440571785, "learning_rate": 0.00016202634195492524, "loss": 1.661, "step": 479 }, { "epoch": 0.88, "grad_norm": 0.050310611724853516, "learning_rate": 0.000161875416811985, "loss": 1.6852, "step": 480 }, { "epoch": 0.88, "grad_norm": 0.050804853439331055, "learning_rate": 0.00016172426294404032, "loss": 1.7358, "step": 481 }, { "epoch": 0.88, "grad_norm": 0.051962971687316895, "learning_rate": 0.00016157288090983763, "loss": 1.6692, "step": 482 }, { "epoch": 0.88, "grad_norm": 0.05179814621806145, "learning_rate": 0.0001614212712689668, "loss": 1.6983, "step": 483 }, { "epoch": 0.88, "grad_norm": 0.05398216098546982, "learning_rate": 0.00016126943458185907, "loss": 1.7261, "step": 484 }, { "epoch": 0.88, "grad_norm": 0.049869704991579056, "learning_rate": 0.00016111737140978494, "loss": 1.6951, "step": 485 }, { "epoch": 0.89, "grad_norm": 0.048107776790857315, "learning_rate": 0.00016096508231485217, "loss": 1.6941, "step": 486 }, { "epoch": 0.89, "grad_norm": 0.05527656897902489, "learning_rate": 0.00016081256786000357, "loss": 1.7054, "step": 487 }, { "epoch": 0.89, "grad_norm": 0.05169270187616348, "learning_rate": 0.00016065982860901504, "loss": 1.7307, "step": 488 }, { "epoch": 0.89, "grad_norm": 0.04972197115421295, "learning_rate": 0.00016050686512649354, "loss": 1.6955, "step": 489 }, { "epoch": 0.89, "grad_norm": 0.05033208429813385, "learning_rate": 0.00016035367797787476, "loss": 1.7013, "step": 490 }, { "epoch": 0.9, "grad_norm": 0.05073223263025284, "learning_rate": 0.00016020026772942125, "loss": 1.6831, "step": 491 }, { "epoch": 0.9, "grad_norm": 0.056367356330156326, "learning_rate": 0.00016004663494822028, "loss": 1.6654, "step": 492 }, { "epoch": 0.9, "grad_norm": 0.049483008682727814, "learning_rate": 0.0001598927802021817, "loss": 1.7285, "step": 493 }, { "epoch": 0.9, "grad_norm": 0.052070703357458115, "learning_rate": 0.00015973870406003578, "loss": 1.7948, "step": 494 }, { "epoch": 0.9, "grad_norm": 0.05687413364648819, "learning_rate": 0.0001595844070913314, "loss": 1.7336, "step": 495 }, { "epoch": 0.9, "grad_norm": 0.048987727612257004, "learning_rate": 0.00015942988986643352, "loss": 1.6661, "step": 496 }, { "epoch": 0.91, "grad_norm": 0.05027730017900467, "learning_rate": 0.00015927515295652143, "loss": 1.7364, "step": 497 }, { "epoch": 0.91, "grad_norm": 0.048406291753053665, "learning_rate": 0.00015912019693358636, "loss": 1.6419, "step": 498 }, { "epoch": 0.91, "grad_norm": 0.05071192979812622, "learning_rate": 0.00015896502237042963, "loss": 1.6301, "step": 499 }, { "epoch": 0.91, "grad_norm": 0.05111885070800781, "learning_rate": 0.00015880962984066036, "loss": 1.7112, "step": 500 }, { "epoch": 0.91, "grad_norm": 0.06297910958528519, "learning_rate": 0.0001586540199186933, "loss": 1.7438, "step": 501 }, { "epoch": 0.92, "grad_norm": 0.04950469359755516, "learning_rate": 0.00015849819317974694, "loss": 1.6837, "step": 502 }, { "epoch": 0.92, "grad_norm": 0.04900701716542244, "learning_rate": 0.0001583421501998412, "loss": 1.7432, "step": 503 }, { "epoch": 0.92, "grad_norm": 0.04949019104242325, "learning_rate": 0.0001581858915557953, "loss": 1.688, "step": 504 }, { "epoch": 0.92, "grad_norm": 0.05047097057104111, "learning_rate": 0.00015802941782522569, "loss": 1.7256, "step": 505 }, { "epoch": 0.92, "grad_norm": 0.04921870306134224, "learning_rate": 0.0001578727295865439, "loss": 1.7723, "step": 506 }, { "epoch": 0.92, "grad_norm": 0.04841122031211853, "learning_rate": 0.0001577158274189544, "loss": 1.71, "step": 507 }, { "epoch": 0.93, "grad_norm": 0.04886234924197197, "learning_rate": 0.00015755871190245251, "loss": 1.6622, "step": 508 }, { "epoch": 0.93, "grad_norm": 0.04966573417186737, "learning_rate": 0.00015740138361782207, "loss": 1.7357, "step": 509 }, { "epoch": 0.93, "grad_norm": 0.050070296972990036, "learning_rate": 0.0001572438431466336, "loss": 1.6803, "step": 510 }, { "epoch": 0.93, "grad_norm": 0.054121073335409164, "learning_rate": 0.00015708609107124177, "loss": 1.7659, "step": 511 }, { "epoch": 0.93, "grad_norm": 0.05084529519081116, "learning_rate": 0.00015692812797478368, "loss": 1.6943, "step": 512 }, { "epoch": 0.94, "grad_norm": 0.056926507502794266, "learning_rate": 0.0001567699544411763, "loss": 1.6562, "step": 513 }, { "epoch": 0.94, "grad_norm": 0.05053721368312836, "learning_rate": 0.00015661157105511457, "loss": 1.7624, "step": 514 }, { "epoch": 0.94, "grad_norm": 0.048727016896009445, "learning_rate": 0.00015645297840206915, "loss": 1.7364, "step": 515 }, { "epoch": 0.94, "grad_norm": 0.051376283168792725, "learning_rate": 0.00015629417706828423, "loss": 1.699, "step": 516 }, { "epoch": 0.94, "grad_norm": 0.05029591917991638, "learning_rate": 0.00015613516764077548, "loss": 1.6972, "step": 517 }, { "epoch": 0.94, "grad_norm": 0.053968969732522964, "learning_rate": 0.00015597595070732765, "loss": 1.7128, "step": 518 }, { "epoch": 0.95, "grad_norm": 0.050694871693849564, "learning_rate": 0.00015581652685649276, "loss": 1.7681, "step": 519 }, { "epoch": 0.95, "grad_norm": 0.052369993180036545, "learning_rate": 0.00015565689667758746, "loss": 1.7321, "step": 520 }, { "epoch": 0.95, "grad_norm": 0.04850650206208229, "learning_rate": 0.00015549706076069128, "loss": 1.7162, "step": 521 }, { "epoch": 0.95, "grad_norm": 0.04979635775089264, "learning_rate": 0.00015533701969664424, "loss": 1.7429, "step": 522 }, { "epoch": 0.95, "grad_norm": 0.04920853301882744, "learning_rate": 0.0001551767740770446, "loss": 1.7103, "step": 523 }, { "epoch": 0.96, "grad_norm": 0.05081456899642944, "learning_rate": 0.0001550163244942469, "loss": 1.7781, "step": 524 }, { "epoch": 0.96, "grad_norm": 0.050754062831401825, "learning_rate": 0.00015485567154135952, "loss": 1.7496, "step": 525 }, { "epoch": 0.96, "grad_norm": 0.050315603613853455, "learning_rate": 0.00015469481581224272, "loss": 1.7303, "step": 526 }, { "epoch": 0.96, "grad_norm": 0.05050061643123627, "learning_rate": 0.00015453375790150617, "loss": 1.679, "step": 527 }, { "epoch": 0.96, "grad_norm": 0.06212810054421425, "learning_rate": 0.00015437249840450715, "loss": 1.713, "step": 528 }, { "epoch": 0.96, "grad_norm": 0.050966355949640274, "learning_rate": 0.00015421103791734786, "loss": 1.7551, "step": 529 }, { "epoch": 0.97, "grad_norm": 0.04892159253358841, "learning_rate": 0.00015404937703687363, "loss": 1.6758, "step": 530 }, { "epoch": 0.97, "grad_norm": 0.05551762133836746, "learning_rate": 0.00015388751636067052, "loss": 1.703, "step": 531 }, { "epoch": 0.97, "grad_norm": 0.0516047477722168, "learning_rate": 0.00015372545648706306, "loss": 1.7407, "step": 532 }, { "epoch": 0.97, "grad_norm": 0.05094458907842636, "learning_rate": 0.0001535631980151123, "loss": 1.6534, "step": 533 }, { "epoch": 0.97, "grad_norm": 0.05045678839087486, "learning_rate": 0.00015340074154461316, "loss": 1.7335, "step": 534 }, { "epoch": 0.98, "grad_norm": 0.05067756026983261, "learning_rate": 0.00015323808767609277, "loss": 1.7169, "step": 535 }, { "epoch": 0.98, "grad_norm": 0.05005278438329697, "learning_rate": 0.00015307523701080768, "loss": 1.7778, "step": 536 }, { "epoch": 0.98, "grad_norm": 0.04952746629714966, "learning_rate": 0.0001529121901507421, "loss": 1.7199, "step": 537 }, { "epoch": 0.98, "grad_norm": 0.04711218178272247, "learning_rate": 0.00015274894769860538, "loss": 1.734, "step": 538 }, { "epoch": 0.98, "grad_norm": 0.05313078686594963, "learning_rate": 0.0001525855102578299, "loss": 1.7733, "step": 539 }, { "epoch": 0.98, "grad_norm": 0.04977120831608772, "learning_rate": 0.0001524218784325688, "loss": 1.731, "step": 540 }, { "epoch": 0.99, "grad_norm": 0.05076899752020836, "learning_rate": 0.00015225805282769383, "loss": 1.7277, "step": 541 }, { "epoch": 0.99, "grad_norm": 0.049164701253175735, "learning_rate": 0.00015209403404879303, "loss": 1.7032, "step": 542 }, { "epoch": 0.99, "grad_norm": 0.0488349013030529, "learning_rate": 0.00015192982270216854, "loss": 1.765, "step": 543 }, { "epoch": 0.99, "grad_norm": 0.04831582307815552, "learning_rate": 0.0001517654193948343, "loss": 1.7548, "step": 544 }, { "epoch": 0.99, "grad_norm": 0.052940741181373596, "learning_rate": 0.00015160082473451378, "loss": 1.7209, "step": 545 }, { "epoch": 1.0, "grad_norm": 0.056908875703811646, "learning_rate": 0.00015143603932963795, "loss": 1.6537, "step": 546 }, { "epoch": 1.0, "grad_norm": 0.0509711354970932, "learning_rate": 0.00015127106378934273, "loss": 1.7151, "step": 547 }, { "epoch": 1.0, "grad_norm": 0.04795239865779877, "learning_rate": 0.000151105898723467, "loss": 1.743, "step": 548 }, { "epoch": 1.0, "eval_loss": 1.7236659526824951, "eval_runtime": 76.6784, "eval_samples_per_second": 65.207, "eval_steps_per_second": 16.302, "step": 548 }, { "epoch": 1.0, "grad_norm": 0.05828290060162544, "learning_rate": 0.00015094054474255007, "loss": 1.7014, "step": 549 }, { "epoch": 1.0, "grad_norm": 0.04827438294887543, "learning_rate": 0.00015077500245782978, "loss": 1.7124, "step": 550 }, { "epoch": 1.0, "grad_norm": 0.04962700232863426, "learning_rate": 0.0001506092724812399, "loss": 1.7496, "step": 551 }, { "epoch": 1.01, "grad_norm": 0.05015181377530098, "learning_rate": 0.00015044335542540804, "loss": 1.6653, "step": 552 }, { "epoch": 1.01, "grad_norm": 0.07125337421894073, "learning_rate": 0.0001502772519036534, "loss": 1.6938, "step": 553 }, { "epoch": 1.01, "grad_norm": 0.05031266435980797, "learning_rate": 0.0001501109625299844, "loss": 1.7782, "step": 554 }, { "epoch": 1.01, "grad_norm": 0.0487028993666172, "learning_rate": 0.00014994448791909656, "loss": 1.7202, "step": 555 }, { "epoch": 1.0, "grad_norm": 0.06726840883493423, "learning_rate": 0.00014977782868636999, "loss": 1.7504, "step": 556 }, { "epoch": 1.0, "grad_norm": 0.06244590878486633, "learning_rate": 0.00014961098544786743, "loss": 1.6834, "step": 557 }, { "epoch": 1.01, "grad_norm": 0.04934772849082947, "learning_rate": 0.00014944395882033167, "loss": 1.6822, "step": 558 }, { "epoch": 1.01, "grad_norm": 0.050311822444200516, "learning_rate": 0.00014927674942118345, "loss": 1.747, "step": 559 }, { "epoch": 1.01, "grad_norm": 0.051862068474292755, "learning_rate": 0.00014910935786851919, "loss": 1.7355, "step": 560 }, { "epoch": 1.01, "grad_norm": 0.049238841980695724, "learning_rate": 0.00014894178478110857, "loss": 1.6973, "step": 561 }, { "epoch": 1.01, "grad_norm": 0.05033009499311447, "learning_rate": 0.00014877403077839235, "loss": 1.6718, "step": 562 }, { "epoch": 1.01, "grad_norm": 0.04922296851873398, "learning_rate": 0.00014860609648048004, "loss": 1.7236, "step": 563 }, { "epoch": 1.02, "grad_norm": 0.05257139354944229, "learning_rate": 0.0001484379825081476, "loss": 1.6868, "step": 564 }, { "epoch": 1.02, "grad_norm": 0.05213212966918945, "learning_rate": 0.0001482696894828353, "loss": 1.726, "step": 565 }, { "epoch": 1.02, "grad_norm": 0.053737424314022064, "learning_rate": 0.00014810121802664512, "loss": 1.7046, "step": 566 }, { "epoch": 1.02, "grad_norm": 0.054125770926475525, "learning_rate": 0.0001479325687623386, "loss": 1.6106, "step": 567 }, { "epoch": 1.02, "grad_norm": 0.051876723766326904, "learning_rate": 0.00014776374231333477, "loss": 1.7354, "step": 568 }, { "epoch": 1.03, "grad_norm": 0.050595056265592575, "learning_rate": 0.00014759473930370736, "loss": 1.6947, "step": 569 }, { "epoch": 1.03, "grad_norm": 0.06360866129398346, "learning_rate": 0.00014742556035818297, "loss": 1.7379, "step": 570 }, { "epoch": 1.03, "grad_norm": 0.05476611480116844, "learning_rate": 0.0001472562061021385, "loss": 1.6392, "step": 571 }, { "epoch": 1.03, "grad_norm": 0.051338374614715576, "learning_rate": 0.0001470866771615988, "loss": 1.687, "step": 572 }, { "epoch": 1.03, "grad_norm": 0.05180288851261139, "learning_rate": 0.00014691697416323454, "loss": 1.6942, "step": 573 }, { "epoch": 1.03, "grad_norm": 0.05175211653113365, "learning_rate": 0.00014674709773435983, "loss": 1.6648, "step": 574 }, { "epoch": 1.04, "grad_norm": 0.055275119841098785, "learning_rate": 0.00014657704850292976, "loss": 1.7311, "step": 575 }, { "epoch": 1.04, "grad_norm": 0.053508460521698, "learning_rate": 0.00014640682709753832, "loss": 1.7118, "step": 576 }, { "epoch": 1.04, "grad_norm": 0.05283378064632416, "learning_rate": 0.00014623643414741585, "loss": 1.6675, "step": 577 }, { "epoch": 1.04, "grad_norm": 0.05684136226773262, "learning_rate": 0.00014606587028242682, "loss": 1.709, "step": 578 }, { "epoch": 1.04, "grad_norm": 0.0515415295958519, "learning_rate": 0.0001458951361330676, "loss": 1.653, "step": 579 }, { "epoch": 1.05, "grad_norm": 0.052131347358226776, "learning_rate": 0.00014572423233046386, "loss": 1.6497, "step": 580 }, { "epoch": 1.05, "grad_norm": 0.05229787901043892, "learning_rate": 0.00014555315950636854, "loss": 1.6209, "step": 581 }, { "epoch": 1.05, "grad_norm": 0.058796849101781845, "learning_rate": 0.00014538191829315927, "loss": 1.6907, "step": 582 }, { "epoch": 1.05, "grad_norm": 0.0535275973379612, "learning_rate": 0.00014521050932383625, "loss": 1.6765, "step": 583 }, { "epoch": 1.05, "grad_norm": 0.06131954491138458, "learning_rate": 0.00014503893323201966, "loss": 1.6963, "step": 584 }, { "epoch": 1.05, "grad_norm": 0.05318441987037659, "learning_rate": 0.00014486719065194757, "loss": 1.6693, "step": 585 }, { "epoch": 1.06, "grad_norm": 0.053547151386737823, "learning_rate": 0.00014469528221847344, "loss": 1.6265, "step": 586 }, { "epoch": 1.06, "grad_norm": 0.05694759264588356, "learning_rate": 0.00014452320856706382, "loss": 1.6998, "step": 587 }, { "epoch": 1.06, "grad_norm": 0.053848620504140854, "learning_rate": 0.00014435097033379596, "loss": 1.7248, "step": 588 }, { "epoch": 1.06, "grad_norm": 0.05272265151143074, "learning_rate": 0.00014417856815535554, "loss": 1.6973, "step": 589 }, { "epoch": 1.06, "grad_norm": 0.05548195540904999, "learning_rate": 0.00014400600266903423, "loss": 1.6912, "step": 590 }, { "epoch": 1.07, "grad_norm": 0.05391455814242363, "learning_rate": 0.00014383327451272744, "loss": 1.6507, "step": 591 }, { "epoch": 1.07, "grad_norm": 0.05697217211127281, "learning_rate": 0.00014366038432493181, "loss": 1.7277, "step": 592 }, { "epoch": 1.07, "grad_norm": 0.054713811725378036, "learning_rate": 0.000143487332744743, "loss": 1.7225, "step": 593 }, { "epoch": 1.07, "grad_norm": 0.05515265092253685, "learning_rate": 0.00014331412041185322, "loss": 1.6838, "step": 594 }, { "epoch": 1.07, "grad_norm": 0.054941218346357346, "learning_rate": 0.00014314074796654896, "loss": 1.6913, "step": 595 }, { "epoch": 1.07, "grad_norm": 0.05448353663086891, "learning_rate": 0.0001429672160497085, "loss": 1.6685, "step": 596 }, { "epoch": 1.08, "grad_norm": 0.058499112725257874, "learning_rate": 0.0001427935253027997, "loss": 1.6637, "step": 597 }, { "epoch": 1.08, "grad_norm": 0.0628763735294342, "learning_rate": 0.00014261967636787747, "loss": 1.7139, "step": 598 }, { "epoch": 1.08, "grad_norm": 0.05447819083929062, "learning_rate": 0.00014244566988758152, "loss": 1.6984, "step": 599 }, { "epoch": 1.08, "grad_norm": 0.05434316396713257, "learning_rate": 0.0001422715065051339, "loss": 1.6688, "step": 600 }, { "epoch": 1.08, "grad_norm": 0.052557747811079025, "learning_rate": 0.00014209718686433663, "loss": 1.7169, "step": 601 }, { "epoch": 1.09, "grad_norm": 0.054510824382305145, "learning_rate": 0.00014192271160956942, "loss": 1.6186, "step": 602 }, { "epoch": 1.09, "grad_norm": 0.0586363822221756, "learning_rate": 0.00014174808138578713, "loss": 1.7364, "step": 603 }, { "epoch": 1.09, "grad_norm": 0.05653434619307518, "learning_rate": 0.0001415732968385176, "loss": 1.77, "step": 604 }, { "epoch": 1.09, "grad_norm": 0.052821431308984756, "learning_rate": 0.00014139835861385892, "loss": 1.6599, "step": 605 }, { "epoch": 1.09, "grad_norm": 0.054437246173620224, "learning_rate": 0.00014122326735847748, "loss": 1.7026, "step": 606 }, { "epoch": 1.09, "grad_norm": 0.056837234646081924, "learning_rate": 0.00014104802371960523, "loss": 1.6475, "step": 607 }, { "epoch": 1.1, "grad_norm": 0.06032341718673706, "learning_rate": 0.0001408726283450374, "loss": 1.7482, "step": 608 }, { "epoch": 1.1, "grad_norm": 0.05582507699728012, "learning_rate": 0.00014069708188313017, "loss": 1.7046, "step": 609 }, { "epoch": 1.1, "grad_norm": 0.05785200744867325, "learning_rate": 0.00014052138498279828, "loss": 1.7234, "step": 610 }, { "epoch": 1.1, "grad_norm": 0.05540376156568527, "learning_rate": 0.00014034553829351236, "loss": 1.7157, "step": 611 }, { "epoch": 1.1, "grad_norm": 0.05743914842605591, "learning_rate": 0.00014016954246529696, "loss": 1.7548, "step": 612 }, { "epoch": 1.11, "grad_norm": 0.05496819317340851, "learning_rate": 0.00013999339814872784, "loss": 1.6913, "step": 613 }, { "epoch": 1.11, "grad_norm": 0.05739595368504524, "learning_rate": 0.00013981710599492964, "loss": 1.7232, "step": 614 }, { "epoch": 1.11, "grad_norm": 0.05653569847345352, "learning_rate": 0.00013964066665557348, "loss": 1.6953, "step": 615 }, { "epoch": 1.11, "grad_norm": 0.05570907145738602, "learning_rate": 0.00013946408078287462, "loss": 1.6858, "step": 616 }, { "epoch": 1.11, "grad_norm": 0.054925207048654556, "learning_rate": 0.00013928734902958996, "loss": 1.6248, "step": 617 }, { "epoch": 1.11, "grad_norm": 0.05743985250592232, "learning_rate": 0.0001391104720490156, "loss": 1.6627, "step": 618 }, { "epoch": 1.12, "grad_norm": 0.05516685172915459, "learning_rate": 0.00013893345049498457, "loss": 1.6714, "step": 619 }, { "epoch": 1.12, "grad_norm": 0.05717911571264267, "learning_rate": 0.0001387562850218642, "loss": 1.7124, "step": 620 }, { "epoch": 1.12, "grad_norm": 0.05529535561800003, "learning_rate": 0.00013857897628455397, "loss": 1.6451, "step": 621 }, { "epoch": 1.12, "grad_norm": 0.05724070221185684, "learning_rate": 0.00013840152493848284, "loss": 1.7274, "step": 622 }, { "epoch": 1.12, "grad_norm": 0.05622214823961258, "learning_rate": 0.0001382239316396069, "loss": 1.6506, "step": 623 }, { "epoch": 1.13, "grad_norm": 0.05893300846219063, "learning_rate": 0.00013804619704440714, "loss": 1.7037, "step": 624 }, { "epoch": 1.13, "grad_norm": 0.05549685284495354, "learning_rate": 0.00013786832180988666, "loss": 1.6894, "step": 625 }, { "epoch": 1.13, "grad_norm": 0.05931728705763817, "learning_rate": 0.00013769030659356853, "loss": 1.7189, "step": 626 }, { "epoch": 1.13, "grad_norm": 0.05465949699282646, "learning_rate": 0.0001375121520534933, "loss": 1.7016, "step": 627 }, { "epoch": 1.13, "grad_norm": 0.056453317403793335, "learning_rate": 0.00013733385884821648, "loss": 1.6711, "step": 628 }, { "epoch": 1.13, "grad_norm": 0.054540056735277176, "learning_rate": 0.00013715542763680623, "loss": 1.6638, "step": 629 }, { "epoch": 1.14, "grad_norm": 0.05919068679213524, "learning_rate": 0.00013697685907884072, "loss": 1.7241, "step": 630 }, { "epoch": 1.14, "grad_norm": 0.05730579420924187, "learning_rate": 0.00013679815383440603, "loss": 1.6946, "step": 631 }, { "epoch": 1.14, "grad_norm": 0.05658195540308952, "learning_rate": 0.00013661931256409325, "loss": 1.7038, "step": 632 }, { "epoch": 1.14, "grad_norm": 0.057528719305992126, "learning_rate": 0.00013644033592899658, "loss": 1.6853, "step": 633 }, { "epoch": 1.14, "grad_norm": 0.062490735203027725, "learning_rate": 0.00013626122459071033, "loss": 1.6733, "step": 634 }, { "epoch": 1.15, "grad_norm": 0.05776170268654823, "learning_rate": 0.00013608197921132696, "loss": 1.7351, "step": 635 }, { "epoch": 1.15, "grad_norm": 0.06134483963251114, "learning_rate": 0.00013590260045343432, "loss": 1.6203, "step": 636 }, { "epoch": 1.15, "grad_norm": 0.061270635575056076, "learning_rate": 0.0001357230889801133, "loss": 1.7268, "step": 637 }, { "epoch": 1.15, "grad_norm": 0.056105442345142365, "learning_rate": 0.00013554344545493535, "loss": 1.7171, "step": 638 }, { "epoch": 1.15, "grad_norm": 0.05647943168878555, "learning_rate": 0.0001353636705419602, "loss": 1.713, "step": 639 }, { "epoch": 1.15, "grad_norm": 0.05758386850357056, "learning_rate": 0.00013518376490573306, "loss": 1.6991, "step": 640 }, { "epoch": 1.16, "grad_norm": 0.05906842276453972, "learning_rate": 0.0001350037292112825, "loss": 1.6387, "step": 641 }, { "epoch": 1.16, "grad_norm": 0.06219753623008728, "learning_rate": 0.00013482356412411781, "loss": 1.7145, "step": 642 }, { "epoch": 1.16, "grad_norm": 0.05719519779086113, "learning_rate": 0.00013464327031022659, "loss": 1.7399, "step": 643 }, { "epoch": 1.16, "grad_norm": 0.08058752119541168, "learning_rate": 0.00013446284843607225, "loss": 1.6275, "step": 644 }, { "epoch": 1.16, "grad_norm": 0.06629724055528641, "learning_rate": 0.00013428229916859167, "loss": 1.6582, "step": 645 }, { "epoch": 1.17, "grad_norm": 0.05791241303086281, "learning_rate": 0.00013410162317519257, "loss": 1.6599, "step": 646 }, { "epoch": 1.17, "grad_norm": 0.06143872067332268, "learning_rate": 0.0001339208211237511, "loss": 1.6634, "step": 647 }, { "epoch": 1.17, "grad_norm": 0.06067274510860443, "learning_rate": 0.00013373989368260948, "loss": 1.6869, "step": 648 }, { "epoch": 1.17, "grad_norm": 0.06446303427219391, "learning_rate": 0.00013355884152057334, "loss": 1.6658, "step": 649 }, { "epoch": 1.17, "grad_norm": 0.05910011753439903, "learning_rate": 0.00013337766530690943, "loss": 1.683, "step": 650 }, { "epoch": 1.17, "grad_norm": 0.06423602253198624, "learning_rate": 0.00013319636571134297, "loss": 1.7058, "step": 651 }, { "epoch": 1.18, "grad_norm": 0.05743340775370598, "learning_rate": 0.00013301494340405535, "loss": 1.6491, "step": 652 }, { "epoch": 1.18, "grad_norm": 0.05755629763007164, "learning_rate": 0.00013283339905568157, "loss": 1.6606, "step": 653 }, { "epoch": 1.18, "grad_norm": 0.05766105651855469, "learning_rate": 0.00013265173333730764, "loss": 1.6855, "step": 654 }, { "epoch": 1.18, "grad_norm": 0.05892917141318321, "learning_rate": 0.00013246994692046836, "loss": 1.6398, "step": 655 }, { "epoch": 1.18, "grad_norm": 0.05860791355371475, "learning_rate": 0.00013228804047714463, "loss": 1.7089, "step": 656 }, { "epoch": 1.19, "grad_norm": 0.059190504252910614, "learning_rate": 0.00013210601467976104, "loss": 1.6703, "step": 657 }, { "epoch": 1.19, "grad_norm": 0.05735331028699875, "learning_rate": 0.0001319238702011834, "loss": 1.73, "step": 658 }, { "epoch": 1.19, "grad_norm": 0.05985163152217865, "learning_rate": 0.0001317416077147162, "loss": 1.6864, "step": 659 }, { "epoch": 1.19, "grad_norm": 0.05826161056756973, "learning_rate": 0.00013155922789410016, "loss": 1.6419, "step": 660 }, { "epoch": 1.19, "grad_norm": 0.059993255883455276, "learning_rate": 0.00013137673141350972, "loss": 1.7027, "step": 661 }, { "epoch": 1.19, "grad_norm": 0.06040223315358162, "learning_rate": 0.00013119411894755063, "loss": 1.7584, "step": 662 }, { "epoch": 1.2, "grad_norm": 0.056883446872234344, "learning_rate": 0.00013101139117125722, "loss": 1.6971, "step": 663 }, { "epoch": 1.2, "grad_norm": 0.05828433483839035, "learning_rate": 0.0001308285487600903, "loss": 1.6797, "step": 664 }, { "epoch": 1.2, "grad_norm": 0.0568573996424675, "learning_rate": 0.0001306455923899342, "loss": 1.6967, "step": 665 }, { "epoch": 1.2, "grad_norm": 0.05763811990618706, "learning_rate": 0.00013046252273709468, "loss": 1.7189, "step": 666 }, { "epoch": 1.2, "grad_norm": 0.05759183317422867, "learning_rate": 0.00013027934047829616, "loss": 1.7293, "step": 667 }, { "epoch": 1.21, "grad_norm": 0.06087080016732216, "learning_rate": 0.00013009604629067933, "loss": 1.7287, "step": 668 }, { "epoch": 1.21, "grad_norm": 0.05685460940003395, "learning_rate": 0.00012991264085179864, "loss": 1.6717, "step": 669 }, { "epoch": 1.21, "grad_norm": 0.06102333217859268, "learning_rate": 0.00012972912483961982, "loss": 1.7911, "step": 670 }, { "epoch": 1.21, "grad_norm": 0.05811255797743797, "learning_rate": 0.00012954549893251724, "loss": 1.7057, "step": 671 }, { "epoch": 1.21, "grad_norm": 0.05935278907418251, "learning_rate": 0.00012936176380927162, "loss": 1.6678, "step": 672 }, { "epoch": 1.21, "grad_norm": 0.06539764255285263, "learning_rate": 0.00012917792014906733, "loss": 1.6305, "step": 673 }, { "epoch": 1.22, "grad_norm": 0.059705205261707306, "learning_rate": 0.00012899396863148995, "loss": 1.7273, "step": 674 }, { "epoch": 1.22, "grad_norm": 0.05784007906913757, "learning_rate": 0.00012880990993652377, "loss": 1.6549, "step": 675 }, { "epoch": 1.22, "grad_norm": 0.07344791293144226, "learning_rate": 0.00012862574474454928, "loss": 1.6809, "step": 676 }, { "epoch": 1.22, "grad_norm": 0.06028100103139877, "learning_rate": 0.00012844147373634066, "loss": 1.6852, "step": 677 }, { "epoch": 1.22, "grad_norm": 0.06096576154232025, "learning_rate": 0.00012825709759306316, "loss": 1.7256, "step": 678 }, { "epoch": 1.23, "grad_norm": 0.060117993503808975, "learning_rate": 0.00012807261699627077, "loss": 1.7094, "step": 679 }, { "epoch": 1.23, "grad_norm": 0.06428851187229156, "learning_rate": 0.0001278880326279035, "loss": 1.6538, "step": 680 }, { "epoch": 1.23, "grad_norm": 0.060511935502290726, "learning_rate": 0.00012770334517028505, "loss": 1.6631, "step": 681 }, { "epoch": 1.23, "grad_norm": 0.05897079408168793, "learning_rate": 0.00012751855530612012, "loss": 1.6732, "step": 682 }, { "epoch": 1.23, "grad_norm": 0.05949567258358002, "learning_rate": 0.00012733366371849201, "loss": 1.6989, "step": 683 }, { "epoch": 1.23, "grad_norm": 0.05985894054174423, "learning_rate": 0.00012714867109086, "loss": 1.6983, "step": 684 }, { "epoch": 1.24, "grad_norm": 0.061160728335380554, "learning_rate": 0.0001269635781070569, "loss": 1.7075, "step": 685 }, { "epoch": 1.24, "eval_loss": 1.7264653444290161, "eval_runtime": 76.4445, "eval_samples_per_second": 65.407, "eval_steps_per_second": 16.352, "step": 685 }, { "epoch": 1.24, "grad_norm": 0.0652250349521637, "learning_rate": 0.00012677838545128647, "loss": 1.6851, "step": 686 }, { "epoch": 1.24, "grad_norm": 0.060404662042856216, "learning_rate": 0.00012659309380812092, "loss": 1.6539, "step": 687 }, { "epoch": 1.24, "grad_norm": 0.05635406821966171, "learning_rate": 0.0001264077038624984, "loss": 1.678, "step": 688 }, { "epoch": 1.24, "grad_norm": 0.06129194051027298, "learning_rate": 0.00012622221629972043, "loss": 1.6455, "step": 689 }, { "epoch": 1.25, "grad_norm": 0.06195101514458656, "learning_rate": 0.0001260366318054493, "loss": 1.7009, "step": 690 }, { "epoch": 1.25, "grad_norm": 0.06593389809131622, "learning_rate": 0.0001258509510657057, "loss": 1.6897, "step": 691 }, { "epoch": 1.25, "grad_norm": 0.0664474368095398, "learning_rate": 0.00012566517476686606, "loss": 1.6847, "step": 692 }, { "epoch": 1.25, "grad_norm": 0.06081750988960266, "learning_rate": 0.00012547930359566007, "loss": 1.6126, "step": 693 }, { "epoch": 1.25, "grad_norm": 0.06048804894089699, "learning_rate": 0.00012529333823916807, "loss": 1.7086, "step": 694 }, { "epoch": 1.25, "grad_norm": 0.06522712111473083, "learning_rate": 0.00012510727938481865, "loss": 1.6931, "step": 695 }, { "epoch": 1.26, "grad_norm": 0.0614117830991745, "learning_rate": 0.0001249211277203859, "loss": 1.7362, "step": 696 }, { "epoch": 1.26, "grad_norm": 0.05812584608793259, "learning_rate": 0.00012473488393398706, "loss": 1.7052, "step": 697 }, { "epoch": 1.26, "grad_norm": 0.059068553149700165, "learning_rate": 0.00012454854871407994, "loss": 1.6872, "step": 698 }, { "epoch": 1.26, "grad_norm": 0.06033525615930557, "learning_rate": 0.0001243621227494602, "loss": 1.6954, "step": 699 }, { "epoch": 1.26, "grad_norm": 0.06032804027199745, "learning_rate": 0.00012417560672925912, "loss": 1.6571, "step": 700 }, { "epoch": 1.27, "grad_norm": 0.06035863235592842, "learning_rate": 0.00012398900134294073, "loss": 1.6894, "step": 701 }, { "epoch": 1.27, "grad_norm": 0.059223804622888565, "learning_rate": 0.00012380230728029946, "loss": 1.711, "step": 702 }, { "epoch": 1.27, "grad_norm": 0.061139173805713654, "learning_rate": 0.00012361552523145757, "loss": 1.626, "step": 703 }, { "epoch": 1.27, "grad_norm": 0.06459489464759827, "learning_rate": 0.0001234286558868625, "loss": 1.7467, "step": 704 }, { "epoch": 1.27, "grad_norm": 0.06497075408697128, "learning_rate": 0.00012324169993728438, "loss": 1.7419, "step": 705 }, { "epoch": 1.27, "grad_norm": 0.06115833297371864, "learning_rate": 0.0001230546580738136, "loss": 1.6781, "step": 706 }, { "epoch": 1.28, "grad_norm": 0.06160350516438484, "learning_rate": 0.00012286753098785796, "loss": 1.6907, "step": 707 }, { "epoch": 1.28, "grad_norm": 0.06168088689446449, "learning_rate": 0.00012268031937114044, "loss": 1.7265, "step": 708 }, { "epoch": 1.28, "grad_norm": 0.06278149783611298, "learning_rate": 0.00012249302391569638, "loss": 1.7023, "step": 709 }, { "epoch": 1.28, "grad_norm": 0.06181812658905983, "learning_rate": 0.00012230564531387107, "loss": 1.6897, "step": 710 }, { "epoch": 1.28, "grad_norm": 0.05875727906823158, "learning_rate": 0.00012211818425831718, "loss": 1.644, "step": 711 }, { "epoch": 1.29, "grad_norm": 0.061242878437042236, "learning_rate": 0.00012193064144199218, "loss": 1.7256, "step": 712 }, { "epoch": 1.29, "grad_norm": 0.060726381838321686, "learning_rate": 0.00012174301755815571, "loss": 1.6871, "step": 713 }, { "epoch": 1.29, "grad_norm": 0.06219150498509407, "learning_rate": 0.00012155531330036712, "loss": 1.7048, "step": 714 }, { "epoch": 1.29, "grad_norm": 0.06084437295794487, "learning_rate": 0.0001213675293624829, "loss": 1.6888, "step": 715 }, { "epoch": 1.29, "grad_norm": 0.06178005784749985, "learning_rate": 0.00012117966643865398, "loss": 1.6791, "step": 716 }, { "epoch": 1.29, "grad_norm": 0.05991113558411598, "learning_rate": 0.00012099172522332338, "loss": 1.7318, "step": 717 }, { "epoch": 1.3, "grad_norm": 0.06223401054739952, "learning_rate": 0.00012080370641122345, "loss": 1.6417, "step": 718 }, { "epoch": 1.3, "grad_norm": 0.062392983585596085, "learning_rate": 0.00012061561069737343, "loss": 1.6411, "step": 719 }, { "epoch": 1.3, "grad_norm": 0.060492224991321564, "learning_rate": 0.00012042743877707678, "loss": 1.6717, "step": 720 }, { "epoch": 1.3, "grad_norm": 0.06418413668870926, "learning_rate": 0.0001202391913459187, "loss": 1.6751, "step": 721 }, { "epoch": 1.3, "grad_norm": 0.060530129820108414, "learning_rate": 0.0001200508690997635, "loss": 1.7175, "step": 722 }, { "epoch": 1.31, "grad_norm": 0.06409049779176712, "learning_rate": 0.00011986247273475206, "loss": 1.6953, "step": 723 }, { "epoch": 1.31, "grad_norm": 0.05866590142250061, "learning_rate": 0.0001196740029472992, "loss": 1.6935, "step": 724 }, { "epoch": 1.31, "grad_norm": 0.06476990133523941, "learning_rate": 0.00011948546043409123, "loss": 1.7017, "step": 725 }, { "epoch": 1.31, "grad_norm": 0.06523357331752777, "learning_rate": 0.00011929684589208326, "loss": 1.7183, "step": 726 }, { "epoch": 1.31, "grad_norm": 0.060969460755586624, "learning_rate": 0.00011910816001849654, "loss": 1.6887, "step": 727 }, { "epoch": 1.31, "grad_norm": 0.11310483515262604, "learning_rate": 0.00011891940351081625, "loss": 1.6816, "step": 728 }, { "epoch": 1.32, "grad_norm": 0.059255216270685196, "learning_rate": 0.00011873057706678843, "loss": 1.6554, "step": 729 }, { "epoch": 1.32, "grad_norm": 0.062034714967012405, "learning_rate": 0.00011854168138441775, "loss": 1.668, "step": 730 }, { "epoch": 1.32, "grad_norm": 0.06186864525079727, "learning_rate": 0.00011835271716196486, "loss": 1.6806, "step": 731 }, { "epoch": 1.32, "grad_norm": 0.06105494871735573, "learning_rate": 0.00011816368509794364, "loss": 1.6615, "step": 732 }, { "epoch": 1.32, "grad_norm": 0.06231169030070305, "learning_rate": 0.00011797458589111894, "loss": 1.6588, "step": 733 }, { "epoch": 1.33, "grad_norm": 0.06832422316074371, "learning_rate": 0.00011778542024050361, "loss": 1.6758, "step": 734 }, { "epoch": 1.33, "grad_norm": 0.06158846989274025, "learning_rate": 0.00011759618884535624, "loss": 1.7025, "step": 735 }, { "epoch": 1.33, "grad_norm": 0.07147394865751266, "learning_rate": 0.00011740689240517837, "loss": 1.6691, "step": 736 }, { "epoch": 1.33, "grad_norm": 0.06047786399722099, "learning_rate": 0.00011721753161971212, "loss": 1.6968, "step": 737 }, { "epoch": 1.33, "grad_norm": 0.0623675100505352, "learning_rate": 0.00011702810718893722, "loss": 1.7372, "step": 738 }, { "epoch": 1.34, "grad_norm": 0.06291418522596359, "learning_rate": 0.00011683861981306893, "loss": 1.7083, "step": 739 }, { "epoch": 1.34, "grad_norm": 0.059522755444049835, "learning_rate": 0.00011664907019255502, "loss": 1.6533, "step": 740 }, { "epoch": 1.34, "grad_norm": 0.060890860855579376, "learning_rate": 0.00011645945902807341, "loss": 1.6875, "step": 741 }, { "epoch": 1.34, "grad_norm": 0.060426972806453705, "learning_rate": 0.00011626978702052948, "loss": 1.6463, "step": 742 }, { "epoch": 1.34, "grad_norm": 0.062305621802806854, "learning_rate": 0.00011608005487105362, "loss": 1.6785, "step": 743 }, { "epoch": 1.34, "grad_norm": 0.06419097632169724, "learning_rate": 0.00011589026328099839, "loss": 1.6679, "step": 744 }, { "epoch": 1.35, "grad_norm": 0.06365741044282913, "learning_rate": 0.00011570041295193622, "loss": 1.6668, "step": 745 }, { "epoch": 1.35, "grad_norm": 0.0642697736620903, "learning_rate": 0.00011551050458565658, "loss": 1.7095, "step": 746 }, { "epoch": 1.35, "grad_norm": 0.06443499773740768, "learning_rate": 0.00011532053888416343, "loss": 1.6586, "step": 747 }, { "epoch": 1.35, "grad_norm": 0.06351306289434433, "learning_rate": 0.00011513051654967286, "loss": 1.6776, "step": 748 }, { "epoch": 1.35, "grad_norm": 0.06554794311523438, "learning_rate": 0.00011494043828461007, "loss": 1.7105, "step": 749 }, { "epoch": 1.36, "grad_norm": 0.10256826132535934, "learning_rate": 0.00011475030479160725, "loss": 1.7046, "step": 750 }, { "epoch": 1.36, "grad_norm": 0.06379935145378113, "learning_rate": 0.00011456011677350051, "loss": 1.711, "step": 751 }, { "epoch": 1.36, "grad_norm": 0.06044677272439003, "learning_rate": 0.00011436987493332767, "loss": 1.7186, "step": 752 }, { "epoch": 1.36, "grad_norm": 0.06297197192907333, "learning_rate": 0.00011417957997432546, "loss": 1.6453, "step": 753 }, { "epoch": 1.36, "grad_norm": 0.06677673757076263, "learning_rate": 0.00011398923259992697, "loss": 1.6443, "step": 754 }, { "epoch": 1.36, "grad_norm": 0.062335170805454254, "learning_rate": 0.00011379883351375901, "loss": 1.6738, "step": 755 }, { "epoch": 1.37, "grad_norm": 0.06286536902189255, "learning_rate": 0.00011360838341963964, "loss": 1.7081, "step": 756 }, { "epoch": 1.37, "grad_norm": 0.07303211838006973, "learning_rate": 0.00011341788302157536, "loss": 1.6906, "step": 757 }, { "epoch": 1.37, "grad_norm": 0.06304056942462921, "learning_rate": 0.00011322733302375863, "loss": 1.6783, "step": 758 }, { "epoch": 1.37, "grad_norm": 0.07291906327009201, "learning_rate": 0.00011303673413056541, "loss": 1.7162, "step": 759 }, { "epoch": 1.37, "grad_norm": 0.061802685260772705, "learning_rate": 0.00011284608704655215, "loss": 1.7375, "step": 760 }, { "epoch": 1.38, "grad_norm": 0.06205203756690025, "learning_rate": 0.00011265539247645373, "loss": 1.6617, "step": 761 }, { "epoch": 1.38, "grad_norm": 0.06457790732383728, "learning_rate": 0.0001124646511251803, "loss": 1.6395, "step": 762 }, { "epoch": 1.38, "grad_norm": 0.06102142482995987, "learning_rate": 0.00011227386369781508, "loss": 1.7031, "step": 763 }, { "epoch": 1.38, "grad_norm": 0.062267519533634186, "learning_rate": 0.00011208303089961161, "loss": 1.6889, "step": 764 }, { "epoch": 1.38, "grad_norm": 0.06354745477437973, "learning_rate": 0.00011189215343599109, "loss": 1.7099, "step": 765 }, { "epoch": 1.38, "grad_norm": 0.06255058199167252, "learning_rate": 0.00011170123201253986, "loss": 1.7092, "step": 766 }, { "epoch": 1.39, "grad_norm": 0.06354597955942154, "learning_rate": 0.00011151026733500677, "loss": 1.6462, "step": 767 }, { "epoch": 1.39, "grad_norm": 0.06314928829669952, "learning_rate": 0.00011131926010930058, "loss": 1.6377, "step": 768 }, { "epoch": 1.39, "grad_norm": 0.06911808252334595, "learning_rate": 0.00011112821104148723, "loss": 1.6787, "step": 769 }, { "epoch": 1.39, "grad_norm": 0.06356338411569595, "learning_rate": 0.00011093712083778746, "loss": 1.6657, "step": 770 }, { "epoch": 1.39, "grad_norm": 0.06266220659017563, "learning_rate": 0.00011074599020457395, "loss": 1.7108, "step": 771 }, { "epoch": 1.4, "grad_norm": 0.06397093832492828, "learning_rate": 0.00011055481984836893, "loss": 1.715, "step": 772 }, { "epoch": 1.4, "grad_norm": 0.06519615650177002, "learning_rate": 0.00011036361047584143, "loss": 1.6625, "step": 773 }, { "epoch": 1.4, "grad_norm": 0.06543872505426407, "learning_rate": 0.00011017236279380467, "loss": 1.6611, "step": 774 }, { "epoch": 1.4, "grad_norm": 0.06356982886791229, "learning_rate": 0.00010998107750921354, "loss": 1.6366, "step": 775 }, { "epoch": 1.4, "grad_norm": 0.06404688209295273, "learning_rate": 0.00010978975532916189, "loss": 1.689, "step": 776 }, { "epoch": 1.4, "grad_norm": 0.06206212565302849, "learning_rate": 0.00010959839696088001, "loss": 1.6853, "step": 777 }, { "epoch": 1.41, "grad_norm": 0.0640236884355545, "learning_rate": 0.00010940700311173184, "loss": 1.6874, "step": 778 }, { "epoch": 1.41, "grad_norm": 0.06289862096309662, "learning_rate": 0.00010921557448921267, "loss": 1.7187, "step": 779 }, { "epoch": 1.41, "grad_norm": 0.06534165889024734, "learning_rate": 0.00010902411180094607, "loss": 1.6285, "step": 780 }, { "epoch": 1.41, "grad_norm": 0.06411545723676682, "learning_rate": 0.00010883261575468184, "loss": 1.6932, "step": 781 }, { "epoch": 1.41, "grad_norm": 0.06283684074878693, "learning_rate": 0.00010864108705829282, "loss": 1.7544, "step": 782 }, { "epoch": 1.42, "grad_norm": 0.06294089555740356, "learning_rate": 0.00010844952641977273, "loss": 1.695, "step": 783 }, { "epoch": 1.42, "grad_norm": 0.06469050794839859, "learning_rate": 0.00010825793454723325, "loss": 1.654, "step": 784 }, { "epoch": 1.42, "grad_norm": 0.06504753977060318, "learning_rate": 0.00010806631214890155, "loss": 1.6641, "step": 785 }, { "epoch": 1.42, "grad_norm": 0.06289339065551758, "learning_rate": 0.00010787465993311768, "loss": 1.7246, "step": 786 }, { "epoch": 1.42, "grad_norm": 0.07044830918312073, "learning_rate": 0.00010768297860833185, "loss": 1.6784, "step": 787 }, { "epoch": 1.42, "grad_norm": 0.06241421401500702, "learning_rate": 0.00010749126888310197, "loss": 1.7413, "step": 788 }, { "epoch": 1.43, "grad_norm": 0.061875198036432266, "learning_rate": 0.00010729953146609076, "loss": 1.6837, "step": 789 }, { "epoch": 1.43, "grad_norm": 0.06335246562957764, "learning_rate": 0.00010710776706606349, "loss": 1.6713, "step": 790 }, { "epoch": 1.43, "grad_norm": 0.06218186393380165, "learning_rate": 0.00010691597639188507, "loss": 1.6563, "step": 791 }, { "epoch": 1.43, "grad_norm": 0.06283168494701385, "learning_rate": 0.00010672416015251757, "loss": 1.6672, "step": 792 }, { "epoch": 1.43, "grad_norm": 0.06283591687679291, "learning_rate": 0.00010653231905701748, "loss": 1.6719, "step": 793 }, { "epoch": 1.44, "grad_norm": 0.0629267543554306, "learning_rate": 0.00010634045381453337, "loss": 1.6764, "step": 794 }, { "epoch": 1.44, "grad_norm": 0.06264865398406982, "learning_rate": 0.00010614856513430284, "loss": 1.6874, "step": 795 }, { "epoch": 1.44, "grad_norm": 0.06411181390285492, "learning_rate": 0.00010595665372565027, "loss": 1.7095, "step": 796 }, { "epoch": 1.44, "grad_norm": 0.06262548267841339, "learning_rate": 0.00010576472029798399, "loss": 1.6898, "step": 797 }, { "epoch": 1.44, "grad_norm": 0.06278496235609055, "learning_rate": 0.00010557276556079378, "loss": 1.6055, "step": 798 }, { "epoch": 1.44, "grad_norm": 0.06674374639987946, "learning_rate": 0.00010538079022364819, "loss": 1.7226, "step": 799 }, { "epoch": 1.45, "grad_norm": 0.06753117591142654, "learning_rate": 0.00010518879499619181, "loss": 1.7008, "step": 800 }, { "epoch": 1.45, "grad_norm": 0.07137101143598557, "learning_rate": 0.0001049967805881429, "loss": 1.6945, "step": 801 }, { "epoch": 1.45, "grad_norm": 0.06417196989059448, "learning_rate": 0.00010480474770929054, "loss": 1.6662, "step": 802 }, { "epoch": 1.45, "grad_norm": 0.064505934715271, "learning_rate": 0.00010461269706949213, "loss": 1.6914, "step": 803 }, { "epoch": 1.45, "grad_norm": 0.06325452029705048, "learning_rate": 0.00010442062937867063, "loss": 1.6703, "step": 804 }, { "epoch": 1.46, "grad_norm": 0.0945320799946785, "learning_rate": 0.00010422854534681219, "loss": 1.6595, "step": 805 }, { "epoch": 1.46, "grad_norm": 0.07015063613653183, "learning_rate": 0.00010403644568396322, "loss": 1.7153, "step": 806 }, { "epoch": 1.46, "grad_norm": 0.06436234712600708, "learning_rate": 0.000103844331100228, "loss": 1.6767, "step": 807 }, { "epoch": 1.46, "grad_norm": 0.06437043845653534, "learning_rate": 0.0001036522023057659, "loss": 1.7026, "step": 808 }, { "epoch": 1.46, "grad_norm": 0.06160353124141693, "learning_rate": 0.00010346006001078885, "loss": 1.7112, "step": 809 }, { "epoch": 1.46, "grad_norm": 0.06519316881895065, "learning_rate": 0.00010326790492555876, "loss": 1.6611, "step": 810 }, { "epoch": 1.47, "grad_norm": 0.06452979147434235, "learning_rate": 0.00010307573776038462, "loss": 1.6291, "step": 811 }, { "epoch": 1.47, "grad_norm": 0.06813566386699677, "learning_rate": 0.00010288355922562034, "loss": 1.6432, "step": 812 }, { "epoch": 1.47, "grad_norm": 0.06800167262554169, "learning_rate": 0.0001026913700316616, "loss": 1.6739, "step": 813 }, { "epoch": 1.47, "grad_norm": 0.062173567712306976, "learning_rate": 0.0001024991708889437, "loss": 1.7207, "step": 814 }, { "epoch": 1.47, "grad_norm": 0.06301440298557281, "learning_rate": 0.00010230696250793856, "loss": 1.6348, "step": 815 }, { "epoch": 1.48, "grad_norm": 0.06262702494859695, "learning_rate": 0.00010211474559915233, "loss": 1.6982, "step": 816 }, { "epoch": 1.48, "grad_norm": 0.06448613107204437, "learning_rate": 0.00010192252087312265, "loss": 1.7004, "step": 817 }, { "epoch": 1.48, "grad_norm": 0.06269077211618423, "learning_rate": 0.00010173028904041606, "loss": 1.6981, "step": 818 }, { "epoch": 1.48, "grad_norm": 0.06326784938573837, "learning_rate": 0.00010153805081162539, "loss": 1.718, "step": 819 }, { "epoch": 1.48, "grad_norm": 0.06502313911914825, "learning_rate": 0.0001013458068973671, "loss": 1.6669, "step": 820 }, { "epoch": 1.48, "grad_norm": 0.06869412958621979, "learning_rate": 0.0001011535580082787, "loss": 1.6237, "step": 821 }, { "epoch": 1.49, "grad_norm": 0.0637192502617836, "learning_rate": 0.00010096130485501598, "loss": 1.7264, "step": 822 }, { "epoch": 1.49, "eval_loss": 1.7267118692398071, "eval_runtime": 76.2251, "eval_samples_per_second": 65.595, "eval_steps_per_second": 16.399, "step": 822 }, { "epoch": 1.49, "grad_norm": 0.06338479369878769, "learning_rate": 0.00010076904814825066, "loss": 1.66, "step": 823 }, { "epoch": 1.49, "grad_norm": 0.0718810185790062, "learning_rate": 0.0001005767885986674, "loss": 1.7044, "step": 824 }, { "epoch": 1.49, "grad_norm": 0.06428621709346771, "learning_rate": 0.00010038452691696161, "loss": 1.6375, "step": 825 }, { "epoch": 1.49, "grad_norm": 0.06198599189519882, "learning_rate": 0.00010019226381383633, "loss": 1.644, "step": 826 }, { "epoch": 1.5, "grad_norm": 0.0649799108505249, "learning_rate": 0.0001, "loss": 1.6751, "step": 827 }, { "epoch": 1.5, "grad_norm": 0.06546121090650558, "learning_rate": 9.980773618616371e-05, "loss": 1.6728, "step": 828 }, { "epoch": 1.5, "grad_norm": 0.0744151845574379, "learning_rate": 9.961547308303844e-05, "loss": 1.7465, "step": 829 }, { "epoch": 1.5, "grad_norm": 0.06264037638902664, "learning_rate": 9.942321140133261e-05, "loss": 1.6005, "step": 830 }, { "epoch": 1.5, "grad_norm": 0.06265675276517868, "learning_rate": 9.923095185174938e-05, "loss": 1.7181, "step": 831 }, { "epoch": 1.5, "grad_norm": 0.06809694319963455, "learning_rate": 9.903869514498402e-05, "loss": 1.6345, "step": 832 }, { "epoch": 1.51, "grad_norm": 0.06538775563240051, "learning_rate": 9.884644199172135e-05, "loss": 1.7251, "step": 833 }, { "epoch": 1.51, "grad_norm": 0.06529638916254044, "learning_rate": 9.865419310263292e-05, "loss": 1.6418, "step": 834 }, { "epoch": 1.51, "grad_norm": 0.08285729587078094, "learning_rate": 9.846194918837462e-05, "loss": 1.6837, "step": 835 }, { "epoch": 1.51, "grad_norm": 0.06490971148014069, "learning_rate": 9.826971095958395e-05, "loss": 1.6723, "step": 836 }, { "epoch": 1.51, "grad_norm": 0.06375712156295776, "learning_rate": 9.807747912687739e-05, "loss": 1.6838, "step": 837 }, { "epoch": 1.52, "grad_norm": 0.06696437299251556, "learning_rate": 9.788525440084771e-05, "loss": 1.6579, "step": 838 }, { "epoch": 1.52, "grad_norm": 0.06473565846681595, "learning_rate": 9.769303749206146e-05, "loss": 1.6489, "step": 839 }, { "epoch": 1.52, "grad_norm": 0.07211591303348541, "learning_rate": 9.750082911105634e-05, "loss": 1.7435, "step": 840 }, { "epoch": 1.52, "grad_norm": 0.06550677120685577, "learning_rate": 9.730862996833841e-05, "loss": 1.6935, "step": 841 }, { "epoch": 1.52, "grad_norm": 0.06820110231637955, "learning_rate": 9.711644077437968e-05, "loss": 1.6759, "step": 842 }, { "epoch": 1.52, "grad_norm": 0.06783100217580795, "learning_rate": 9.692426223961537e-05, "loss": 1.7081, "step": 843 }, { "epoch": 1.53, "grad_norm": 0.06615381687879562, "learning_rate": 9.67320950744413e-05, "loss": 1.7375, "step": 844 }, { "epoch": 1.53, "grad_norm": 0.0648663192987442, "learning_rate": 9.653993998921118e-05, "loss": 1.6836, "step": 845 }, { "epoch": 1.53, "grad_norm": 0.0639321506023407, "learning_rate": 9.63477976942341e-05, "loss": 1.6319, "step": 846 }, { "epoch": 1.53, "grad_norm": 0.06528212130069733, "learning_rate": 9.615566889977201e-05, "loss": 1.6675, "step": 847 }, { "epoch": 1.53, "grad_norm": 0.06574473530054092, "learning_rate": 9.59635543160368e-05, "loss": 1.6442, "step": 848 }, { "epoch": 1.54, "grad_norm": 0.06326039880514145, "learning_rate": 9.577145465318783e-05, "loss": 1.639, "step": 849 }, { "epoch": 1.54, "grad_norm": 0.06851720809936523, "learning_rate": 9.557937062132938e-05, "loss": 1.7044, "step": 850 }, { "epoch": 1.54, "grad_norm": 0.06546233594417572, "learning_rate": 9.538730293050792e-05, "loss": 1.7091, "step": 851 }, { "epoch": 1.54, "grad_norm": 0.0674884095788002, "learning_rate": 9.51952522907095e-05, "loss": 1.6572, "step": 852 }, { "epoch": 1.54, "grad_norm": 0.06366416811943054, "learning_rate": 9.50032194118571e-05, "loss": 1.6913, "step": 853 }, { "epoch": 1.54, "grad_norm": 0.065780408680439, "learning_rate": 9.481120500380818e-05, "loss": 1.7106, "step": 854 }, { "epoch": 1.55, "grad_norm": 0.06662867218255997, "learning_rate": 9.461920977635184e-05, "loss": 1.6486, "step": 855 }, { "epoch": 1.55, "grad_norm": 0.06339140236377716, "learning_rate": 9.442723443920623e-05, "loss": 1.6799, "step": 856 }, { "epoch": 1.55, "grad_norm": 0.06222783029079437, "learning_rate": 9.423527970201602e-05, "loss": 1.72, "step": 857 }, { "epoch": 1.55, "grad_norm": 0.06612752377986908, "learning_rate": 9.404334627434974e-05, "loss": 1.7294, "step": 858 }, { "epoch": 1.55, "grad_norm": 0.06335198134183884, "learning_rate": 9.385143486569718e-05, "loss": 1.6978, "step": 859 }, { "epoch": 1.56, "grad_norm": 0.0652630627155304, "learning_rate": 9.365954618546665e-05, "loss": 1.6808, "step": 860 }, { "epoch": 1.56, "grad_norm": 0.08252695202827454, "learning_rate": 9.346768094298252e-05, "loss": 1.7117, "step": 861 }, { "epoch": 1.56, "grad_norm": 0.0695163905620575, "learning_rate": 9.327583984748248e-05, "loss": 1.6948, "step": 862 }, { "epoch": 1.56, "grad_norm": 0.06612583249807358, "learning_rate": 9.308402360811497e-05, "loss": 1.705, "step": 863 }, { "epoch": 1.56, "grad_norm": 0.06415654718875885, "learning_rate": 9.289223293393652e-05, "loss": 1.6796, "step": 864 }, { "epoch": 1.56, "grad_norm": 0.06522924453020096, "learning_rate": 9.270046853390925e-05, "loss": 1.6783, "step": 865 }, { "epoch": 1.57, "grad_norm": 0.06422727555036545, "learning_rate": 9.250873111689808e-05, "loss": 1.709, "step": 866 }, { "epoch": 1.57, "grad_norm": 0.06485796719789505, "learning_rate": 9.231702139166816e-05, "loss": 1.6323, "step": 867 }, { "epoch": 1.57, "grad_norm": 0.06597612798213959, "learning_rate": 9.212534006688233e-05, "loss": 1.6578, "step": 868 }, { "epoch": 1.57, "grad_norm": 0.06861060112714767, "learning_rate": 9.193368785109844e-05, "loss": 1.6711, "step": 869 }, { "epoch": 1.57, "grad_norm": 0.07582002878189087, "learning_rate": 9.174206545276677e-05, "loss": 1.666, "step": 870 }, { "epoch": 1.58, "grad_norm": 0.06606924533843994, "learning_rate": 9.15504735802273e-05, "loss": 1.7304, "step": 871 }, { "epoch": 1.58, "grad_norm": 0.06642486900091171, "learning_rate": 9.135891294170718e-05, "loss": 1.7082, "step": 872 }, { "epoch": 1.58, "grad_norm": 0.072264164686203, "learning_rate": 9.11673842453182e-05, "loss": 1.6355, "step": 873 }, { "epoch": 1.58, "grad_norm": 0.06571400165557861, "learning_rate": 9.097588819905394e-05, "loss": 1.6383, "step": 874 }, { "epoch": 1.58, "grad_norm": 0.062258243560791016, "learning_rate": 9.078442551078736e-05, "loss": 1.6676, "step": 875 }, { "epoch": 1.58, "grad_norm": 0.06381349265575409, "learning_rate": 9.059299688826816e-05, "loss": 1.699, "step": 876 }, { "epoch": 1.59, "grad_norm": 0.06702978163957596, "learning_rate": 9.040160303912003e-05, "loss": 1.7245, "step": 877 }, { "epoch": 1.59, "grad_norm": 0.0637059286236763, "learning_rate": 9.021024467083812e-05, "loss": 1.6478, "step": 878 }, { "epoch": 1.59, "grad_norm": 0.0654047429561615, "learning_rate": 9.001892249078648e-05, "loss": 1.7275, "step": 879 }, { "epoch": 1.59, "grad_norm": 0.06602399051189423, "learning_rate": 8.982763720619533e-05, "loss": 1.7712, "step": 880 }, { "epoch": 1.59, "grad_norm": 0.06693969666957855, "learning_rate": 8.96363895241586e-05, "loss": 1.6684, "step": 881 }, { "epoch": 1.6, "grad_norm": 0.06519246846437454, "learning_rate": 8.944518015163108e-05, "loss": 1.6698, "step": 882 }, { "epoch": 1.6, "grad_norm": 0.06838595122098923, "learning_rate": 8.925400979542606e-05, "loss": 1.655, "step": 883 }, { "epoch": 1.6, "grad_norm": 0.06535571813583374, "learning_rate": 8.906287916221259e-05, "loss": 1.6858, "step": 884 }, { "epoch": 1.6, "grad_norm": 0.06805121898651123, "learning_rate": 8.887178895851279e-05, "loss": 1.6746, "step": 885 }, { "epoch": 1.6, "grad_norm": 0.0715852826833725, "learning_rate": 8.868073989069943e-05, "loss": 1.7676, "step": 886 }, { "epoch": 1.6, "grad_norm": 0.06408550590276718, "learning_rate": 8.848973266499322e-05, "loss": 1.6434, "step": 887 }, { "epoch": 1.61, "grad_norm": 0.0682334452867508, "learning_rate": 8.829876798746017e-05, "loss": 1.6663, "step": 888 }, { "epoch": 1.61, "grad_norm": 0.06532958894968033, "learning_rate": 8.810784656400895e-05, "loss": 1.6914, "step": 889 }, { "epoch": 1.61, "grad_norm": 0.06579031050205231, "learning_rate": 8.791696910038843e-05, "loss": 1.6359, "step": 890 }, { "epoch": 1.61, "grad_norm": 0.0659404769539833, "learning_rate": 8.772613630218492e-05, "loss": 1.7121, "step": 891 }, { "epoch": 1.61, "grad_norm": 0.06567792594432831, "learning_rate": 8.753534887481976e-05, "loss": 1.6565, "step": 892 }, { "epoch": 1.62, "grad_norm": 0.07625501602888107, "learning_rate": 8.734460752354629e-05, "loss": 1.6743, "step": 893 }, { "epoch": 1.62, "grad_norm": 0.06591348350048065, "learning_rate": 8.715391295344784e-05, "loss": 1.6265, "step": 894 }, { "epoch": 1.62, "grad_norm": 0.06538601964712143, "learning_rate": 8.696326586943464e-05, "loss": 1.7139, "step": 895 }, { "epoch": 1.62, "grad_norm": 0.06885919719934464, "learning_rate": 8.677266697624138e-05, "loss": 1.6884, "step": 896 }, { "epoch": 1.62, "grad_norm": 0.06452605873346329, "learning_rate": 8.658211697842466e-05, "loss": 1.6894, "step": 897 }, { "epoch": 1.62, "grad_norm": 0.06521788239479065, "learning_rate": 8.639161658036037e-05, "loss": 1.6943, "step": 898 }, { "epoch": 1.63, "grad_norm": 0.06771497428417206, "learning_rate": 8.6201166486241e-05, "loss": 1.6718, "step": 899 }, { "epoch": 1.63, "grad_norm": 0.0637250766158104, "learning_rate": 8.601076740007305e-05, "loss": 1.6842, "step": 900 }, { "epoch": 1.63, "grad_norm": 0.0656089335680008, "learning_rate": 8.582042002567456e-05, "loss": 1.6649, "step": 901 }, { "epoch": 1.63, "grad_norm": 0.06827680766582489, "learning_rate": 8.563012506667233e-05, "loss": 1.7095, "step": 902 }, { "epoch": 1.63, "grad_norm": 0.06502600759267807, "learning_rate": 8.543988322649954e-05, "loss": 1.6368, "step": 903 }, { "epoch": 1.64, "grad_norm": 0.06803898513317108, "learning_rate": 8.524969520839279e-05, "loss": 1.657, "step": 904 }, { "epoch": 1.64, "grad_norm": 0.06632059067487717, "learning_rate": 8.505956171538994e-05, "loss": 1.7279, "step": 905 }, { "epoch": 1.64, "grad_norm": 0.06838211417198181, "learning_rate": 8.486948345032719e-05, "loss": 1.6318, "step": 906 }, { "epoch": 1.64, "grad_norm": 0.0652574896812439, "learning_rate": 8.46794611158366e-05, "loss": 1.6307, "step": 907 }, { "epoch": 1.64, "grad_norm": 0.0648072361946106, "learning_rate": 8.448949541434346e-05, "loss": 1.6517, "step": 908 }, { "epoch": 1.64, "grad_norm": 0.06592056900262833, "learning_rate": 8.429958704806379e-05, "loss": 1.6958, "step": 909 }, { "epoch": 1.65, "grad_norm": 0.06285024434328079, "learning_rate": 8.410973671900162e-05, "loss": 1.666, "step": 910 }, { "epoch": 1.65, "grad_norm": 0.06529216468334198, "learning_rate": 8.391994512894641e-05, "loss": 1.6919, "step": 911 }, { "epoch": 1.65, "grad_norm": 0.06455468386411667, "learning_rate": 8.373021297947053e-05, "loss": 1.6217, "step": 912 }, { "epoch": 1.65, "grad_norm": 0.06522978842258453, "learning_rate": 8.35405409719266e-05, "loss": 1.6729, "step": 913 }, { "epoch": 1.65, "grad_norm": 0.06686036288738251, "learning_rate": 8.335092980744502e-05, "loss": 1.6324, "step": 914 }, { "epoch": 1.66, "grad_norm": 0.06648086756467819, "learning_rate": 8.316138018693108e-05, "loss": 1.6052, "step": 915 }, { "epoch": 1.66, "grad_norm": 0.06622032076120377, "learning_rate": 8.297189281106278e-05, "loss": 1.7219, "step": 916 }, { "epoch": 1.66, "grad_norm": 0.07183654606342316, "learning_rate": 8.278246838028793e-05, "loss": 1.7633, "step": 917 }, { "epoch": 1.66, "grad_norm": 0.06654607504606247, "learning_rate": 8.259310759482164e-05, "loss": 1.7602, "step": 918 }, { "epoch": 1.66, "grad_norm": 0.06768395006656647, "learning_rate": 8.240381115464377e-05, "loss": 1.678, "step": 919 }, { "epoch": 1.66, "grad_norm": 0.0649079754948616, "learning_rate": 8.22145797594964e-05, "loss": 1.7013, "step": 920 }, { "epoch": 1.67, "grad_norm": 0.06565246731042862, "learning_rate": 8.20254141088811e-05, "loss": 1.7064, "step": 921 }, { "epoch": 1.67, "grad_norm": 0.06477197259664536, "learning_rate": 8.183631490205637e-05, "loss": 1.7219, "step": 922 }, { "epoch": 1.67, "grad_norm": 0.06408128142356873, "learning_rate": 8.164728283803518e-05, "loss": 1.7337, "step": 923 }, { "epoch": 1.67, "grad_norm": 0.06464950740337372, "learning_rate": 8.145831861558225e-05, "loss": 1.6853, "step": 924 }, { "epoch": 1.67, "grad_norm": 0.06401928514242172, "learning_rate": 8.126942293321162e-05, "loss": 1.6587, "step": 925 }, { "epoch": 1.68, "grad_norm": 0.06978955864906311, "learning_rate": 8.108059648918377e-05, "loss": 1.7083, "step": 926 }, { "epoch": 1.68, "grad_norm": 0.06544001400470734, "learning_rate": 8.089183998150344e-05, "loss": 1.6318, "step": 927 }, { "epoch": 1.68, "grad_norm": 0.06558380275964737, "learning_rate": 8.070315410791679e-05, "loss": 1.6897, "step": 928 }, { "epoch": 1.68, "grad_norm": 0.06930231302976608, "learning_rate": 8.051453956590878e-05, "loss": 1.6266, "step": 929 }, { "epoch": 1.68, "grad_norm": 0.06593599915504456, "learning_rate": 8.03259970527008e-05, "loss": 1.7096, "step": 930 }, { "epoch": 1.69, "grad_norm": 0.06622833758592606, "learning_rate": 8.013752726524795e-05, "loss": 1.5817, "step": 931 }, { "epoch": 1.69, "grad_norm": 0.06626243144273758, "learning_rate": 7.994913090023651e-05, "loss": 1.6525, "step": 932 }, { "epoch": 1.69, "grad_norm": 0.0677393451333046, "learning_rate": 7.976080865408131e-05, "loss": 1.7158, "step": 933 }, { "epoch": 1.69, "grad_norm": 0.06529498845338821, "learning_rate": 7.957256122292323e-05, "loss": 1.7317, "step": 934 }, { "epoch": 1.69, "grad_norm": 0.07396451383829117, "learning_rate": 7.938438930262656e-05, "loss": 1.6791, "step": 935 }, { "epoch": 1.69, "grad_norm": 0.07032353430986404, "learning_rate": 7.919629358877657e-05, "loss": 1.7024, "step": 936 }, { "epoch": 1.7, "grad_norm": 0.06451990455389023, "learning_rate": 7.900827477667663e-05, "loss": 1.7266, "step": 937 }, { "epoch": 1.7, "grad_norm": 0.06694858521223068, "learning_rate": 7.882033356134603e-05, "loss": 1.6612, "step": 938 }, { "epoch": 1.7, "grad_norm": 0.06609500199556351, "learning_rate": 7.863247063751715e-05, "loss": 1.713, "step": 939 }, { "epoch": 1.7, "grad_norm": 0.06344272941350937, "learning_rate": 7.844468669963289e-05, "loss": 1.6219, "step": 940 }, { "epoch": 1.7, "grad_norm": 0.06307589262723923, "learning_rate": 7.825698244184431e-05, "loss": 1.7042, "step": 941 }, { "epoch": 1.71, "grad_norm": 0.06659837812185287, "learning_rate": 7.806935855800782e-05, "loss": 1.6993, "step": 942 }, { "epoch": 1.71, "grad_norm": 0.06524292379617691, "learning_rate": 7.788181574168283e-05, "loss": 1.6687, "step": 943 }, { "epoch": 1.71, "grad_norm": 0.06560816615819931, "learning_rate": 7.769435468612896e-05, "loss": 1.7081, "step": 944 }, { "epoch": 1.71, "grad_norm": 0.06725630909204483, "learning_rate": 7.750697608430365e-05, "loss": 1.7001, "step": 945 }, { "epoch": 1.71, "grad_norm": 0.06650066375732422, "learning_rate": 7.731968062885956e-05, "loss": 1.7225, "step": 946 }, { "epoch": 1.71, "grad_norm": 0.06517896801233292, "learning_rate": 7.713246901214206e-05, "loss": 1.6299, "step": 947 }, { "epoch": 1.72, "grad_norm": 0.06807747483253479, "learning_rate": 7.694534192618641e-05, "loss": 1.695, "step": 948 }, { "epoch": 1.72, "grad_norm": 0.06809186935424805, "learning_rate": 7.67583000627156e-05, "loss": 1.6611, "step": 949 }, { "epoch": 1.72, "grad_norm": 0.06693090498447418, "learning_rate": 7.657134411313753e-05, "loss": 1.6603, "step": 950 }, { "epoch": 1.72, "grad_norm": 0.06553305685520172, "learning_rate": 7.638447476854245e-05, "loss": 1.7036, "step": 951 }, { "epoch": 1.72, "grad_norm": 0.06823913007974625, "learning_rate": 7.619769271970056e-05, "loss": 1.6848, "step": 952 }, { "epoch": 1.73, "grad_norm": 0.0652228444814682, "learning_rate": 7.601099865705927e-05, "loss": 1.6893, "step": 953 }, { "epoch": 1.73, "grad_norm": 0.07233775407075882, "learning_rate": 7.58243932707409e-05, "loss": 1.6777, "step": 954 }, { "epoch": 1.73, "grad_norm": 0.07119675725698471, "learning_rate": 7.563787725053981e-05, "loss": 1.706, "step": 955 }, { "epoch": 1.73, "grad_norm": 0.06489936262369156, "learning_rate": 7.54514512859201e-05, "loss": 1.6538, "step": 956 }, { "epoch": 1.73, "grad_norm": 0.06696008145809174, "learning_rate": 7.526511606601293e-05, "loss": 1.6862, "step": 957 }, { "epoch": 1.73, "grad_norm": 0.06405473500490189, "learning_rate": 7.507887227961414e-05, "loss": 1.662, "step": 958 }, { "epoch": 1.74, "grad_norm": 0.06998445093631744, "learning_rate": 7.489272061518136e-05, "loss": 1.6604, "step": 959 }, { "epoch": 1.74, "eval_loss": 1.726022481918335, "eval_runtime": 76.3141, "eval_samples_per_second": 65.519, "eval_steps_per_second": 16.38, "step": 959 }, { "epoch": 1.74, "grad_norm": 0.06673965603113174, "learning_rate": 7.470666176083192e-05, "loss": 1.7049, "step": 960 }, { "epoch": 1.74, "grad_norm": 0.06746464222669601, "learning_rate": 7.452069640433997e-05, "loss": 1.6803, "step": 961 }, { "epoch": 1.74, "grad_norm": 0.06396359950304031, "learning_rate": 7.433482523313395e-05, "loss": 1.7104, "step": 962 }, { "epoch": 1.74, "grad_norm": 0.066098153591156, "learning_rate": 7.414904893429433e-05, "loss": 1.6527, "step": 963 }, { "epoch": 1.75, "grad_norm": 0.06473662704229355, "learning_rate": 7.39633681945507e-05, "loss": 1.6891, "step": 964 }, { "epoch": 1.75, "grad_norm": 0.07003339380025864, "learning_rate": 7.377778370027962e-05, "loss": 1.676, "step": 965 }, { "epoch": 1.75, "grad_norm": 0.06654497236013412, "learning_rate": 7.35922961375016e-05, "loss": 1.6601, "step": 966 }, { "epoch": 1.75, "grad_norm": 0.06775406002998352, "learning_rate": 7.340690619187908e-05, "loss": 1.6391, "step": 967 }, { "epoch": 1.75, "grad_norm": 0.06764483451843262, "learning_rate": 7.322161454871356e-05, "loss": 1.7057, "step": 968 }, { "epoch": 1.75, "grad_norm": 0.0728226825594902, "learning_rate": 7.303642189294316e-05, "loss": 1.6793, "step": 969 }, { "epoch": 1.76, "grad_norm": 0.06543935835361481, "learning_rate": 7.285132890914002e-05, "loss": 1.6962, "step": 970 }, { "epoch": 1.76, "grad_norm": 0.06830572336912155, "learning_rate": 7.266633628150801e-05, "loss": 1.6774, "step": 971 }, { "epoch": 1.76, "grad_norm": 0.07373080402612686, "learning_rate": 7.248144469387992e-05, "loss": 1.6815, "step": 972 }, { "epoch": 1.76, "grad_norm": 0.06465107947587967, "learning_rate": 7.229665482971499e-05, "loss": 1.6572, "step": 973 }, { "epoch": 1.76, "grad_norm": 0.06544660031795502, "learning_rate": 7.211196737209653e-05, "loss": 1.6841, "step": 974 }, { "epoch": 1.77, "grad_norm": 0.06559861451387405, "learning_rate": 7.192738300372925e-05, "loss": 1.6835, "step": 975 }, { "epoch": 1.77, "grad_norm": 0.06756362318992615, "learning_rate": 7.174290240693689e-05, "loss": 1.5912, "step": 976 }, { "epoch": 1.77, "grad_norm": 0.06515438854694366, "learning_rate": 7.155852626365938e-05, "loss": 1.6586, "step": 977 }, { "epoch": 1.77, "grad_norm": 0.06673271209001541, "learning_rate": 7.137425525545074e-05, "loss": 1.67, "step": 978 }, { "epoch": 1.77, "grad_norm": 0.06732840090990067, "learning_rate": 7.119009006347625e-05, "loss": 1.6262, "step": 979 }, { "epoch": 1.77, "grad_norm": 0.0666419267654419, "learning_rate": 7.100603136851009e-05, "loss": 1.6963, "step": 980 }, { "epoch": 1.78, "grad_norm": 0.07527624070644379, "learning_rate": 7.082207985093268e-05, "loss": 1.6903, "step": 981 }, { "epoch": 1.78, "grad_norm": 0.06989062577486038, "learning_rate": 7.063823619072838e-05, "loss": 1.6497, "step": 982 }, { "epoch": 1.78, "grad_norm": 0.0654689222574234, "learning_rate": 7.045450106748277e-05, "loss": 1.6782, "step": 983 }, { "epoch": 1.78, "grad_norm": 0.06511061638593674, "learning_rate": 7.027087516038022e-05, "loss": 1.6824, "step": 984 }, { "epoch": 1.78, "grad_norm": 0.06674464046955109, "learning_rate": 7.008735914820138e-05, "loss": 1.7367, "step": 985 }, { "epoch": 1.79, "grad_norm": 0.06592298299074173, "learning_rate": 6.990395370932068e-05, "loss": 1.6879, "step": 986 }, { "epoch": 1.79, "grad_norm": 0.06826543807983398, "learning_rate": 6.97206595217039e-05, "loss": 1.6682, "step": 987 }, { "epoch": 1.79, "grad_norm": 0.06695631891489029, "learning_rate": 6.953747726290535e-05, "loss": 1.7181, "step": 988 }, { "epoch": 1.79, "grad_norm": 0.06656961888074875, "learning_rate": 6.935440761006582e-05, "loss": 1.6778, "step": 989 }, { "epoch": 1.79, "grad_norm": 0.06611720472574234, "learning_rate": 6.917145123990973e-05, "loss": 1.6467, "step": 990 }, { "epoch": 1.79, "grad_norm": 0.06846632063388824, "learning_rate": 6.898860882874279e-05, "loss": 1.7165, "step": 991 }, { "epoch": 1.8, "grad_norm": 0.06631824374198914, "learning_rate": 6.88058810524494e-05, "loss": 1.7042, "step": 992 }, { "epoch": 1.8, "grad_norm": 0.06761027872562408, "learning_rate": 6.862326858649026e-05, "loss": 1.6822, "step": 993 }, { "epoch": 1.8, "grad_norm": 0.06898529827594757, "learning_rate": 6.844077210589986e-05, "loss": 1.6635, "step": 994 }, { "epoch": 1.8, "grad_norm": 0.06683610379695892, "learning_rate": 6.825839228528382e-05, "loss": 1.6949, "step": 995 }, { "epoch": 1.8, "grad_norm": 0.06670662760734558, "learning_rate": 6.807612979881661e-05, "loss": 1.6724, "step": 996 }, { "epoch": 1.81, "grad_norm": 0.19084873795509338, "learning_rate": 6.789398532023894e-05, "loss": 1.7499, "step": 997 }, { "epoch": 1.81, "grad_norm": 0.06561749428510666, "learning_rate": 6.77119595228554e-05, "loss": 1.6733, "step": 998 }, { "epoch": 1.81, "grad_norm": 0.07371030747890472, "learning_rate": 6.753005307953167e-05, "loss": 1.6607, "step": 999 }, { "epoch": 1.81, "grad_norm": 0.0679875835776329, "learning_rate": 6.734826666269238e-05, "loss": 1.6233, "step": 1000 }, { "epoch": 1.81, "grad_norm": 0.0667947381734848, "learning_rate": 6.716660094431846e-05, "loss": 1.6186, "step": 1001 }, { "epoch": 1.81, "grad_norm": 0.06578990817070007, "learning_rate": 6.698505659594466e-05, "loss": 1.6997, "step": 1002 }, { "epoch": 1.82, "grad_norm": 0.07320542633533478, "learning_rate": 6.680363428865704e-05, "loss": 1.6729, "step": 1003 }, { "epoch": 1.82, "grad_norm": 0.06879616528749466, "learning_rate": 6.662233469309058e-05, "loss": 1.6982, "step": 1004 }, { "epoch": 1.82, "grad_norm": 0.06353451311588287, "learning_rate": 6.644115847942667e-05, "loss": 1.6698, "step": 1005 }, { "epoch": 1.82, "grad_norm": 0.06664732843637466, "learning_rate": 6.626010631739054e-05, "loss": 1.6225, "step": 1006 }, { "epoch": 1.82, "grad_norm": 0.0662289708852768, "learning_rate": 6.60791788762489e-05, "loss": 1.713, "step": 1007 }, { "epoch": 1.83, "grad_norm": 0.06735072284936905, "learning_rate": 6.589837682480744e-05, "loss": 1.6431, "step": 1008 }, { "epoch": 1.83, "grad_norm": 0.06567612290382385, "learning_rate": 6.571770083140836e-05, "loss": 1.6972, "step": 1009 }, { "epoch": 1.83, "grad_norm": 0.06742958724498749, "learning_rate": 6.553715156392776e-05, "loss": 1.6439, "step": 1010 }, { "epoch": 1.83, "grad_norm": 0.06748675554990768, "learning_rate": 6.535672968977345e-05, "loss": 1.6711, "step": 1011 }, { "epoch": 1.83, "grad_norm": 0.07259120792150497, "learning_rate": 6.517643587588221e-05, "loss": 1.7223, "step": 1012 }, { "epoch": 1.83, "grad_norm": 0.07579007744789124, "learning_rate": 6.499627078871753e-05, "loss": 1.6614, "step": 1013 }, { "epoch": 1.84, "grad_norm": 0.07152054458856583, "learning_rate": 6.481623509426697e-05, "loss": 1.7038, "step": 1014 }, { "epoch": 1.84, "grad_norm": 0.06873390078544617, "learning_rate": 6.463632945803981e-05, "loss": 1.6602, "step": 1015 }, { "epoch": 1.84, "grad_norm": 0.0664227306842804, "learning_rate": 6.445655454506465e-05, "loss": 1.6916, "step": 1016 }, { "epoch": 1.84, "grad_norm": 0.06599757075309753, "learning_rate": 6.427691101988673e-05, "loss": 1.605, "step": 1017 }, { "epoch": 1.84, "grad_norm": 0.06476866453886032, "learning_rate": 6.40973995465657e-05, "loss": 1.6309, "step": 1018 }, { "epoch": 1.85, "grad_norm": 0.06668147444725037, "learning_rate": 6.391802078867304e-05, "loss": 1.684, "step": 1019 }, { "epoch": 1.85, "grad_norm": 0.06579145044088364, "learning_rate": 6.373877540928972e-05, "loss": 1.6277, "step": 1020 }, { "epoch": 1.85, "grad_norm": 0.06740958243608475, "learning_rate": 6.355966407100346e-05, "loss": 1.728, "step": 1021 }, { "epoch": 1.85, "grad_norm": 0.07092586159706116, "learning_rate": 6.338068743590676e-05, "loss": 1.7091, "step": 1022 }, { "epoch": 1.85, "grad_norm": 0.06797771900892258, "learning_rate": 6.320184616559402e-05, "loss": 1.6962, "step": 1023 }, { "epoch": 1.85, "grad_norm": 0.06833136081695557, "learning_rate": 6.30231409211593e-05, "loss": 1.6981, "step": 1024 }, { "epoch": 1.86, "grad_norm": 0.06703907996416092, "learning_rate": 6.284457236319381e-05, "loss": 1.7082, "step": 1025 }, { "epoch": 1.86, "grad_norm": 0.0666668489575386, "learning_rate": 6.266614115178351e-05, "loss": 1.6198, "step": 1026 }, { "epoch": 1.86, "grad_norm": 0.07242632657289505, "learning_rate": 6.248784794650672e-05, "loss": 1.705, "step": 1027 }, { "epoch": 1.86, "grad_norm": 0.06651555746793747, "learning_rate": 6.230969340643149e-05, "loss": 1.6417, "step": 1028 }, { "epoch": 1.86, "grad_norm": 0.06552428007125854, "learning_rate": 6.213167819011338e-05, "loss": 1.6917, "step": 1029 }, { "epoch": 1.87, "grad_norm": 0.06741311401128769, "learning_rate": 6.195380295559288e-05, "loss": 1.7241, "step": 1030 }, { "epoch": 1.87, "grad_norm": 0.06656550616025925, "learning_rate": 6.177606836039311e-05, "loss": 1.646, "step": 1031 }, { "epoch": 1.87, "grad_norm": 0.06896986067295074, "learning_rate": 6.159847506151719e-05, "loss": 1.6708, "step": 1032 }, { "epoch": 1.87, "grad_norm": 0.06811494380235672, "learning_rate": 6.142102371544604e-05, "loss": 1.6927, "step": 1033 }, { "epoch": 1.87, "grad_norm": 0.06616541743278503, "learning_rate": 6.124371497813582e-05, "loss": 1.6175, "step": 1034 }, { "epoch": 1.87, "grad_norm": 0.06697241216897964, "learning_rate": 6.106654950501547e-05, "loss": 1.6848, "step": 1035 }, { "epoch": 1.88, "grad_norm": 0.06779171526432037, "learning_rate": 6.0889527950984416e-05, "loss": 1.6566, "step": 1036 }, { "epoch": 1.88, "grad_norm": 0.0683891773223877, "learning_rate": 6.071265097041005e-05, "loss": 1.6258, "step": 1037 }, { "epoch": 1.88, "grad_norm": 0.06936081498861313, "learning_rate": 6.053591921712541e-05, "loss": 1.6115, "step": 1038 }, { "epoch": 1.88, "grad_norm": 0.0856877937912941, "learning_rate": 6.035933334442654e-05, "loss": 1.6742, "step": 1039 }, { "epoch": 1.88, "grad_norm": 0.07240041345357895, "learning_rate": 6.01828940050704e-05, "loss": 1.6901, "step": 1040 }, { "epoch": 1.89, "grad_norm": 0.0770583376288414, "learning_rate": 6.000660185127219e-05, "loss": 1.6803, "step": 1041 }, { "epoch": 1.89, "grad_norm": 0.06806863099336624, "learning_rate": 5.983045753470308e-05, "loss": 1.6561, "step": 1042 }, { "epoch": 1.89, "grad_norm": 0.06816756725311279, "learning_rate": 5.965446170648765e-05, "loss": 1.6635, "step": 1043 }, { "epoch": 1.89, "grad_norm": 0.06543378531932831, "learning_rate": 5.947861501720175e-05, "loss": 1.7153, "step": 1044 }, { "epoch": 1.89, "grad_norm": 0.06688012927770615, "learning_rate": 5.930291811686983e-05, "loss": 1.7142, "step": 1045 }, { "epoch": 1.89, "grad_norm": 0.071477010846138, "learning_rate": 5.9127371654962615e-05, "loss": 1.6804, "step": 1046 }, { "epoch": 1.9, "grad_norm": 0.06843505799770355, "learning_rate": 5.8951976280394795e-05, "loss": 1.7476, "step": 1047 }, { "epoch": 1.9, "grad_norm": 0.06697747856378555, "learning_rate": 5.8776732641522503e-05, "loss": 1.662, "step": 1048 }, { "epoch": 1.9, "grad_norm": 0.06771202385425568, "learning_rate": 5.86016413861411e-05, "loss": 1.655, "step": 1049 }, { "epoch": 1.9, "grad_norm": 0.07092612236738205, "learning_rate": 5.842670316148244e-05, "loss": 1.707, "step": 1050 }, { "epoch": 1.9, "grad_norm": 0.06740372627973557, "learning_rate": 5.825191861421285e-05, "loss": 1.673, "step": 1051 }, { "epoch": 1.91, "grad_norm": 0.06587556004524231, "learning_rate": 5.807728839043061e-05, "loss": 1.6879, "step": 1052 }, { "epoch": 1.91, "grad_norm": 0.06834732741117477, "learning_rate": 5.790281313566341e-05, "loss": 1.7233, "step": 1053 }, { "epoch": 1.91, "grad_norm": 0.06691209226846695, "learning_rate": 5.7728493494866134e-05, "loss": 1.6966, "step": 1054 }, { "epoch": 1.91, "grad_norm": 0.06715382635593414, "learning_rate": 5.755433011241851e-05, "loss": 1.7185, "step": 1055 }, { "epoch": 1.91, "grad_norm": 0.06831709295511246, "learning_rate": 5.738032363212258e-05, "loss": 1.6529, "step": 1056 }, { "epoch": 1.91, "grad_norm": 0.06592843681573868, "learning_rate": 5.720647469720033e-05, "loss": 1.6939, "step": 1057 }, { "epoch": 1.92, "grad_norm": 0.06575801223516464, "learning_rate": 5.70327839502915e-05, "loss": 1.6642, "step": 1058 }, { "epoch": 1.92, "grad_norm": 0.07193956524133682, "learning_rate": 5.685925203345108e-05, "loss": 1.6675, "step": 1059 }, { "epoch": 1.92, "grad_norm": 0.0670444443821907, "learning_rate": 5.6685879588146815e-05, "loss": 1.7136, "step": 1060 }, { "epoch": 1.92, "grad_norm": 0.07206844538450241, "learning_rate": 5.651266725525703e-05, "loss": 1.6999, "step": 1061 }, { "epoch": 1.92, "grad_norm": 0.0692375898361206, "learning_rate": 5.633961567506819e-05, "loss": 1.6782, "step": 1062 }, { "epoch": 1.93, "grad_norm": 0.06483175605535507, "learning_rate": 5.6166725487272576e-05, "loss": 1.6448, "step": 1063 }, { "epoch": 1.93, "grad_norm": 0.0667993351817131, "learning_rate": 5.5993997330965796e-05, "loss": 1.6683, "step": 1064 }, { "epoch": 1.93, "grad_norm": 0.0673048198223114, "learning_rate": 5.5821431844644476e-05, "loss": 1.6534, "step": 1065 }, { "epoch": 1.93, "grad_norm": 0.07212254405021667, "learning_rate": 5.564902966620408e-05, "loss": 1.7084, "step": 1066 }, { "epoch": 1.93, "grad_norm": 0.06697355955839157, "learning_rate": 5.547679143293624e-05, "loss": 1.7029, "step": 1067 }, { "epoch": 1.93, "grad_norm": 0.07669904828071594, "learning_rate": 5.530471778152658e-05, "loss": 1.7153, "step": 1068 }, { "epoch": 1.94, "grad_norm": 0.07381530106067657, "learning_rate": 5.513280934805243e-05, "loss": 1.6769, "step": 1069 }, { "epoch": 1.94, "grad_norm": 0.068946473300457, "learning_rate": 5.4961066767980363e-05, "loss": 1.6799, "step": 1070 }, { "epoch": 1.94, "grad_norm": 0.06763108819723129, "learning_rate": 5.478949067616381e-05, "loss": 1.7185, "step": 1071 }, { "epoch": 1.94, "grad_norm": 0.06624120473861694, "learning_rate": 5.4618081706840754e-05, "loss": 1.6972, "step": 1072 }, { "epoch": 1.94, "grad_norm": 0.06670323014259338, "learning_rate": 5.444684049363147e-05, "loss": 1.6826, "step": 1073 }, { "epoch": 1.95, "grad_norm": 0.06699904054403305, "learning_rate": 5.4275767669536146e-05, "loss": 1.643, "step": 1074 }, { "epoch": 1.95, "grad_norm": 0.07036450505256653, "learning_rate": 5.410486386693243e-05, "loss": 1.6719, "step": 1075 }, { "epoch": 1.95, "grad_norm": 0.06482276320457458, "learning_rate": 5.3934129717573165e-05, "loss": 1.6756, "step": 1076 }, { "epoch": 1.95, "grad_norm": 0.06716746836900711, "learning_rate": 5.3763565852584177e-05, "loss": 1.6995, "step": 1077 }, { "epoch": 1.95, "grad_norm": 0.06743574887514114, "learning_rate": 5.3593172902461717e-05, "loss": 1.7064, "step": 1078 }, { "epoch": 1.95, "grad_norm": 0.06770848482847214, "learning_rate": 5.342295149707025e-05, "loss": 1.6588, "step": 1079 }, { "epoch": 1.96, "grad_norm": 0.06666205823421478, "learning_rate": 5.325290226564017e-05, "loss": 1.6215, "step": 1080 }, { "epoch": 1.96, "grad_norm": 0.0728970617055893, "learning_rate": 5.308302583676548e-05, "loss": 1.6878, "step": 1081 }, { "epoch": 1.96, "grad_norm": 0.06758435070514679, "learning_rate": 5.291332283840125e-05, "loss": 1.6422, "step": 1082 }, { "epoch": 1.96, "grad_norm": 0.06901335716247559, "learning_rate": 5.274379389786154e-05, "loss": 1.7208, "step": 1083 }, { "epoch": 1.96, "grad_norm": 0.06578974425792694, "learning_rate": 5.2574439641817006e-05, "loss": 1.6822, "step": 1084 }, { "epoch": 1.97, "grad_norm": 0.08507327735424042, "learning_rate": 5.240526069629265e-05, "loss": 1.6697, "step": 1085 }, { "epoch": 1.97, "grad_norm": 0.06818517297506332, "learning_rate": 5.223625768666528e-05, "loss": 1.7514, "step": 1086 }, { "epoch": 1.97, "grad_norm": 0.06869194656610489, "learning_rate": 5.206743123766139e-05, "loss": 1.6667, "step": 1087 }, { "epoch": 1.97, "grad_norm": 0.06622481346130371, "learning_rate": 5.1898781973354914e-05, "loss": 1.6807, "step": 1088 }, { "epoch": 1.97, "grad_norm": 0.07047388702630997, "learning_rate": 5.173031051716472e-05, "loss": 1.7118, "step": 1089 }, { "epoch": 1.97, "grad_norm": 0.0671396255493164, "learning_rate": 5.1562017491852387e-05, "loss": 1.641, "step": 1090 }, { "epoch": 1.98, "grad_norm": 0.06699879467487335, "learning_rate": 5.139390351951997e-05, "loss": 1.689, "step": 1091 }, { "epoch": 1.98, "grad_norm": 0.06538563221693039, "learning_rate": 5.122596922160768e-05, "loss": 1.6552, "step": 1092 }, { "epoch": 1.98, "grad_norm": 0.06701681017875671, "learning_rate": 5.105821521889147e-05, "loss": 1.6229, "step": 1093 }, { "epoch": 1.98, "grad_norm": 0.06672403961420059, "learning_rate": 5.089064213148082e-05, "loss": 1.695, "step": 1094 }, { "epoch": 1.98, "grad_norm": 0.06800191104412079, "learning_rate": 5.0723250578816576e-05, "loss": 1.6773, "step": 1095 }, { "epoch": 1.99, "grad_norm": 0.066898874938488, "learning_rate": 5.0556041179668354e-05, "loss": 1.6562, "step": 1096 }, { "epoch": 1.99, "eval_loss": 1.7255171537399292, "eval_runtime": 76.5349, "eval_samples_per_second": 65.33, "eval_steps_per_second": 16.332, "step": 1096 }, { "epoch": 1.99, "grad_norm": 0.07488064467906952, "learning_rate": 5.0389014552132606e-05, "loss": 1.7073, "step": 1097 }, { "epoch": 1.99, "grad_norm": 0.06735250353813171, "learning_rate": 5.0222171313630004e-05, "loss": 1.7315, "step": 1098 }, { "epoch": 1.99, "grad_norm": 0.06765727698802948, "learning_rate": 5.005551208090348e-05, "loss": 1.6667, "step": 1099 }, { "epoch": 1.99, "grad_norm": 0.06766493618488312, "learning_rate": 4.988903747001563e-05, "loss": 1.6702, "step": 1100 }, { "epoch": 1.99, "grad_norm": 0.06818216294050217, "learning_rate": 4.9722748096346625e-05, "loss": 1.6871, "step": 1101 }, { "epoch": 2.0, "grad_norm": 0.07047205418348312, "learning_rate": 4.955664457459197e-05, "loss": 1.7162, "step": 1102 }, { "epoch": 2.0, "grad_norm": 0.1438203603029251, "learning_rate": 4.939072751876014e-05, "loss": 1.6952, "step": 1103 }, { "epoch": 2.0, "grad_norm": 0.06903135031461716, "learning_rate": 4.922499754217026e-05, "loss": 1.7198, "step": 1104 }, { "epoch": 2.0, "grad_norm": 0.0666298121213913, "learning_rate": 4.9059455257449935e-05, "loss": 1.6737, "step": 1105 }, { "epoch": 2.0, "grad_norm": 0.0657208189368248, "learning_rate": 4.8894101276533055e-05, "loss": 1.6637, "step": 1106 }, { "epoch": 2.01, "grad_norm": 0.06741775572299957, "learning_rate": 4.872893621065727e-05, "loss": 1.6657, "step": 1107 }, { "epoch": 2.01, "grad_norm": 0.06579755991697311, "learning_rate": 4.8563960670362076e-05, "loss": 1.7045, "step": 1108 }, { "epoch": 2.01, "grad_norm": 0.06663426011800766, "learning_rate": 4.839917526548622e-05, "loss": 1.692, "step": 1109 }, { "epoch": 2.01, "grad_norm": 0.06874460726976395, "learning_rate": 4.8234580605165744e-05, "loss": 1.6418, "step": 1110 }, { "epoch": 2.0, "grad_norm": 0.06945845484733582, "learning_rate": 4.80701772978315e-05, "loss": 1.6268, "step": 1111 }, { "epoch": 2.0, "grad_norm": 0.06622739881277084, "learning_rate": 4.790596595120699e-05, "loss": 1.6178, "step": 1112 }, { "epoch": 2.01, "grad_norm": 0.06573659926652908, "learning_rate": 4.774194717230618e-05, "loss": 1.6314, "step": 1113 }, { "epoch": 2.01, "grad_norm": 0.06432535499334335, "learning_rate": 4.757812156743124e-05, "loss": 1.6731, "step": 1114 }, { "epoch": 2.01, "grad_norm": 0.06486698985099792, "learning_rate": 4.7414489742170175e-05, "loss": 1.6806, "step": 1115 }, { "epoch": 2.01, "grad_norm": 0.0704229399561882, "learning_rate": 4.725105230139465e-05, "loss": 1.6605, "step": 1116 }, { "epoch": 2.01, "grad_norm": 0.06596864014863968, "learning_rate": 4.7087809849257925e-05, "loss": 1.6765, "step": 1117 }, { "epoch": 2.01, "grad_norm": 0.07028446346521378, "learning_rate": 4.6924762989192314e-05, "loss": 1.5877, "step": 1118 }, { "epoch": 2.02, "grad_norm": 0.06906835734844208, "learning_rate": 4.6761912323907266e-05, "loss": 1.6345, "step": 1119 }, { "epoch": 2.02, "grad_norm": 0.06638389825820923, "learning_rate": 4.659925845538683e-05, "loss": 1.6482, "step": 1120 }, { "epoch": 2.02, "grad_norm": 0.06999731063842773, "learning_rate": 4.643680198488775e-05, "loss": 1.636, "step": 1121 }, { "epoch": 2.02, "grad_norm": 0.06815838068723679, "learning_rate": 4.627454351293697e-05, "loss": 1.6351, "step": 1122 }, { "epoch": 2.02, "grad_norm": 0.0672587901353836, "learning_rate": 4.611248363932952e-05, "loss": 1.6199, "step": 1123 }, { "epoch": 2.03, "grad_norm": 0.07053370773792267, "learning_rate": 4.595062296312637e-05, "loss": 1.6576, "step": 1124 }, { "epoch": 2.03, "grad_norm": 0.07248280197381973, "learning_rate": 4.578896208265217e-05, "loss": 1.6646, "step": 1125 }, { "epoch": 2.03, "grad_norm": 0.0717586800456047, "learning_rate": 4.562750159549289e-05, "loss": 1.621, "step": 1126 }, { "epoch": 2.03, "grad_norm": 0.07123278826475143, "learning_rate": 4.546624209849383e-05, "loss": 1.6427, "step": 1127 }, { "epoch": 2.03, "grad_norm": 0.07100563496351242, "learning_rate": 4.530518418775733e-05, "loss": 1.611, "step": 1128 }, { "epoch": 2.03, "grad_norm": 0.07044953107833862, "learning_rate": 4.5144328458640495e-05, "loss": 1.6279, "step": 1129 }, { "epoch": 2.04, "grad_norm": 0.06963769346475601, "learning_rate": 4.498367550575314e-05, "loss": 1.6702, "step": 1130 }, { "epoch": 2.04, "grad_norm": 0.06952192634344101, "learning_rate": 4.48232259229554e-05, "loss": 1.5885, "step": 1131 }, { "epoch": 2.04, "grad_norm": 0.07034874707460403, "learning_rate": 4.46629803033558e-05, "loss": 1.6689, "step": 1132 }, { "epoch": 2.04, "grad_norm": 0.0707901194691658, "learning_rate": 4.450293923930876e-05, "loss": 1.6286, "step": 1133 }, { "epoch": 2.04, "grad_norm": 0.07061001658439636, "learning_rate": 4.434310332241257e-05, "loss": 1.662, "step": 1134 }, { "epoch": 2.05, "grad_norm": 0.07052913308143616, "learning_rate": 4.418347314350726e-05, "loss": 1.6287, "step": 1135 }, { "epoch": 2.05, "grad_norm": 0.14670002460479736, "learning_rate": 4.402404929267235e-05, "loss": 1.5772, "step": 1136 }, { "epoch": 2.05, "grad_norm": 0.07111482322216034, "learning_rate": 4.3864832359224574e-05, "loss": 1.7201, "step": 1137 }, { "epoch": 2.05, "grad_norm": 0.07072410732507706, "learning_rate": 4.3705822931715775e-05, "loss": 1.6462, "step": 1138 }, { "epoch": 2.05, "grad_norm": 0.07244502007961273, "learning_rate": 4.3547021597930884e-05, "loss": 1.6621, "step": 1139 }, { "epoch": 2.05, "grad_norm": 0.07025172561407089, "learning_rate": 4.338842894488544e-05, "loss": 1.6013, "step": 1140 }, { "epoch": 2.06, "grad_norm": 0.06991175562143326, "learning_rate": 4.3230045558823727e-05, "loss": 1.5937, "step": 1141 }, { "epoch": 2.06, "grad_norm": 0.0720113068819046, "learning_rate": 4.307187202521632e-05, "loss": 1.7092, "step": 1142 }, { "epoch": 2.06, "grad_norm": 0.06878534704446793, "learning_rate": 4.291390892875824e-05, "loss": 1.6834, "step": 1143 }, { "epoch": 2.06, "grad_norm": 0.07022685557603836, "learning_rate": 4.275615685336646e-05, "loss": 1.6046, "step": 1144 }, { "epoch": 2.06, "grad_norm": 0.07354921847581863, "learning_rate": 4.259861638217794e-05, "loss": 1.6252, "step": 1145 }, { "epoch": 2.07, "grad_norm": 0.07390516251325607, "learning_rate": 4.2441288097547496e-05, "loss": 1.6652, "step": 1146 }, { "epoch": 2.07, "grad_norm": 0.0713810846209526, "learning_rate": 4.22841725810456e-05, "loss": 1.6332, "step": 1147 }, { "epoch": 2.07, "grad_norm": 0.07300698012113571, "learning_rate": 4.212727041345613e-05, "loss": 1.6361, "step": 1148 }, { "epoch": 2.07, "grad_norm": 0.07520492374897003, "learning_rate": 4.197058217477433e-05, "loss": 1.6409, "step": 1149 }, { "epoch": 2.07, "grad_norm": 0.07239343971014023, "learning_rate": 4.181410844420474e-05, "loss": 1.6662, "step": 1150 }, { "epoch": 2.07, "grad_norm": 0.0685645192861557, "learning_rate": 4.16578498001588e-05, "loss": 1.6523, "step": 1151 }, { "epoch": 2.08, "grad_norm": 0.07199341803789139, "learning_rate": 4.1501806820253065e-05, "loss": 1.6386, "step": 1152 }, { "epoch": 2.08, "grad_norm": 0.0726553425192833, "learning_rate": 4.13459800813067e-05, "loss": 1.6506, "step": 1153 }, { "epoch": 2.08, "grad_norm": 0.0729970782995224, "learning_rate": 4.119037015933967e-05, "loss": 1.6561, "step": 1154 }, { "epoch": 2.08, "grad_norm": 0.07292263209819794, "learning_rate": 4.103497762957039e-05, "loss": 1.6288, "step": 1155 }, { "epoch": 2.08, "grad_norm": 0.07344435155391693, "learning_rate": 4.087980306641365e-05, "loss": 1.6016, "step": 1156 }, { "epoch": 2.09, "grad_norm": 0.07376082986593246, "learning_rate": 4.072484704347856e-05, "loss": 1.6657, "step": 1157 }, { "epoch": 2.09, "grad_norm": 0.07173088192939758, "learning_rate": 4.057011013356648e-05, "loss": 1.5845, "step": 1158 }, { "epoch": 2.09, "grad_norm": 0.07206153124570847, "learning_rate": 4.041559290866862e-05, "loss": 1.6198, "step": 1159 }, { "epoch": 2.09, "grad_norm": 0.07455331832170486, "learning_rate": 4.026129593996422e-05, "loss": 1.6247, "step": 1160 }, { "epoch": 2.09, "grad_norm": 0.07515669614076614, "learning_rate": 4.010721979781836e-05, "loss": 1.5627, "step": 1161 }, { "epoch": 2.09, "grad_norm": 0.07117699086666107, "learning_rate": 3.995336505177975e-05, "loss": 1.6536, "step": 1162 }, { "epoch": 2.1, "grad_norm": 0.07095436751842499, "learning_rate": 3.979973227057879e-05, "loss": 1.5989, "step": 1163 }, { "epoch": 2.1, "grad_norm": 0.07377563416957855, "learning_rate": 3.964632202212526e-05, "loss": 1.6961, "step": 1164 }, { "epoch": 2.1, "grad_norm": 0.07506656646728516, "learning_rate": 3.949313487350649e-05, "loss": 1.6876, "step": 1165 }, { "epoch": 2.1, "grad_norm": 0.07030579447746277, "learning_rate": 3.934017139098498e-05, "loss": 1.6001, "step": 1166 }, { "epoch": 2.1, "grad_norm": 0.07692673802375793, "learning_rate": 3.918743213999646e-05, "loss": 1.5694, "step": 1167 }, { "epoch": 2.11, "grad_norm": 0.07509323209524155, "learning_rate": 3.903491768514789e-05, "loss": 1.6334, "step": 1168 }, { "epoch": 2.11, "grad_norm": 0.07726525515317917, "learning_rate": 3.8882628590215074e-05, "loss": 1.6794, "step": 1169 }, { "epoch": 2.11, "grad_norm": 0.07405252754688263, "learning_rate": 3.8730565418140975e-05, "loss": 1.6583, "step": 1170 }, { "epoch": 2.11, "grad_norm": 0.07292646914720535, "learning_rate": 3.857872873103322e-05, "loss": 1.6246, "step": 1171 }, { "epoch": 2.11, "grad_norm": 0.07286576926708221, "learning_rate": 3.842711909016241e-05, "loss": 1.6792, "step": 1172 }, { "epoch": 2.11, "grad_norm": 0.07348307967185974, "learning_rate": 3.827573705595969e-05, "loss": 1.5941, "step": 1173 }, { "epoch": 2.12, "grad_norm": 0.07215212285518646, "learning_rate": 3.812458318801502e-05, "loss": 1.6452, "step": 1174 }, { "epoch": 2.12, "grad_norm": 0.07415865361690521, "learning_rate": 3.797365804507475e-05, "loss": 1.6339, "step": 1175 }, { "epoch": 2.12, "grad_norm": 0.07542405277490616, "learning_rate": 3.7822962185039914e-05, "loss": 1.6573, "step": 1176 }, { "epoch": 2.12, "grad_norm": 0.07552020251750946, "learning_rate": 3.7672496164963866e-05, "loss": 1.6285, "step": 1177 }, { "epoch": 2.12, "grad_norm": 0.07570580393075943, "learning_rate": 3.752226054105038e-05, "loss": 1.617, "step": 1178 }, { "epoch": 2.13, "grad_norm": 0.07173493504524231, "learning_rate": 3.737225586865162e-05, "loss": 1.6491, "step": 1179 }, { "epoch": 2.13, "grad_norm": 0.07744941860437393, "learning_rate": 3.7222482702265925e-05, "loss": 1.6662, "step": 1180 }, { "epoch": 2.13, "grad_norm": 0.07182161509990692, "learning_rate": 3.707294159553599e-05, "loss": 1.6019, "step": 1181 }, { "epoch": 2.13, "grad_norm": 0.0749620795249939, "learning_rate": 3.692363310124654e-05, "loss": 1.7007, "step": 1182 }, { "epoch": 2.13, "grad_norm": 0.07278329133987427, "learning_rate": 3.67745577713226e-05, "loss": 1.5967, "step": 1183 }, { "epoch": 2.13, "grad_norm": 0.0723608136177063, "learning_rate": 3.662571615682714e-05, "loss": 1.6513, "step": 1184 }, { "epoch": 2.14, "grad_norm": 0.07207518815994263, "learning_rate": 3.6477108807959336e-05, "loss": 1.6281, "step": 1185 }, { "epoch": 2.14, "grad_norm": 0.07690425962209702, "learning_rate": 3.6328736274052254e-05, "loss": 1.6363, "step": 1186 }, { "epoch": 2.14, "grad_norm": 0.07670443505048752, "learning_rate": 3.618059910357109e-05, "loss": 1.6787, "step": 1187 }, { "epoch": 2.14, "grad_norm": 0.07191004604101181, "learning_rate": 3.60326978441109e-05, "loss": 1.5742, "step": 1188 }, { "epoch": 2.14, "grad_norm": 0.0717279464006424, "learning_rate": 3.58850330423947e-05, "loss": 1.6738, "step": 1189 }, { "epoch": 2.15, "grad_norm": 0.0730665773153305, "learning_rate": 3.573760524427153e-05, "loss": 1.576, "step": 1190 }, { "epoch": 2.15, "grad_norm": 0.07341915369033813, "learning_rate": 3.5590414994714194e-05, "loss": 1.7294, "step": 1191 }, { "epoch": 2.15, "grad_norm": 0.07364702224731445, "learning_rate": 3.544346283781752e-05, "loss": 1.6801, "step": 1192 }, { "epoch": 2.15, "grad_norm": 0.0715540423989296, "learning_rate": 3.529674931679609e-05, "loss": 1.6198, "step": 1193 }, { "epoch": 2.15, "grad_norm": 0.07254005968570709, "learning_rate": 3.515027497398251e-05, "loss": 1.6487, "step": 1194 }, { "epoch": 2.15, "grad_norm": 0.07599195837974548, "learning_rate": 3.5004040350825084e-05, "loss": 1.6699, "step": 1195 }, { "epoch": 2.16, "grad_norm": 0.07351738959550858, "learning_rate": 3.4858045987886145e-05, "loss": 1.623, "step": 1196 }, { "epoch": 2.16, "grad_norm": 0.07156158983707428, "learning_rate": 3.471229242483973e-05, "loss": 1.6142, "step": 1197 }, { "epoch": 2.16, "grad_norm": 0.07604535669088364, "learning_rate": 3.456678020046992e-05, "loss": 1.5599, "step": 1198 }, { "epoch": 2.16, "grad_norm": 0.0729883536696434, "learning_rate": 3.442150985266854e-05, "loss": 1.6969, "step": 1199 }, { "epoch": 2.16, "grad_norm": 0.07562372833490372, "learning_rate": 3.427648191843336e-05, "loss": 1.6663, "step": 1200 }, { "epoch": 2.17, "grad_norm": 0.0742512047290802, "learning_rate": 3.4131696933866096e-05, "loss": 1.6417, "step": 1201 }, { "epoch": 2.17, "grad_norm": 0.07412253320217133, "learning_rate": 3.39871554341703e-05, "loss": 1.6504, "step": 1202 }, { "epoch": 2.17, "grad_norm": 0.08407189697027206, "learning_rate": 3.38428579536496e-05, "loss": 1.6699, "step": 1203 }, { "epoch": 2.17, "grad_norm": 0.07601872086524963, "learning_rate": 3.369880502570545e-05, "loss": 1.5881, "step": 1204 }, { "epoch": 2.17, "grad_norm": 0.07239924371242523, "learning_rate": 3.355499718283545e-05, "loss": 1.6016, "step": 1205 }, { "epoch": 2.17, "grad_norm": 0.07520369440317154, "learning_rate": 3.341143495663109e-05, "loss": 1.6176, "step": 1206 }, { "epoch": 2.18, "grad_norm": 0.07146305590867996, "learning_rate": 3.3268118877776066e-05, "loss": 1.6098, "step": 1207 }, { "epoch": 2.18, "grad_norm": 0.07684969156980515, "learning_rate": 3.3125049476044056e-05, "loss": 1.6296, "step": 1208 }, { "epoch": 2.18, "grad_norm": 0.0767345130443573, "learning_rate": 3.298222728029702e-05, "loss": 1.6311, "step": 1209 }, { "epoch": 2.18, "grad_norm": 0.07862910628318787, "learning_rate": 3.283965281848297e-05, "loss": 1.6245, "step": 1210 }, { "epoch": 2.18, "grad_norm": 0.07340650260448456, "learning_rate": 3.269732661763421e-05, "loss": 1.6281, "step": 1211 }, { "epoch": 2.19, "grad_norm": 0.07612470537424088, "learning_rate": 3.2555249203865445e-05, "loss": 1.6195, "step": 1212 }, { "epoch": 2.19, "grad_norm": 0.07243123650550842, "learning_rate": 3.241342110237152e-05, "loss": 1.6813, "step": 1213 }, { "epoch": 2.19, "grad_norm": 0.07734370231628418, "learning_rate": 3.227184283742591e-05, "loss": 1.6417, "step": 1214 }, { "epoch": 2.19, "grad_norm": 0.07529491186141968, "learning_rate": 3.2130514932378366e-05, "loss": 1.6924, "step": 1215 }, { "epoch": 2.19, "grad_norm": 0.0757734403014183, "learning_rate": 3.198943790965332e-05, "loss": 1.6688, "step": 1216 }, { "epoch": 2.19, "grad_norm": 0.07498044520616531, "learning_rate": 3.184861229074769e-05, "loss": 1.6656, "step": 1217 }, { "epoch": 2.2, "grad_norm": 0.07474586367607117, "learning_rate": 3.1708038596229195e-05, "loss": 1.6515, "step": 1218 }, { "epoch": 2.2, "grad_norm": 0.07445251941680908, "learning_rate": 3.156771734573416e-05, "loss": 1.6921, "step": 1219 }, { "epoch": 2.2, "grad_norm": 0.07668186724185944, "learning_rate": 3.142764905796589e-05, "loss": 1.6171, "step": 1220 }, { "epoch": 2.2, "grad_norm": 0.07463613897562027, "learning_rate": 3.1287834250692494e-05, "loss": 1.6333, "step": 1221 }, { "epoch": 2.2, "grad_norm": 0.07449369132518768, "learning_rate": 3.114827344074508e-05, "loss": 1.6402, "step": 1222 }, { "epoch": 2.21, "grad_norm": 0.07567483186721802, "learning_rate": 3.1008967144015954e-05, "loss": 1.7189, "step": 1223 }, { "epoch": 2.21, "grad_norm": 0.0763445794582367, "learning_rate": 3.086991587545645e-05, "loss": 1.5962, "step": 1224 }, { "epoch": 2.21, "grad_norm": 0.0785403922200203, "learning_rate": 3.0731120149075335e-05, "loss": 1.6571, "step": 1225 }, { "epoch": 2.21, "grad_norm": 0.07129363715648651, "learning_rate": 3.059258047793661e-05, "loss": 1.6312, "step": 1226 }, { "epoch": 2.21, "grad_norm": 0.07397794723510742, "learning_rate": 3.045429737415789e-05, "loss": 1.6825, "step": 1227 }, { "epoch": 2.21, "grad_norm": 0.07527241855859756, "learning_rate": 3.0316271348908254e-05, "loss": 1.6808, "step": 1228 }, { "epoch": 2.22, "grad_norm": 0.07551155239343643, "learning_rate": 3.0178502912406592e-05, "loss": 1.6667, "step": 1229 }, { "epoch": 2.22, "grad_norm": 0.08903563767671585, "learning_rate": 3.0040992573919503e-05, "loss": 1.6995, "step": 1230 }, { "epoch": 2.22, "grad_norm": 0.07444559782743454, "learning_rate": 2.990374084175963e-05, "loss": 1.7107, "step": 1231 }, { "epoch": 2.22, "grad_norm": 0.07313070446252823, "learning_rate": 2.976674822328357e-05, "loss": 1.6549, "step": 1232 }, { "epoch": 2.22, "grad_norm": 0.07412869483232498, "learning_rate": 2.9630015224890084e-05, "loss": 1.6455, "step": 1233 }, { "epoch": 2.22, "eval_loss": 1.7308404445648193, "eval_runtime": 76.3384, "eval_samples_per_second": 65.498, "eval_steps_per_second": 16.374, "step": 1233 }, { "epoch": 2.23, "grad_norm": 0.07468422502279282, "learning_rate": 2.9493542352018365e-05, "loss": 1.569, "step": 1234 }, { "epoch": 2.23, "grad_norm": 0.07683281600475311, "learning_rate": 2.9357330109145897e-05, "loss": 1.6725, "step": 1235 }, { "epoch": 2.23, "grad_norm": 0.07285214960575104, "learning_rate": 2.9221378999786853e-05, "loss": 1.6817, "step": 1236 }, { "epoch": 2.23, "grad_norm": 0.0746966302394867, "learning_rate": 2.908568952649e-05, "loss": 1.6112, "step": 1237 }, { "epoch": 2.23, "grad_norm": 0.07568127661943436, "learning_rate": 2.8950262190837064e-05, "loss": 1.7395, "step": 1238 }, { "epoch": 2.23, "grad_norm": 0.07346561551094055, "learning_rate": 2.881509749344068e-05, "loss": 1.6174, "step": 1239 }, { "epoch": 2.24, "grad_norm": 0.07394208759069443, "learning_rate": 2.8680195933942722e-05, "loss": 1.6363, "step": 1240 }, { "epoch": 2.24, "grad_norm": 0.07679972052574158, "learning_rate": 2.8545558011012274e-05, "loss": 1.5835, "step": 1241 }, { "epoch": 2.24, "grad_norm": 0.0740526095032692, "learning_rate": 2.8411184222343902e-05, "loss": 1.5996, "step": 1242 }, { "epoch": 2.24, "grad_norm": 0.08160841464996338, "learning_rate": 2.8277075064655843e-05, "loss": 1.6428, "step": 1243 }, { "epoch": 2.24, "grad_norm": 0.07413484156131744, "learning_rate": 2.814323103368802e-05, "loss": 1.6615, "step": 1244 }, { "epoch": 2.25, "grad_norm": 0.07478610426187515, "learning_rate": 2.800965262420043e-05, "loss": 1.6479, "step": 1245 }, { "epoch": 2.25, "grad_norm": 0.0758727416396141, "learning_rate": 2.787634032997105e-05, "loss": 1.6435, "step": 1246 }, { "epoch": 2.25, "grad_norm": 0.07492438703775406, "learning_rate": 2.7743294643794272e-05, "loss": 1.6349, "step": 1247 }, { "epoch": 2.25, "grad_norm": 0.0757269635796547, "learning_rate": 2.7610516057478853e-05, "loss": 1.6629, "step": 1248 }, { "epoch": 2.25, "grad_norm": 0.07544314861297607, "learning_rate": 2.747800506184631e-05, "loss": 1.6507, "step": 1249 }, { "epoch": 2.25, "grad_norm": 0.0811799094080925, "learning_rate": 2.7345762146728903e-05, "loss": 1.7006, "step": 1250 }, { "epoch": 2.26, "grad_norm": 0.07570774853229523, "learning_rate": 2.7213787800968027e-05, "loss": 1.6383, "step": 1251 }, { "epoch": 2.26, "grad_norm": 0.07623685896396637, "learning_rate": 2.708208251241219e-05, "loss": 1.661, "step": 1252 }, { "epoch": 2.26, "grad_norm": 0.09012570232152939, "learning_rate": 2.6950646767915345e-05, "loss": 1.6021, "step": 1253 }, { "epoch": 2.26, "grad_norm": 0.07601132243871689, "learning_rate": 2.6819481053335162e-05, "loss": 1.6452, "step": 1254 }, { "epoch": 2.26, "grad_norm": 0.07657255977392197, "learning_rate": 2.668858585353099e-05, "loss": 1.656, "step": 1255 }, { "epoch": 2.27, "grad_norm": 0.07921941578388214, "learning_rate": 2.655796165236234e-05, "loss": 1.6537, "step": 1256 }, { "epoch": 2.27, "grad_norm": 0.07938221842050552, "learning_rate": 2.6427608932686843e-05, "loss": 1.594, "step": 1257 }, { "epoch": 2.27, "grad_norm": 0.07329648733139038, "learning_rate": 2.62975281763587e-05, "loss": 1.6775, "step": 1258 }, { "epoch": 2.27, "grad_norm": 0.08492547273635864, "learning_rate": 2.6167719864226702e-05, "loss": 1.6339, "step": 1259 }, { "epoch": 2.27, "grad_norm": 0.07563217729330063, "learning_rate": 2.603818447613261e-05, "loss": 1.6223, "step": 1260 }, { "epoch": 2.27, "grad_norm": 0.07341662049293518, "learning_rate": 2.5908922490909236e-05, "loss": 1.6311, "step": 1261 }, { "epoch": 2.28, "grad_norm": 0.07573051750659943, "learning_rate": 2.5779934386378822e-05, "loss": 1.675, "step": 1262 }, { "epoch": 2.28, "grad_norm": 0.0771564245223999, "learning_rate": 2.565122063935116e-05, "loss": 1.6441, "step": 1263 }, { "epoch": 2.28, "grad_norm": 0.07614730298519135, "learning_rate": 2.5522781725621813e-05, "loss": 1.6151, "step": 1264 }, { "epoch": 2.28, "grad_norm": 0.07348419725894928, "learning_rate": 2.539461811997056e-05, "loss": 1.6288, "step": 1265 }, { "epoch": 2.28, "grad_norm": 0.07890129089355469, "learning_rate": 2.5266730296159312e-05, "loss": 1.6564, "step": 1266 }, { "epoch": 2.29, "grad_norm": 0.07503450661897659, "learning_rate": 2.51391187269307e-05, "loss": 1.6097, "step": 1267 }, { "epoch": 2.29, "grad_norm": 0.07627269625663757, "learning_rate": 2.5011783884006058e-05, "loss": 1.6129, "step": 1268 }, { "epoch": 2.29, "grad_norm": 0.07332798093557358, "learning_rate": 2.4884726238083865e-05, "loss": 1.6485, "step": 1269 }, { "epoch": 2.29, "grad_norm": 0.07669366151094437, "learning_rate": 2.475794625883785e-05, "loss": 1.6302, "step": 1270 }, { "epoch": 2.29, "grad_norm": 0.07550175487995148, "learning_rate": 2.463144441491545e-05, "loss": 1.6248, "step": 1271 }, { "epoch": 2.29, "grad_norm": 0.07559078931808472, "learning_rate": 2.450522117393582e-05, "loss": 1.6298, "step": 1272 }, { "epoch": 2.3, "grad_norm": 0.07644450664520264, "learning_rate": 2.437927700248839e-05, "loss": 1.6247, "step": 1273 }, { "epoch": 2.3, "grad_norm": 0.0748947337269783, "learning_rate": 2.4253612366130897e-05, "loss": 1.6577, "step": 1274 }, { "epoch": 2.3, "grad_norm": 0.07806424796581268, "learning_rate": 2.4128227729387787e-05, "loss": 1.6529, "step": 1275 }, { "epoch": 2.3, "grad_norm": 0.07620666921138763, "learning_rate": 2.400312355574853e-05, "loss": 1.6554, "step": 1276 }, { "epoch": 2.3, "grad_norm": 0.07409234344959259, "learning_rate": 2.3878300307665756e-05, "loss": 1.6409, "step": 1277 }, { "epoch": 2.31, "grad_norm": 0.07535138726234436, "learning_rate": 2.3753758446553765e-05, "loss": 1.6733, "step": 1278 }, { "epoch": 2.31, "grad_norm": 0.07349901646375656, "learning_rate": 2.3629498432786557e-05, "loss": 1.6446, "step": 1279 }, { "epoch": 2.31, "grad_norm": 0.07573609054088593, "learning_rate": 2.3505520725696428e-05, "loss": 1.6683, "step": 1280 }, { "epoch": 2.31, "grad_norm": 0.07504112273454666, "learning_rate": 2.3381825783571953e-05, "loss": 1.6052, "step": 1281 }, { "epoch": 2.31, "grad_norm": 0.0894421637058258, "learning_rate": 2.325841406365661e-05, "loss": 1.5536, "step": 1282 }, { "epoch": 2.31, "grad_norm": 0.07583224773406982, "learning_rate": 2.3135286022146785e-05, "loss": 1.6603, "step": 1283 }, { "epoch": 2.32, "grad_norm": 0.07644784450531006, "learning_rate": 2.301244211419038e-05, "loss": 1.6207, "step": 1284 }, { "epoch": 2.32, "grad_norm": 0.07893984019756317, "learning_rate": 2.28898827938849e-05, "loss": 1.6615, "step": 1285 }, { "epoch": 2.32, "grad_norm": 0.07783831655979156, "learning_rate": 2.2767608514275817e-05, "loss": 1.6145, "step": 1286 }, { "epoch": 2.32, "grad_norm": 0.07868466526269913, "learning_rate": 2.2645619727355084e-05, "loss": 1.649, "step": 1287 }, { "epoch": 2.32, "grad_norm": 0.07510129362344742, "learning_rate": 2.252391688405915e-05, "loss": 1.5836, "step": 1288 }, { "epoch": 2.33, "grad_norm": 0.07628342509269714, "learning_rate": 2.2402500434267613e-05, "loss": 1.6033, "step": 1289 }, { "epoch": 2.33, "grad_norm": 0.07488032430410385, "learning_rate": 2.2281370826801295e-05, "loss": 1.7127, "step": 1290 }, { "epoch": 2.33, "grad_norm": 0.0785377100110054, "learning_rate": 2.2160528509420765e-05, "loss": 1.6378, "step": 1291 }, { "epoch": 2.33, "grad_norm": 0.0775563046336174, "learning_rate": 2.203997392882454e-05, "loss": 1.6515, "step": 1292 }, { "epoch": 2.33, "grad_norm": 0.0757281482219696, "learning_rate": 2.1919707530647614e-05, "loss": 1.6693, "step": 1293 }, { "epoch": 2.34, "grad_norm": 0.0739988461136818, "learning_rate": 2.1799729759459563e-05, "loss": 1.6119, "step": 1294 }, { "epoch": 2.34, "grad_norm": 0.07702571153640747, "learning_rate": 2.168004105876319e-05, "loss": 1.6474, "step": 1295 }, { "epoch": 2.34, "grad_norm": 0.07542101293802261, "learning_rate": 2.1560641870992616e-05, "loss": 1.611, "step": 1296 }, { "epoch": 2.34, "grad_norm": 0.07655727118253708, "learning_rate": 2.1441532637511798e-05, "loss": 1.6509, "step": 1297 }, { "epoch": 2.34, "grad_norm": 0.07552295923233032, "learning_rate": 2.1322713798612936e-05, "loss": 1.6654, "step": 1298 }, { "epoch": 2.34, "grad_norm": 0.07487693428993225, "learning_rate": 2.1204185793514677e-05, "loss": 1.6367, "step": 1299 }, { "epoch": 2.35, "grad_norm": 0.07979683578014374, "learning_rate": 2.1085949060360654e-05, "loss": 1.7248, "step": 1300 }, { "epoch": 2.35, "grad_norm": 0.07872039824724197, "learning_rate": 2.0968004036217747e-05, "loss": 1.6335, "step": 1301 }, { "epoch": 2.35, "grad_norm": 0.0761566311120987, "learning_rate": 2.0850351157074598e-05, "loss": 1.7088, "step": 1302 }, { "epoch": 2.35, "grad_norm": 0.07348429411649704, "learning_rate": 2.073299085783983e-05, "loss": 1.6468, "step": 1303 }, { "epoch": 2.35, "grad_norm": 0.07602974027395248, "learning_rate": 2.0615923572340633e-05, "loss": 1.6475, "step": 1304 }, { "epoch": 2.36, "grad_norm": 0.0768745020031929, "learning_rate": 2.0499149733320955e-05, "loss": 1.6442, "step": 1305 }, { "epoch": 2.36, "grad_norm": 0.07266237586736679, "learning_rate": 2.038266977244011e-05, "loss": 1.6415, "step": 1306 }, { "epoch": 2.36, "grad_norm": 0.10471116751432419, "learning_rate": 2.0266484120271e-05, "loss": 1.6714, "step": 1307 }, { "epoch": 2.36, "grad_norm": 0.07516157627105713, "learning_rate": 2.015059320629862e-05, "loss": 1.6468, "step": 1308 }, { "epoch": 2.36, "grad_norm": 0.07514899969100952, "learning_rate": 2.0034997458918492e-05, "loss": 1.6045, "step": 1309 }, { "epoch": 2.36, "grad_norm": 0.0757506862282753, "learning_rate": 1.9919697305434982e-05, "loss": 1.6393, "step": 1310 }, { "epoch": 2.37, "grad_norm": 0.0748157650232315, "learning_rate": 1.9804693172059864e-05, "loss": 1.6376, "step": 1311 }, { "epoch": 2.37, "grad_norm": 0.0749751403927803, "learning_rate": 1.9689985483910555e-05, "loss": 1.6007, "step": 1312 }, { "epoch": 2.37, "grad_norm": 0.08079908788204193, "learning_rate": 1.9575574665008767e-05, "loss": 1.6027, "step": 1313 }, { "epoch": 2.37, "grad_norm": 0.07813858240842819, "learning_rate": 1.9461461138278735e-05, "loss": 1.6286, "step": 1314 }, { "epoch": 2.37, "grad_norm": 0.07494987547397614, "learning_rate": 1.9347645325545737e-05, "loss": 1.6543, "step": 1315 }, { "epoch": 2.38, "grad_norm": 0.08152856677770615, "learning_rate": 1.9234127647534604e-05, "loss": 1.6948, "step": 1316 }, { "epoch": 2.38, "grad_norm": 0.07672107219696045, "learning_rate": 1.9120908523868096e-05, "loss": 1.6746, "step": 1317 }, { "epoch": 2.38, "grad_norm": 0.07567385584115982, "learning_rate": 1.900798837306529e-05, "loss": 1.6146, "step": 1318 }, { "epoch": 2.38, "grad_norm": 0.10249254107475281, "learning_rate": 1.8895367612540114e-05, "loss": 1.702, "step": 1319 }, { "epoch": 2.38, "grad_norm": 0.08051081001758575, "learning_rate": 1.8783046658599855e-05, "loss": 1.6444, "step": 1320 }, { "epoch": 2.38, "grad_norm": 0.07696827501058578, "learning_rate": 1.8671025926443465e-05, "loss": 1.6467, "step": 1321 }, { "epoch": 2.39, "grad_norm": 0.07554996758699417, "learning_rate": 1.855930583016019e-05, "loss": 1.6174, "step": 1322 }, { "epoch": 2.39, "grad_norm": 0.07727986574172974, "learning_rate": 1.8447886782727885e-05, "loss": 1.6209, "step": 1323 }, { "epoch": 2.39, "grad_norm": 0.07652744650840759, "learning_rate": 1.833676919601166e-05, "loss": 1.6059, "step": 1324 }, { "epoch": 2.39, "grad_norm": 0.07633455842733383, "learning_rate": 1.8225953480762182e-05, "loss": 1.6486, "step": 1325 }, { "epoch": 2.39, "grad_norm": 0.07672161608934402, "learning_rate": 1.811544004661424e-05, "loss": 1.6276, "step": 1326 }, { "epoch": 2.4, "grad_norm": 0.07692933082580566, "learning_rate": 1.800522930208528e-05, "loss": 1.6461, "step": 1327 }, { "epoch": 2.4, "grad_norm": 0.08672747015953064, "learning_rate": 1.789532165457385e-05, "loss": 1.6075, "step": 1328 }, { "epoch": 2.4, "grad_norm": 0.07557875663042068, "learning_rate": 1.7785717510358037e-05, "loss": 1.6336, "step": 1329 }, { "epoch": 2.4, "grad_norm": 0.07764589786529541, "learning_rate": 1.7676417274593982e-05, "loss": 1.6717, "step": 1330 }, { "epoch": 2.4, "grad_norm": 0.15912802517414093, "learning_rate": 1.756742135131455e-05, "loss": 1.59, "step": 1331 }, { "epoch": 2.4, "grad_norm": 0.07743611931800842, "learning_rate": 1.745873014342755e-05, "loss": 1.6451, "step": 1332 }, { "epoch": 2.41, "grad_norm": 0.07583177089691162, "learning_rate": 1.735034405271453e-05, "loss": 1.6161, "step": 1333 }, { "epoch": 2.41, "grad_norm": 0.07420673966407776, "learning_rate": 1.7242263479829045e-05, "loss": 1.6398, "step": 1334 }, { "epoch": 2.41, "grad_norm": 0.07718417793512344, "learning_rate": 1.7134488824295402e-05, "loss": 1.683, "step": 1335 }, { "epoch": 2.41, "grad_norm": 0.07624556869268417, "learning_rate": 1.7027020484506996e-05, "loss": 1.6235, "step": 1336 }, { "epoch": 2.41, "grad_norm": 0.07503955066204071, "learning_rate": 1.691985885772488e-05, "loss": 1.5878, "step": 1337 }, { "epoch": 2.42, "grad_norm": 0.07573457062244415, "learning_rate": 1.681300434007643e-05, "loss": 1.6218, "step": 1338 }, { "epoch": 2.42, "grad_norm": 0.07653183490037918, "learning_rate": 1.670645732655376e-05, "loss": 1.6583, "step": 1339 }, { "epoch": 2.42, "grad_norm": 0.07439745217561722, "learning_rate": 1.660021821101222e-05, "loss": 1.6452, "step": 1340 }, { "epoch": 2.42, "grad_norm": 0.07712557166814804, "learning_rate": 1.6494287386169016e-05, "loss": 1.7358, "step": 1341 }, { "epoch": 2.42, "grad_norm": 0.07855048775672913, "learning_rate": 1.638866524360182e-05, "loss": 1.6404, "step": 1342 }, { "epoch": 2.42, "grad_norm": 0.07642755657434464, "learning_rate": 1.6283352173747145e-05, "loss": 1.6483, "step": 1343 }, { "epoch": 2.43, "grad_norm": 0.08241719007492065, "learning_rate": 1.6178348565899092e-05, "loss": 1.6548, "step": 1344 }, { "epoch": 2.43, "grad_norm": 0.07523837685585022, "learning_rate": 1.607365480820775e-05, "loss": 1.6274, "step": 1345 }, { "epoch": 2.43, "grad_norm": 0.07957956939935684, "learning_rate": 1.5969271287677902e-05, "loss": 1.6353, "step": 1346 }, { "epoch": 2.43, "grad_norm": 0.0797073170542717, "learning_rate": 1.5865198390167492e-05, "loss": 1.6863, "step": 1347 }, { "epoch": 2.43, "grad_norm": 0.07916803658008575, "learning_rate": 1.5761436500386184e-05, "loss": 1.5653, "step": 1348 }, { "epoch": 2.44, "grad_norm": 0.07969414442777634, "learning_rate": 1.5657986001894077e-05, "loss": 1.6255, "step": 1349 }, { "epoch": 2.44, "grad_norm": 0.07772645354270935, "learning_rate": 1.5554847277100192e-05, "loss": 1.638, "step": 1350 }, { "epoch": 2.44, "grad_norm": 0.07736355066299438, "learning_rate": 1.5452020707261007e-05, "loss": 1.6596, "step": 1351 }, { "epoch": 2.44, "grad_norm": 0.07845966517925262, "learning_rate": 1.5349506672479097e-05, "loss": 1.6598, "step": 1352 }, { "epoch": 2.44, "grad_norm": 0.07737518846988678, "learning_rate": 1.5247305551701852e-05, "loss": 1.6352, "step": 1353 }, { "epoch": 2.44, "grad_norm": 0.07634854316711426, "learning_rate": 1.5145417722719824e-05, "loss": 1.6303, "step": 1354 }, { "epoch": 2.45, "grad_norm": 0.07973648607730865, "learning_rate": 1.5043843562165571e-05, "loss": 1.6417, "step": 1355 }, { "epoch": 2.45, "grad_norm": 0.07898874580860138, "learning_rate": 1.4942583445512103e-05, "loss": 1.6329, "step": 1356 }, { "epoch": 2.45, "grad_norm": 0.08404459059238434, "learning_rate": 1.4841637747071602e-05, "loss": 1.6569, "step": 1357 }, { "epoch": 2.45, "grad_norm": 0.07700961828231812, "learning_rate": 1.4741006839993942e-05, "loss": 1.6417, "step": 1358 }, { "epoch": 2.45, "grad_norm": 0.0769503116607666, "learning_rate": 1.4640691096265358e-05, "loss": 1.6464, "step": 1359 }, { "epoch": 2.46, "grad_norm": 0.07737623155117035, "learning_rate": 1.454069088670712e-05, "loss": 1.6401, "step": 1360 }, { "epoch": 2.46, "grad_norm": 0.08405579626560211, "learning_rate": 1.4441006580974114e-05, "loss": 1.6672, "step": 1361 }, { "epoch": 2.46, "grad_norm": 0.07663515955209732, "learning_rate": 1.4341638547553404e-05, "loss": 1.5674, "step": 1362 }, { "epoch": 2.46, "grad_norm": 0.0760325938463211, "learning_rate": 1.4242587153762976e-05, "loss": 1.7036, "step": 1363 }, { "epoch": 2.46, "grad_norm": 0.07548896968364716, "learning_rate": 1.4143852765750387e-05, "loss": 1.6533, "step": 1364 }, { "epoch": 2.46, "grad_norm": 0.07869727909564972, "learning_rate": 1.4045435748491298e-05, "loss": 1.6345, "step": 1365 }, { "epoch": 2.47, "grad_norm": 0.07685807347297668, "learning_rate": 1.3947336465788274e-05, "loss": 1.6381, "step": 1366 }, { "epoch": 2.47, "grad_norm": 0.07771617919206619, "learning_rate": 1.3849555280269256e-05, "loss": 1.6773, "step": 1367 }, { "epoch": 2.47, "grad_norm": 0.08084233105182648, "learning_rate": 1.3752092553386463e-05, "loss": 1.6054, "step": 1368 }, { "epoch": 2.47, "grad_norm": 0.07997636497020721, "learning_rate": 1.36549486454148e-05, "loss": 1.7101, "step": 1369 }, { "epoch": 2.47, "grad_norm": 0.07927645742893219, "learning_rate": 1.3558123915450671e-05, "loss": 1.6258, "step": 1370 }, { "epoch": 2.47, "eval_loss": 1.7314895391464233, "eval_runtime": 76.2484, "eval_samples_per_second": 65.575, "eval_steps_per_second": 16.394, "step": 1370 }, { "epoch": 2.48, "grad_norm": 0.0754527747631073, "learning_rate": 1.3461618721410662e-05, "loss": 1.6419, "step": 1371 }, { "epoch": 2.48, "grad_norm": 0.0918116644024849, "learning_rate": 1.3365433420030204e-05, "loss": 1.5937, "step": 1372 }, { "epoch": 2.48, "grad_norm": 0.07520640641450882, "learning_rate": 1.3269568366862151e-05, "loss": 1.6486, "step": 1373 }, { "epoch": 2.48, "grad_norm": 0.08471286296844482, "learning_rate": 1.3174023916275557e-05, "loss": 1.6405, "step": 1374 }, { "epoch": 2.48, "grad_norm": 0.08112510293722153, "learning_rate": 1.307880042145445e-05, "loss": 1.6273, "step": 1375 }, { "epoch": 2.48, "grad_norm": 0.0757022500038147, "learning_rate": 1.2983898234396308e-05, "loss": 1.6296, "step": 1376 }, { "epoch": 2.49, "grad_norm": 0.0767267569899559, "learning_rate": 1.2889317705910985e-05, "loss": 1.6137, "step": 1377 }, { "epoch": 2.49, "grad_norm": 0.07709117978811264, "learning_rate": 1.2795059185619229e-05, "loss": 1.6304, "step": 1378 }, { "epoch": 2.49, "grad_norm": 0.0785246416926384, "learning_rate": 1.2701123021951556e-05, "loss": 1.5905, "step": 1379 }, { "epoch": 2.49, "grad_norm": 0.07662135362625122, "learning_rate": 1.260750956214679e-05, "loss": 1.5816, "step": 1380 }, { "epoch": 2.49, "grad_norm": 0.07675299048423767, "learning_rate": 1.2514219152250894e-05, "loss": 1.5891, "step": 1381 }, { "epoch": 2.5, "grad_norm": 0.07987385243177414, "learning_rate": 1.2421252137115702e-05, "loss": 1.6048, "step": 1382 }, { "epoch": 2.5, "grad_norm": 0.07960965484380722, "learning_rate": 1.2328608860397605e-05, "loss": 1.6599, "step": 1383 }, { "epoch": 2.5, "grad_norm": 0.07663369178771973, "learning_rate": 1.2236289664556233e-05, "loss": 1.686, "step": 1384 }, { "epoch": 2.5, "grad_norm": 0.07579533755779266, "learning_rate": 1.2144294890853236e-05, "loss": 1.6305, "step": 1385 }, { "epoch": 2.5, "grad_norm": 0.07666274905204773, "learning_rate": 1.2052624879351104e-05, "loss": 1.6163, "step": 1386 }, { "epoch": 2.5, "grad_norm": 0.07610160112380981, "learning_rate": 1.1961279968911743e-05, "loss": 1.6307, "step": 1387 }, { "epoch": 2.51, "grad_norm": 0.07812687754631042, "learning_rate": 1.1870260497195329e-05, "loss": 1.6504, "step": 1388 }, { "epoch": 2.51, "grad_norm": 0.07521472871303558, "learning_rate": 1.1779566800659081e-05, "loss": 1.7001, "step": 1389 }, { "epoch": 2.51, "grad_norm": 0.07430490106344223, "learning_rate": 1.1689199214555968e-05, "loss": 1.6824, "step": 1390 }, { "epoch": 2.51, "grad_norm": 0.07681223750114441, "learning_rate": 1.1599158072933436e-05, "loss": 1.6566, "step": 1391 }, { "epoch": 2.51, "grad_norm": 0.07529665529727936, "learning_rate": 1.1509443708632229e-05, "loss": 1.6335, "step": 1392 }, { "epoch": 2.52, "grad_norm": 0.07649347931146622, "learning_rate": 1.1420056453285178e-05, "loss": 1.6499, "step": 1393 }, { "epoch": 2.52, "grad_norm": 0.07575827836990356, "learning_rate": 1.1330996637315927e-05, "loss": 1.658, "step": 1394 }, { "epoch": 2.52, "grad_norm": 0.07583682984113693, "learning_rate": 1.1242264589937723e-05, "loss": 1.7121, "step": 1395 }, { "epoch": 2.52, "grad_norm": 0.07645382732152939, "learning_rate": 1.1153860639152169e-05, "loss": 1.6648, "step": 1396 }, { "epoch": 2.52, "grad_norm": 0.0770021378993988, "learning_rate": 1.1065785111748117e-05, "loss": 1.625, "step": 1397 }, { "epoch": 2.52, "grad_norm": 0.07449906319379807, "learning_rate": 1.0978038333300334e-05, "loss": 1.6125, "step": 1398 }, { "epoch": 2.53, "grad_norm": 0.07793298363685608, "learning_rate": 1.0890620628168358e-05, "loss": 1.6799, "step": 1399 }, { "epoch": 2.53, "grad_norm": 0.07441260665655136, "learning_rate": 1.0803532319495302e-05, "loss": 1.6538, "step": 1400 }, { "epoch": 2.53, "grad_norm": 0.07645926624536514, "learning_rate": 1.0716773729206697e-05, "loss": 1.6688, "step": 1401 }, { "epoch": 2.53, "grad_norm": 0.08136122673749924, "learning_rate": 1.0630345178009182e-05, "loss": 1.6351, "step": 1402 }, { "epoch": 2.53, "grad_norm": 0.0778769850730896, "learning_rate": 1.0544246985389405e-05, "loss": 1.6574, "step": 1403 }, { "epoch": 2.54, "grad_norm": 0.07509773224592209, "learning_rate": 1.0458479469612882e-05, "loss": 1.6286, "step": 1404 }, { "epoch": 2.54, "grad_norm": 0.08043016493320465, "learning_rate": 1.0373042947722744e-05, "loss": 1.6707, "step": 1405 }, { "epoch": 2.54, "grad_norm": 0.07571176439523697, "learning_rate": 1.0287937735538566e-05, "loss": 1.6817, "step": 1406 }, { "epoch": 2.54, "grad_norm": 0.07955561578273773, "learning_rate": 1.020316414765522e-05, "loss": 1.6208, "step": 1407 }, { "epoch": 2.54, "grad_norm": 0.087374247610569, "learning_rate": 1.011872249744178e-05, "loss": 1.6246, "step": 1408 }, { "epoch": 2.54, "grad_norm": 0.07566358894109726, "learning_rate": 1.0034613097040224e-05, "loss": 1.6411, "step": 1409 }, { "epoch": 2.55, "grad_norm": 0.07635074853897095, "learning_rate": 9.950836257364371e-06, "loss": 1.6691, "step": 1410 }, { "epoch": 2.55, "grad_norm": 0.08172682672739029, "learning_rate": 9.867392288098743e-06, "loss": 1.6469, "step": 1411 }, { "epoch": 2.55, "grad_norm": 0.0748620480298996, "learning_rate": 9.7842814976974e-06, "loss": 1.6715, "step": 1412 }, { "epoch": 2.55, "grad_norm": 0.0790306031703949, "learning_rate": 9.70150419338276e-06, "loss": 1.6829, "step": 1413 }, { "epoch": 2.55, "grad_norm": 0.07460641115903854, "learning_rate": 9.619060681144487e-06, "loss": 1.637, "step": 1414 }, { "epoch": 2.56, "grad_norm": 0.08058908581733704, "learning_rate": 9.536951265738403e-06, "loss": 1.6023, "step": 1415 }, { "epoch": 2.56, "grad_norm": 0.0775287076830864, "learning_rate": 9.455176250685338e-06, "loss": 1.66, "step": 1416 }, { "epoch": 2.56, "grad_norm": 0.07818607240915298, "learning_rate": 9.373735938269956e-06, "loss": 1.6157, "step": 1417 }, { "epoch": 2.56, "grad_norm": 0.07963356375694275, "learning_rate": 9.292630629539667e-06, "loss": 1.6459, "step": 1418 }, { "epoch": 2.56, "grad_norm": 0.08181466907262802, "learning_rate": 9.211860624303604e-06, "loss": 1.6539, "step": 1419 }, { "epoch": 2.56, "grad_norm": 0.07678928226232529, "learning_rate": 9.131426221131378e-06, "loss": 1.6305, "step": 1420 }, { "epoch": 2.57, "grad_norm": 0.07490074634552002, "learning_rate": 9.05132771735201e-06, "loss": 1.6394, "step": 1421 }, { "epoch": 2.57, "grad_norm": 0.08047986775636673, "learning_rate": 8.971565409052907e-06, "loss": 1.6954, "step": 1422 }, { "epoch": 2.57, "grad_norm": 0.07774240523576736, "learning_rate": 8.89213959107873e-06, "loss": 1.6125, "step": 1423 }, { "epoch": 2.57, "grad_norm": 0.07721046358346939, "learning_rate": 8.813050557030222e-06, "loss": 1.6437, "step": 1424 }, { "epoch": 2.57, "grad_norm": 0.08010726422071457, "learning_rate": 8.734298599263235e-06, "loss": 1.6619, "step": 1425 }, { "epoch": 2.58, "grad_norm": 0.09973439574241638, "learning_rate": 8.655884008887594e-06, "loss": 1.6783, "step": 1426 }, { "epoch": 2.58, "grad_norm": 0.07684063911437988, "learning_rate": 8.577807075766042e-06, "loss": 1.6459, "step": 1427 }, { "epoch": 2.58, "grad_norm": 0.07541782408952713, "learning_rate": 8.500068088513158e-06, "loss": 1.6552, "step": 1428 }, { "epoch": 2.58, "grad_norm": 0.0786990150809288, "learning_rate": 8.422667334494249e-06, "loss": 1.6124, "step": 1429 }, { "epoch": 2.58, "grad_norm": 0.07959920167922974, "learning_rate": 8.345605099824405e-06, "loss": 1.593, "step": 1430 }, { "epoch": 2.58, "grad_norm": 0.07940792292356491, "learning_rate": 8.268881669367301e-06, "loss": 1.6052, "step": 1431 }, { "epoch": 2.59, "grad_norm": 0.07526662945747375, "learning_rate": 8.192497326734216e-06, "loss": 1.6067, "step": 1432 }, { "epoch": 2.59, "grad_norm": 0.0817658007144928, "learning_rate": 8.116452354283e-06, "loss": 1.6593, "step": 1433 }, { "epoch": 2.59, "grad_norm": 0.07827943563461304, "learning_rate": 8.040747033117014e-06, "loss": 1.6328, "step": 1434 }, { "epoch": 2.59, "grad_norm": 0.08363451808691025, "learning_rate": 7.96538164308407e-06, "loss": 1.6607, "step": 1435 }, { "epoch": 2.59, "grad_norm": 0.07701697200536728, "learning_rate": 7.890356462775373e-06, "loss": 1.603, "step": 1436 }, { "epoch": 2.6, "grad_norm": 0.0786963701248169, "learning_rate": 7.81567176952459e-06, "loss": 1.679, "step": 1437 }, { "epoch": 2.6, "grad_norm": 0.07547624409198761, "learning_rate": 7.741327839406753e-06, "loss": 1.6602, "step": 1438 }, { "epoch": 2.6, "grad_norm": 0.0758601650595665, "learning_rate": 7.66732494723723e-06, "loss": 1.62, "step": 1439 }, { "epoch": 2.6, "grad_norm": 0.07983959466218948, "learning_rate": 7.593663366570691e-06, "loss": 1.6876, "step": 1440 }, { "epoch": 2.6, "grad_norm": 0.08505720645189285, "learning_rate": 7.520343369700245e-06, "loss": 1.6197, "step": 1441 }, { "epoch": 2.6, "grad_norm": 0.07820601016283035, "learning_rate": 7.4473652276562e-06, "loss": 1.6471, "step": 1442 }, { "epoch": 2.61, "grad_norm": 0.07744170725345612, "learning_rate": 7.374729210205244e-06, "loss": 1.6692, "step": 1443 }, { "epoch": 2.61, "grad_norm": 0.07542102038860321, "learning_rate": 7.302435585849399e-06, "loss": 1.6092, "step": 1444 }, { "epoch": 2.61, "grad_norm": 0.08577460050582886, "learning_rate": 7.230484621825006e-06, "loss": 1.6645, "step": 1445 }, { "epoch": 2.61, "grad_norm": 0.07756751030683517, "learning_rate": 7.158876584101726e-06, "loss": 1.5752, "step": 1446 }, { "epoch": 2.61, "grad_norm": 0.08456861227750778, "learning_rate": 7.087611737381572e-06, "loss": 1.6311, "step": 1447 }, { "epoch": 2.62, "grad_norm": 0.0784740075469017, "learning_rate": 7.0166903450980094e-06, "loss": 1.6077, "step": 1448 }, { "epoch": 2.62, "grad_norm": 0.0794738158583641, "learning_rate": 6.946112669414806e-06, "loss": 1.6897, "step": 1449 }, { "epoch": 2.62, "grad_norm": 0.0770711898803711, "learning_rate": 6.87587897122528e-06, "loss": 1.735, "step": 1450 }, { "epoch": 2.62, "grad_norm": 0.07808208465576172, "learning_rate": 6.8059895101511005e-06, "loss": 1.5798, "step": 1451 }, { "epoch": 2.62, "grad_norm": 0.07920092344284058, "learning_rate": 6.736444544541576e-06, "loss": 1.6477, "step": 1452 }, { "epoch": 2.62, "grad_norm": 0.07922721654176712, "learning_rate": 6.667244331472478e-06, "loss": 1.643, "step": 1453 }, { "epoch": 2.63, "grad_norm": 0.09416685998439789, "learning_rate": 6.598389126745208e-06, "loss": 1.6475, "step": 1454 }, { "epoch": 2.63, "grad_norm": 0.07828591018915176, "learning_rate": 6.5298791848858455e-06, "loss": 1.6842, "step": 1455 }, { "epoch": 2.63, "grad_norm": 0.07749258726835251, "learning_rate": 6.461714759144233e-06, "loss": 1.6337, "step": 1456 }, { "epoch": 2.63, "grad_norm": 0.07924207299947739, "learning_rate": 6.393896101492902e-06, "loss": 1.6539, "step": 1457 }, { "epoch": 2.63, "grad_norm": 0.07810190320014954, "learning_rate": 6.326423462626296e-06, "loss": 1.6358, "step": 1458 }, { "epoch": 2.64, "grad_norm": 0.08121967315673828, "learning_rate": 6.259297091959815e-06, "loss": 1.6076, "step": 1459 }, { "epoch": 2.64, "grad_norm": 0.075941301882267, "learning_rate": 6.192517237628792e-06, "loss": 1.6788, "step": 1460 }, { "epoch": 2.64, "grad_norm": 0.0804860070347786, "learning_rate": 6.126084146487676e-06, "loss": 1.6496, "step": 1461 }, { "epoch": 2.64, "grad_norm": 0.0761600062251091, "learning_rate": 6.059998064109129e-06, "loss": 1.625, "step": 1462 }, { "epoch": 2.64, "grad_norm": 0.08090834319591522, "learning_rate": 5.994259234783062e-06, "loss": 1.6949, "step": 1463 }, { "epoch": 2.64, "grad_norm": 0.07986024022102356, "learning_rate": 5.928867901515722e-06, "loss": 1.6532, "step": 1464 }, { "epoch": 2.65, "grad_norm": 0.07632812112569809, "learning_rate": 5.863824306028831e-06, "loss": 1.66, "step": 1465 }, { "epoch": 2.65, "grad_norm": 0.08371198177337646, "learning_rate": 5.7991286887587035e-06, "loss": 1.6525, "step": 1466 }, { "epoch": 2.65, "grad_norm": 0.07661554962396622, "learning_rate": 5.734781288855351e-06, "loss": 1.6424, "step": 1467 }, { "epoch": 2.65, "grad_norm": 0.07702632248401642, "learning_rate": 5.670782344181547e-06, "loss": 1.6552, "step": 1468 }, { "epoch": 2.65, "grad_norm": 0.07916314899921417, "learning_rate": 5.607132091311995e-06, "loss": 1.6498, "step": 1469 }, { "epoch": 2.66, "grad_norm": 0.07836530357599258, "learning_rate": 5.543830765532476e-06, "loss": 1.6076, "step": 1470 }, { "epoch": 2.66, "grad_norm": 0.07685720175504684, "learning_rate": 5.480878600838912e-06, "loss": 1.5777, "step": 1471 }, { "epoch": 2.66, "grad_norm": 0.07941797375679016, "learning_rate": 5.418275829936537e-06, "loss": 1.6466, "step": 1472 }, { "epoch": 2.66, "grad_norm": 0.08379315584897995, "learning_rate": 5.3560226842390596e-06, "loss": 1.6398, "step": 1473 }, { "epoch": 2.66, "grad_norm": 0.0849771574139595, "learning_rate": 5.294119393867791e-06, "loss": 1.6046, "step": 1474 }, { "epoch": 2.66, "grad_norm": 0.07908991724252701, "learning_rate": 5.232566187650767e-06, "loss": 1.6461, "step": 1475 }, { "epoch": 2.67, "grad_norm": 0.07623674720525742, "learning_rate": 5.171363293121901e-06, "loss": 1.5721, "step": 1476 }, { "epoch": 2.67, "grad_norm": 0.08036508411169052, "learning_rate": 5.110510936520207e-06, "loss": 1.6266, "step": 1477 }, { "epoch": 2.67, "grad_norm": 0.07726433873176575, "learning_rate": 5.05000934278892e-06, "loss": 1.6661, "step": 1478 }, { "epoch": 2.67, "grad_norm": 0.07796676456928253, "learning_rate": 4.9898587355746375e-06, "loss": 1.675, "step": 1479 }, { "epoch": 2.67, "grad_norm": 0.08995962888002396, "learning_rate": 4.930059337226523e-06, "loss": 1.5471, "step": 1480 }, { "epoch": 2.68, "grad_norm": 0.08005818724632263, "learning_rate": 4.87061136879553e-06, "loss": 1.6852, "step": 1481 }, { "epoch": 2.68, "grad_norm": 0.0763825848698616, "learning_rate": 4.811515050033466e-06, "loss": 1.6805, "step": 1482 }, { "epoch": 2.68, "grad_norm": 0.08022331446409225, "learning_rate": 4.752770599392287e-06, "loss": 1.6555, "step": 1483 }, { "epoch": 2.68, "grad_norm": 0.07905282825231552, "learning_rate": 4.694378234023267e-06, "loss": 1.5547, "step": 1484 }, { "epoch": 2.68, "grad_norm": 0.07715322077274323, "learning_rate": 4.636338169776178e-06, "loss": 1.5875, "step": 1485 }, { "epoch": 2.69, "grad_norm": 0.07762684673070908, "learning_rate": 4.578650621198477e-06, "loss": 1.6558, "step": 1486 }, { "epoch": 2.69, "grad_norm": 0.07905148714780807, "learning_rate": 4.521315801534531e-06, "loss": 1.6486, "step": 1487 }, { "epoch": 2.69, "grad_norm": 0.07684370875358582, "learning_rate": 4.464333922724839e-06, "loss": 1.6274, "step": 1488 }, { "epoch": 2.69, "grad_norm": 0.07536882162094116, "learning_rate": 4.407705195405276e-06, "loss": 1.6385, "step": 1489 }, { "epoch": 2.69, "grad_norm": 0.07777073234319687, "learning_rate": 4.3514298289062284e-06, "loss": 1.6934, "step": 1490 }, { "epoch": 2.69, "grad_norm": 0.07902389764785767, "learning_rate": 4.29550803125186e-06, "loss": 1.6187, "step": 1491 }, { "epoch": 2.7, "grad_norm": 0.08110350370407104, "learning_rate": 4.2399400091594154e-06, "loss": 1.6635, "step": 1492 }, { "epoch": 2.7, "grad_norm": 0.07521898299455643, "learning_rate": 4.1847259680383385e-06, "loss": 1.6255, "step": 1493 }, { "epoch": 2.7, "grad_norm": 0.07719499617815018, "learning_rate": 4.129866111989578e-06, "loss": 1.6531, "step": 1494 }, { "epoch": 2.7, "grad_norm": 0.07666776329278946, "learning_rate": 4.075360643804838e-06, "loss": 1.6492, "step": 1495 }, { "epoch": 2.7, "grad_norm": 0.07902183383703232, "learning_rate": 4.021209764965828e-06, "loss": 1.5977, "step": 1496 }, { "epoch": 2.71, "grad_norm": 0.07575275003910065, "learning_rate": 3.967413675643472e-06, "loss": 1.6477, "step": 1497 }, { "epoch": 2.71, "grad_norm": 0.07690516859292984, "learning_rate": 3.9139725746971885e-06, "loss": 1.6464, "step": 1498 }, { "epoch": 2.71, "grad_norm": 0.07752062380313873, "learning_rate": 3.860886659674201e-06, "loss": 1.6077, "step": 1499 }, { "epoch": 2.71, "grad_norm": 0.07717850804328918, "learning_rate": 3.808156126808782e-06, "loss": 1.6582, "step": 1500 }, { "epoch": 2.71, "grad_norm": 0.07689668983221054, "learning_rate": 3.7557811710214885e-06, "loss": 1.6682, "step": 1501 }, { "epoch": 2.71, "grad_norm": 0.07667512446641922, "learning_rate": 3.7037619859184523e-06, "loss": 1.6858, "step": 1502 }, { "epoch": 2.72, "grad_norm": 0.10544613003730774, "learning_rate": 3.652098763790768e-06, "loss": 1.644, "step": 1503 }, { "epoch": 2.72, "grad_norm": 0.07981459051370621, "learning_rate": 3.6007916956136033e-06, "loss": 1.6161, "step": 1504 }, { "epoch": 2.72, "grad_norm": 0.07780975103378296, "learning_rate": 3.5498409710456372e-06, "loss": 1.6486, "step": 1505 }, { "epoch": 2.72, "grad_norm": 0.07703933864831924, "learning_rate": 3.499246778428311e-06, "loss": 1.6499, "step": 1506 }, { "epoch": 2.72, "grad_norm": 0.07707490772008896, "learning_rate": 3.4490093047851447e-06, "loss": 1.6792, "step": 1507 }, { "epoch": 2.72, "eval_loss": 1.7316620349884033, "eval_runtime": 76.592, "eval_samples_per_second": 65.281, "eval_steps_per_second": 16.32, "step": 1507 }, { "epoch": 2.73, "grad_norm": 0.07655075937509537, "learning_rate": 3.3991287358209777e-06, "loss": 1.6349, "step": 1508 }, { "epoch": 2.73, "grad_norm": 0.07839100807905197, "learning_rate": 3.349605255921373e-06, "loss": 1.6159, "step": 1509 }, { "epoch": 2.73, "grad_norm": 0.07581314444541931, "learning_rate": 3.3004390481519043e-06, "loss": 1.66, "step": 1510 }, { "epoch": 2.73, "grad_norm": 0.07724065333604813, "learning_rate": 3.2516302942574793e-06, "loss": 1.6371, "step": 1511 }, { "epoch": 2.73, "grad_norm": 0.0758710652589798, "learning_rate": 3.2031791746616303e-06, "loss": 1.6081, "step": 1512 }, { "epoch": 2.73, "grad_norm": 0.07650534063577652, "learning_rate": 3.15508586846589e-06, "loss": 1.6669, "step": 1513 }, { "epoch": 2.74, "grad_norm": 0.07545692473649979, "learning_rate": 3.107350553449162e-06, "loss": 1.6492, "step": 1514 }, { "epoch": 2.74, "grad_norm": 0.07454677671194077, "learning_rate": 3.059973406066963e-06, "loss": 1.6204, "step": 1515 }, { "epoch": 2.74, "grad_norm": 0.07949978113174438, "learning_rate": 3.0129546014508567e-06, "loss": 1.6916, "step": 1516 }, { "epoch": 2.74, "grad_norm": 0.07890604436397552, "learning_rate": 2.966294313407769e-06, "loss": 1.6409, "step": 1517 }, { "epoch": 2.74, "grad_norm": 0.07671340554952621, "learning_rate": 2.919992714419373e-06, "loss": 1.6536, "step": 1518 }, { "epoch": 2.75, "grad_norm": 0.08768926560878754, "learning_rate": 2.8740499756414128e-06, "loss": 1.615, "step": 1519 }, { "epoch": 2.75, "grad_norm": 0.07849384844303131, "learning_rate": 2.828466266903085e-06, "loss": 1.6728, "step": 1520 }, { "epoch": 2.75, "grad_norm": 0.07685536891222, "learning_rate": 2.783241756706445e-06, "loss": 1.6158, "step": 1521 }, { "epoch": 2.75, "grad_norm": 0.08592838793992996, "learning_rate": 2.738376612225757e-06, "loss": 1.6527, "step": 1522 }, { "epoch": 2.75, "grad_norm": 0.08121158927679062, "learning_rate": 2.6938709993068247e-06, "loss": 1.6206, "step": 1523 }, { "epoch": 2.75, "grad_norm": 0.08821628242731094, "learning_rate": 2.649725082466481e-06, "loss": 1.6483, "step": 1524 }, { "epoch": 2.76, "grad_norm": 0.07733853161334991, "learning_rate": 2.6059390248919126e-06, "loss": 1.6366, "step": 1525 }, { "epoch": 2.76, "grad_norm": 0.07937391847372055, "learning_rate": 2.562512988440069e-06, "loss": 1.5898, "step": 1526 }, { "epoch": 2.76, "grad_norm": 0.07704456895589828, "learning_rate": 2.519447133637054e-06, "loss": 1.6543, "step": 1527 }, { "epoch": 2.76, "grad_norm": 0.0772838145494461, "learning_rate": 2.4767416196775584e-06, "loss": 1.6607, "step": 1528 }, { "epoch": 2.76, "grad_norm": 0.07885396480560303, "learning_rate": 2.434396604424283e-06, "loss": 1.6156, "step": 1529 }, { "epoch": 2.77, "grad_norm": 0.07838328927755356, "learning_rate": 2.392412244407294e-06, "loss": 1.6134, "step": 1530 }, { "epoch": 2.77, "grad_norm": 0.07852654904127121, "learning_rate": 2.350788694823469e-06, "loss": 1.7033, "step": 1531 }, { "epoch": 2.77, "grad_norm": 0.078957699239254, "learning_rate": 2.3095261095359756e-06, "loss": 1.6777, "step": 1532 }, { "epoch": 2.77, "grad_norm": 0.07925454527139664, "learning_rate": 2.268624641073669e-06, "loss": 1.5862, "step": 1533 }, { "epoch": 2.77, "grad_norm": 0.07859499752521515, "learning_rate": 2.2280844406304515e-06, "loss": 1.6773, "step": 1534 }, { "epoch": 2.77, "grad_norm": 0.0811610296368599, "learning_rate": 2.1879056580648593e-06, "loss": 1.5913, "step": 1535 }, { "epoch": 2.78, "grad_norm": 0.08480032533407211, "learning_rate": 2.1480884418993983e-06, "loss": 1.6432, "step": 1536 }, { "epoch": 2.78, "grad_norm": 0.0798884704709053, "learning_rate": 2.108632939320032e-06, "loss": 1.6325, "step": 1537 }, { "epoch": 2.78, "grad_norm": 0.079264335334301, "learning_rate": 2.0695392961756264e-06, "loss": 1.6202, "step": 1538 }, { "epoch": 2.78, "grad_norm": 0.07931558042764664, "learning_rate": 2.0308076569774404e-06, "loss": 1.6321, "step": 1539 }, { "epoch": 2.78, "grad_norm": 0.07699020951986313, "learning_rate": 1.992438164898569e-06, "loss": 1.6196, "step": 1540 }, { "epoch": 2.79, "grad_norm": 0.07788260281085968, "learning_rate": 1.95443096177339e-06, "loss": 1.6368, "step": 1541 }, { "epoch": 2.79, "grad_norm": 0.07767998427152634, "learning_rate": 1.916786188097075e-06, "loss": 1.5978, "step": 1542 }, { "epoch": 2.79, "grad_norm": 0.07789082825183868, "learning_rate": 1.8795039830250772e-06, "loss": 1.5867, "step": 1543 }, { "epoch": 2.79, "grad_norm": 0.07725279778242111, "learning_rate": 1.8425844843726115e-06, "loss": 1.6223, "step": 1544 }, { "epoch": 2.79, "grad_norm": 0.08963463455438614, "learning_rate": 1.8060278286140763e-06, "loss": 1.6351, "step": 1545 }, { "epoch": 2.79, "grad_norm": 0.07855743169784546, "learning_rate": 1.7698341508826543e-06, "loss": 1.6531, "step": 1546 }, { "epoch": 2.8, "grad_norm": 0.0768176019191742, "learning_rate": 1.7340035849697567e-06, "loss": 1.5919, "step": 1547 }, { "epoch": 2.8, "grad_norm": 0.07631691545248032, "learning_rate": 1.698536263324546e-06, "loss": 1.6603, "step": 1548 }, { "epoch": 2.8, "grad_norm": 0.07766193896532059, "learning_rate": 1.6634323170533928e-06, "loss": 1.6337, "step": 1549 }, { "epoch": 2.8, "grad_norm": 0.07914331555366516, "learning_rate": 1.6286918759194858e-06, "loss": 1.6216, "step": 1550 }, { "epoch": 2.8, "grad_norm": 0.0915398821234703, "learning_rate": 1.5943150683422891e-06, "loss": 1.6142, "step": 1551 }, { "epoch": 2.81, "grad_norm": 0.10567398369312286, "learning_rate": 1.5603020213970754e-06, "loss": 1.6061, "step": 1552 }, { "epoch": 2.81, "grad_norm": 0.08098272234201431, "learning_rate": 1.526652860814448e-06, "loss": 1.5978, "step": 1553 }, { "epoch": 2.81, "grad_norm": 0.08282123506069183, "learning_rate": 1.4933677109799093e-06, "loss": 1.6219, "step": 1554 }, { "epoch": 2.81, "grad_norm": 0.07702944427728653, "learning_rate": 1.4604466949333928e-06, "loss": 1.6337, "step": 1555 }, { "epoch": 2.81, "grad_norm": 0.07632174342870712, "learning_rate": 1.4278899343687425e-06, "loss": 1.6726, "step": 1556 }, { "epoch": 2.81, "grad_norm": 0.08127190917730331, "learning_rate": 1.395697549633379e-06, "loss": 1.6383, "step": 1557 }, { "epoch": 2.82, "grad_norm": 0.07634363323450089, "learning_rate": 1.3638696597277679e-06, "loss": 1.6535, "step": 1558 }, { "epoch": 2.82, "grad_norm": 0.07834632694721222, "learning_rate": 1.3324063823049848e-06, "loss": 1.6169, "step": 1559 }, { "epoch": 2.82, "grad_norm": 0.07717548310756683, "learning_rate": 1.3013078336703398e-06, "loss": 1.6614, "step": 1560 }, { "epoch": 2.82, "grad_norm": 0.07857763767242432, "learning_rate": 1.2705741287808658e-06, "loss": 1.6374, "step": 1561 }, { "epoch": 2.82, "grad_norm": 0.07776670902967453, "learning_rate": 1.2402053812449744e-06, "loss": 1.6416, "step": 1562 }, { "epoch": 2.83, "grad_norm": 0.0788004994392395, "learning_rate": 1.2102017033219892e-06, "loss": 1.6561, "step": 1563 }, { "epoch": 2.83, "grad_norm": 0.0791986957192421, "learning_rate": 1.1805632059217031e-06, "loss": 1.6006, "step": 1564 }, { "epoch": 2.83, "grad_norm": 0.07873232662677765, "learning_rate": 1.1512899986040438e-06, "loss": 1.6443, "step": 1565 }, { "epoch": 2.83, "grad_norm": 0.07714524865150452, "learning_rate": 1.1223821895786301e-06, "loss": 1.6367, "step": 1566 }, { "epoch": 2.83, "grad_norm": 0.07893408089876175, "learning_rate": 1.0938398857043396e-06, "loss": 1.6386, "step": 1567 }, { "epoch": 2.83, "grad_norm": 0.07740840315818787, "learning_rate": 1.0656631924889749e-06, "loss": 1.5838, "step": 1568 }, { "epoch": 2.84, "grad_norm": 0.08581645786762238, "learning_rate": 1.0378522140888414e-06, "loss": 1.6802, "step": 1569 }, { "epoch": 2.84, "grad_norm": 0.08146387338638306, "learning_rate": 1.0104070533083375e-06, "loss": 1.6646, "step": 1570 }, { "epoch": 2.84, "grad_norm": 0.07950106263160706, "learning_rate": 9.833278115996214e-07, "loss": 1.6837, "step": 1571 }, { "epoch": 2.84, "grad_norm": 0.08287668228149414, "learning_rate": 9.566145890622325e-07, "loss": 1.6601, "step": 1572 }, { "epoch": 2.84, "grad_norm": 0.07892429083585739, "learning_rate": 9.302674844426707e-07, "loss": 1.6103, "step": 1573 }, { "epoch": 2.85, "grad_norm": 0.07956201583147049, "learning_rate": 9.042865951340962e-07, "loss": 1.6307, "step": 1574 }, { "epoch": 2.85, "grad_norm": 0.07830474525690079, "learning_rate": 8.786720171759189e-07, "loss": 1.6266, "step": 1575 }, { "epoch": 2.85, "grad_norm": 0.07838620990514755, "learning_rate": 8.534238452534759e-07, "loss": 1.598, "step": 1576 }, { "epoch": 2.85, "grad_norm": 0.07806780934333801, "learning_rate": 8.285421726976772e-07, "loss": 1.6334, "step": 1577 }, { "epoch": 2.85, "grad_norm": 0.07619772851467133, "learning_rate": 8.04027091484616e-07, "loss": 1.6459, "step": 1578 }, { "epoch": 2.85, "grad_norm": 0.07711511105298996, "learning_rate": 7.798786922353029e-07, "loss": 1.6736, "step": 1579 }, { "epoch": 2.86, "grad_norm": 0.0804474800825119, "learning_rate": 7.560970642152887e-07, "loss": 1.6689, "step": 1580 }, { "epoch": 2.86, "grad_norm": 0.07842190563678741, "learning_rate": 7.326822953343304e-07, "loss": 1.6666, "step": 1581 }, { "epoch": 2.86, "grad_norm": 0.08277352154254913, "learning_rate": 7.09634472146059e-07, "loss": 1.6985, "step": 1582 }, { "epoch": 2.86, "grad_norm": 0.07743370532989502, "learning_rate": 6.869536798477127e-07, "loss": 1.6212, "step": 1583 }, { "epoch": 2.86, "grad_norm": 0.07882619649171829, "learning_rate": 6.646400022797705e-07, "loss": 1.6788, "step": 1584 }, { "epoch": 2.87, "grad_norm": 0.07653092592954636, "learning_rate": 6.426935219256413e-07, "loss": 1.6545, "step": 1585 }, { "epoch": 2.87, "grad_norm": 0.07817487418651581, "learning_rate": 6.211143199113867e-07, "loss": 1.6245, "step": 1586 }, { "epoch": 2.87, "grad_norm": 0.07678999751806259, "learning_rate": 5.999024760054095e-07, "loss": 1.6262, "step": 1587 }, { "epoch": 2.87, "grad_norm": 0.07842559367418289, "learning_rate": 5.790580686181546e-07, "loss": 1.6376, "step": 1588 }, { "epoch": 2.87, "grad_norm": 0.07722920924425125, "learning_rate": 5.585811748018311e-07, "loss": 1.6657, "step": 1589 }, { "epoch": 2.87, "grad_norm": 0.2405773401260376, "learning_rate": 5.384718702501012e-07, "loss": 1.6521, "step": 1590 }, { "epoch": 2.88, "grad_norm": 0.07780345529317856, "learning_rate": 5.18730229297848e-07, "loss": 1.6058, "step": 1591 }, { "epoch": 2.88, "grad_norm": 0.07912715524435043, "learning_rate": 4.993563249208411e-07, "loss": 1.6604, "step": 1592 }, { "epoch": 2.88, "grad_norm": 0.07572833448648453, "learning_rate": 4.803502287355044e-07, "loss": 1.6078, "step": 1593 }, { "epoch": 2.88, "grad_norm": 0.07777310907840729, "learning_rate": 4.6171201099864947e-07, "loss": 1.6293, "step": 1594 }, { "epoch": 2.88, "grad_norm": 0.07772635668516159, "learning_rate": 4.434417406072311e-07, "loss": 1.615, "step": 1595 }, { "epoch": 2.89, "grad_norm": 0.07916691899299622, "learning_rate": 4.2553948509802545e-07, "loss": 1.6613, "step": 1596 }, { "epoch": 2.89, "grad_norm": 0.07659539580345154, "learning_rate": 4.080053106474524e-07, "loss": 1.6493, "step": 1597 }, { "epoch": 2.89, "grad_norm": 0.07651732116937637, "learning_rate": 3.9083928207132024e-07, "loss": 1.6018, "step": 1598 }, { "epoch": 2.89, "grad_norm": 0.08317805081605911, "learning_rate": 3.74041462824537e-07, "loss": 1.7027, "step": 1599 }, { "epoch": 2.89, "grad_norm": 0.07855066657066345, "learning_rate": 3.576119150009438e-07, "loss": 1.5972, "step": 1600 }, { "epoch": 2.89, "grad_norm": 0.08029285073280334, "learning_rate": 3.415506993330153e-07, "loss": 1.6352, "step": 1601 }, { "epoch": 2.9, "grad_norm": 0.07904045283794403, "learning_rate": 3.258578751917041e-07, "loss": 1.6286, "step": 1602 }, { "epoch": 2.9, "grad_norm": 0.07854597270488739, "learning_rate": 3.1053350058617423e-07, "loss": 1.6636, "step": 1603 }, { "epoch": 2.9, "grad_norm": 0.08219031989574432, "learning_rate": 2.955776321636017e-07, "loss": 1.6632, "step": 1604 }, { "epoch": 2.9, "grad_norm": 0.07771878689527512, "learning_rate": 2.8099032520896297e-07, "loss": 1.632, "step": 1605 }, { "epoch": 2.9, "grad_norm": 0.0790058895945549, "learning_rate": 2.667716336448356e-07, "loss": 1.6525, "step": 1606 }, { "epoch": 2.91, "grad_norm": 0.07743880897760391, "learning_rate": 2.529216100311871e-07, "loss": 1.5851, "step": 1607 }, { "epoch": 2.91, "grad_norm": 0.0782419815659523, "learning_rate": 2.3944030556520833e-07, "loss": 1.6763, "step": 1608 }, { "epoch": 2.91, "grad_norm": 0.07781913131475449, "learning_rate": 2.2632777008106952e-07, "loss": 1.655, "step": 1609 }, { "epoch": 2.91, "grad_norm": 0.09047593921422958, "learning_rate": 2.1358405204982e-07, "loss": 1.6639, "step": 1610 }, { "epoch": 2.91, "grad_norm": 0.0770074650645256, "learning_rate": 2.0120919857912203e-07, "loss": 1.6986, "step": 1611 }, { "epoch": 2.91, "grad_norm": 0.07914358377456665, "learning_rate": 1.8920325541311735e-07, "loss": 1.5649, "step": 1612 }, { "epoch": 2.92, "grad_norm": 0.07777966558933258, "learning_rate": 1.7756626693227197e-07, "loss": 1.6788, "step": 1613 }, { "epoch": 2.92, "grad_norm": 0.07993672788143158, "learning_rate": 1.6629827615319837e-07, "loss": 1.6255, "step": 1614 }, { "epoch": 2.92, "grad_norm": 0.07582458108663559, "learning_rate": 1.5539932472847797e-07, "loss": 1.5872, "step": 1615 }, { "epoch": 2.92, "grad_norm": 0.08094622194766998, "learning_rate": 1.4486945294652776e-07, "loss": 1.5596, "step": 1616 }, { "epoch": 2.92, "grad_norm": 0.07855695486068726, "learning_rate": 1.3470869973144507e-07, "loss": 1.6493, "step": 1617 }, { "epoch": 2.93, "grad_norm": 0.07752160727977753, "learning_rate": 1.2491710264287416e-07, "loss": 1.5791, "step": 1618 }, { "epoch": 2.93, "grad_norm": 0.08005109429359436, "learning_rate": 1.1549469787585088e-07, "loss": 1.5994, "step": 1619 }, { "epoch": 2.93, "grad_norm": 0.0767727866768837, "learning_rate": 1.0644152026068054e-07, "loss": 1.6649, "step": 1620 }, { "epoch": 2.93, "grad_norm": 0.08210410922765732, "learning_rate": 9.77576032628047e-08, "loss": 1.6618, "step": 1621 }, { "epoch": 2.93, "grad_norm": 0.08205511420965195, "learning_rate": 8.944297898267895e-08, "loss": 1.6589, "step": 1622 }, { "epoch": 2.93, "grad_norm": 0.07729317992925644, "learning_rate": 8.14976781556509e-08, "loss": 1.6271, "step": 1623 }, { "epoch": 2.94, "grad_norm": 0.07487628608942032, "learning_rate": 7.392173015184911e-08, "loss": 1.6264, "step": 1624 }, { "epoch": 2.94, "grad_norm": 0.08334935456514359, "learning_rate": 6.671516297606095e-08, "loss": 1.6378, "step": 1625 }, { "epoch": 2.94, "grad_norm": 0.07653997093439102, "learning_rate": 5.987800326767711e-08, "loss": 1.6271, "step": 1626 }, { "epoch": 2.94, "grad_norm": 0.08070964366197586, "learning_rate": 5.341027630052509e-08, "loss": 1.6517, "step": 1627 }, { "epoch": 2.94, "grad_norm": 0.07557233422994614, "learning_rate": 4.7312005982835806e-08, "loss": 1.5697, "step": 1628 }, { "epoch": 2.95, "grad_norm": 0.07819821685552597, "learning_rate": 4.158321485708827e-08, "loss": 1.6233, "step": 1629 }, { "epoch": 2.95, "grad_norm": 0.07728429138660431, "learning_rate": 3.6223924100020625e-08, "loss": 1.6488, "step": 1630 }, { "epoch": 2.95, "grad_norm": 0.07817883789539337, "learning_rate": 3.1234153522452515e-08, "loss": 1.6477, "step": 1631 }, { "epoch": 2.95, "grad_norm": 0.07970202714204788, "learning_rate": 2.6613921569274003e-08, "loss": 1.6271, "step": 1632 }, { "epoch": 2.95, "grad_norm": 0.08091612160205841, "learning_rate": 2.2363245319378946e-08, "loss": 1.6467, "step": 1633 }, { "epoch": 2.95, "grad_norm": 0.07684524357318878, "learning_rate": 1.8482140485542865e-08, "loss": 1.6378, "step": 1634 }, { "epoch": 2.96, "grad_norm": 0.07528408616781235, "learning_rate": 1.497062141444516e-08, "loss": 1.5309, "step": 1635 }, { "epoch": 2.96, "grad_norm": 0.07820209860801697, "learning_rate": 1.1828701086558092e-08, "loss": 1.6583, "step": 1636 }, { "epoch": 2.96, "grad_norm": 0.0803627148270607, "learning_rate": 9.056391116102347e-09, "loss": 1.678, "step": 1637 }, { "epoch": 2.96, "grad_norm": 0.08125375211238861, "learning_rate": 6.653701751058172e-09, "loss": 1.6782, "step": 1638 }, { "epoch": 2.96, "grad_norm": 0.07821867614984512, "learning_rate": 4.62064187304323e-09, "loss": 1.5848, "step": 1639 }, { "epoch": 2.97, "grad_norm": 0.07541393488645554, "learning_rate": 2.957218997357014e-09, "loss": 1.6003, "step": 1640 }, { "epoch": 2.97, "grad_norm": 0.08002740889787674, "learning_rate": 1.6634392729142357e-09, "loss": 1.6278, "step": 1641 }, { "epoch": 2.97, "grad_norm": 0.07914552837610245, "learning_rate": 7.393074822226176e-10, "loss": 1.64, "step": 1642 }, { "epoch": 2.97, "grad_norm": 0.07930779457092285, "learning_rate": 1.848270413606912e-10, "loss": 1.6959, "step": 1643 }, { "epoch": 2.97, "grad_norm": 0.08046304434537888, "learning_rate": 0.0, "loss": 1.6364, "step": 1644 }, { "epoch": 2.97, "eval_loss": 1.731703281402588, "eval_runtime": 76.5258, "eval_samples_per_second": 65.337, "eval_steps_per_second": 16.334, "step": 1644 } ], "logging_steps": 1, "max_steps": 1644, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 548, "total_flos": 4.905764733434462e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }