diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,17124 +1,11440 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 14.990723562152134, + "epoch": 9.993815708101423, "eval_steps": 500, - "global_step": 12120, + "global_step": 8080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012368583797155227, - "grad_norm": 3.171875, - "learning_rate": 1.6501650165016504e-07, - "loss": 2.8095, + "grad_norm": 6.28125, + "learning_rate": 2.4752475247524754e-07, + "loss": 2.8093, "step": 1 }, { "epoch": 0.006184291898577613, - "grad_norm": 3.1875, - "learning_rate": 8.25082508250825e-07, - "loss": 2.8204, + "grad_norm": 4.125, + "learning_rate": 1.2376237623762377e-06, + "loss": 2.8212, "step": 5 }, { "epoch": 0.012368583797155226, - "grad_norm": 1.9140625, - "learning_rate": 1.65016501650165e-06, - "loss": 2.7934, + "grad_norm": 4.03125, + "learning_rate": 2.4752475247524753e-06, + "loss": 2.7914, "step": 10 }, { "epoch": 0.01855287569573284, - "grad_norm": 2.171875, - "learning_rate": 2.4752475247524753e-06, - "loss": 2.7932, + "grad_norm": 3.421875, + "learning_rate": 3.7128712871287128e-06, + "loss": 2.7819, "step": 15 }, { "epoch": 0.024737167594310452, - "grad_norm": 1.9921875, - "learning_rate": 3.3003300330033e-06, - "loss": 2.842, + "grad_norm": 3.546875, + "learning_rate": 4.950495049504951e-06, + "loss": 2.8066, "step": 20 }, { "epoch": 0.030921459492888066, - "grad_norm": 1.96875, - "learning_rate": 4.125412541254126e-06, - "loss": 2.8041, + "grad_norm": 6.25, + "learning_rate": 6.1881188118811885e-06, + "loss": 2.7294, "step": 25 }, { "epoch": 0.03710575139146568, - "grad_norm": 27.5, - "learning_rate": 4.950495049504951e-06, - "loss": 2.7951, + "grad_norm": 2.4375, + "learning_rate": 7.4257425742574256e-06, + "loss": 2.668, "step": 30 }, { "epoch": 0.04329004329004329, - "grad_norm": 1.546875, - "learning_rate": 5.775577557755775e-06, - "loss": 2.8127, + "grad_norm": 13.375, + "learning_rate": 8.663366336633663e-06, + "loss": 2.6216, "step": 35 }, { "epoch": 0.049474335188620905, - "grad_norm": 1.8671875, - "learning_rate": 6.6006600660066e-06, - "loss": 2.8001, + "grad_norm": 3.4375, + "learning_rate": 9.900990099009901e-06, + "loss": 2.5458, "step": 40 }, { "epoch": 0.055658627087198514, - "grad_norm": 1.625, - "learning_rate": 7.4257425742574256e-06, - "loss": 2.76, + "grad_norm": 3.484375, + "learning_rate": 1.113861386138614e-05, + "loss": 2.4339, "step": 45 }, { "epoch": 0.06184291898577613, - "grad_norm": 1.5546875, - "learning_rate": 8.250825082508252e-06, - "loss": 2.7348, + "grad_norm": 2.140625, + "learning_rate": 1.2376237623762377e-05, + "loss": 2.3575, "step": 50 }, { "epoch": 0.06802721088435375, - "grad_norm": 2.765625, - "learning_rate": 9.075907590759077e-06, - "loss": 2.7171, + "grad_norm": 1.125, + "learning_rate": 1.3613861386138616e-05, + "loss": 2.2852, "step": 55 }, { "epoch": 0.07421150278293136, - "grad_norm": 1.4140625, - "learning_rate": 9.900990099009901e-06, - "loss": 2.6948, + "grad_norm": 1.3671875, + "learning_rate": 1.4851485148514851e-05, + "loss": 2.2098, "step": 60 }, { "epoch": 0.08039579468150897, - "grad_norm": 1.5546875, - "learning_rate": 1.0726072607260726e-05, - "loss": 2.6657, + "grad_norm": 6.09375, + "learning_rate": 1.608910891089109e-05, + "loss": 2.1279, "step": 65 }, { "epoch": 0.08658008658008658, - "grad_norm": 1.3359375, - "learning_rate": 1.155115511551155e-05, - "loss": 2.6411, + "grad_norm": 1.625, + "learning_rate": 1.7326732673267325e-05, + "loss": 2.0726, "step": 70 }, { "epoch": 0.09276437847866419, - "grad_norm": 2.8125, - "learning_rate": 1.2376237623762377e-05, - "loss": 2.5744, + "grad_norm": 1.2578125, + "learning_rate": 1.8564356435643564e-05, + "loss": 1.9797, "step": 75 }, { "epoch": 0.09894867037724181, - "grad_norm": 3.0, - "learning_rate": 1.32013201320132e-05, - "loss": 2.5252, + "grad_norm": 1.09375, + "learning_rate": 1.9801980198019803e-05, + "loss": 1.9154, "step": 80 }, { "epoch": 0.10513296227581942, - "grad_norm": 1.4921875, - "learning_rate": 1.4026402640264028e-05, - "loss": 2.4702, + "grad_norm": 1.9140625, + "learning_rate": 2.103960396039604e-05, + "loss": 1.8348, "step": 85 }, { "epoch": 0.11131725417439703, - "grad_norm": 1.453125, - "learning_rate": 1.4851485148514851e-05, - "loss": 2.3871, + "grad_norm": 1.890625, + "learning_rate": 2.227722772277228e-05, + "loss": 1.7348, "step": 90 }, { "epoch": 0.11750154607297464, - "grad_norm": 2.84375, - "learning_rate": 1.567656765676568e-05, - "loss": 2.3574, + "grad_norm": 1.296875, + "learning_rate": 2.3514851485148515e-05, + "loss": 1.705, "step": 95 }, { "epoch": 0.12368583797155226, - "grad_norm": 1.6015625, - "learning_rate": 1.6501650165016504e-05, - "loss": 2.3075, + "grad_norm": 1.875, + "learning_rate": 2.4752475247524754e-05, + "loss": 1.643, "step": 100 }, { "epoch": 0.12987012987012986, - "grad_norm": 1.6484375, - "learning_rate": 1.7326732673267325e-05, - "loss": 2.2358, + "grad_norm": 0.5703125, + "learning_rate": 2.5990099009900993e-05, + "loss": 1.5904, "step": 105 }, { "epoch": 0.1360544217687075, - "grad_norm": 2.25, - "learning_rate": 1.8151815181518153e-05, - "loss": 2.1817, + "grad_norm": 2.078125, + "learning_rate": 2.722772277227723e-05, + "loss": 1.537, "step": 110 }, { "epoch": 0.1422387136672851, - "grad_norm": 1.1328125, - "learning_rate": 1.8976897689768978e-05, - "loss": 2.1457, + "grad_norm": 0.546875, + "learning_rate": 2.8465346534653464e-05, + "loss": 1.4979, "step": 115 }, { "epoch": 0.14842300556586271, - "grad_norm": 0.7890625, - "learning_rate": 1.9801980198019803e-05, - "loss": 2.1015, + "grad_norm": 0.474609375, + "learning_rate": 2.9702970297029702e-05, + "loss": 1.4599, "step": 120 }, { "epoch": 0.15460729746444032, - "grad_norm": 5.375, - "learning_rate": 2.0627062706270627e-05, - "loss": 2.0488, + "grad_norm": 0.68359375, + "learning_rate": 3.094059405940594e-05, + "loss": 1.4298, "step": 125 }, { "epoch": 0.16079158936301793, - "grad_norm": 1.0078125, - "learning_rate": 2.1452145214521452e-05, - "loss": 1.9967, + "grad_norm": 0.57421875, + "learning_rate": 3.217821782178218e-05, + "loss": 1.3996, "step": 130 }, { "epoch": 0.16697588126159554, - "grad_norm": 1.8359375, - "learning_rate": 2.227722772277228e-05, - "loss": 1.9499, + "grad_norm": 0.431640625, + "learning_rate": 3.341584158415842e-05, + "loss": 1.3705, "step": 135 }, { "epoch": 0.17316017316017315, - "grad_norm": 5.3125, - "learning_rate": 2.31023102310231e-05, - "loss": 1.9143, + "grad_norm": 0.482421875, + "learning_rate": 3.465346534653465e-05, + "loss": 1.3501, "step": 140 }, { "epoch": 0.17934446505875076, - "grad_norm": 5.625, - "learning_rate": 2.392739273927393e-05, - "loss": 1.8633, + "grad_norm": 0.373046875, + "learning_rate": 3.589108910891089e-05, + "loss": 1.3185, "step": 145 }, { "epoch": 0.18552875695732837, - "grad_norm": 1.9921875, - "learning_rate": 2.4752475247524754e-05, - "loss": 1.8392, + "grad_norm": 0.376953125, + "learning_rate": 3.712871287128713e-05, + "loss": 1.3105, "step": 150 }, { "epoch": 0.191713048855906, - "grad_norm": 1.859375, - "learning_rate": 2.557755775577558e-05, - "loss": 1.7943, + "grad_norm": 0.734375, + "learning_rate": 3.8366336633663367e-05, + "loss": 1.2914, "step": 155 }, { "epoch": 0.19789734075448362, - "grad_norm": 1.6015625, - "learning_rate": 2.64026402640264e-05, - "loss": 1.7733, + "grad_norm": 0.423828125, + "learning_rate": 3.9603960396039605e-05, + "loss": 1.2799, "step": 160 }, { "epoch": 0.20408163265306123, - "grad_norm": 1.5, - "learning_rate": 2.722772277227723e-05, - "loss": 1.7403, + "grad_norm": 0.4765625, + "learning_rate": 4.0841584158415844e-05, + "loss": 1.2651, "step": 165 }, { "epoch": 0.21026592455163884, - "grad_norm": 1.53125, - "learning_rate": 2.8052805280528056e-05, - "loss": 1.6989, + "grad_norm": 0.5078125, + "learning_rate": 4.207920792079208e-05, + "loss": 1.2502, "step": 170 }, { "epoch": 0.21645021645021645, - "grad_norm": 1.4453125, - "learning_rate": 2.8877887788778878e-05, - "loss": 1.6679, + "grad_norm": 0.486328125, + "learning_rate": 4.331683168316832e-05, + "loss": 1.2377, "step": 175 }, { "epoch": 0.22263450834879406, - "grad_norm": 1.59375, - "learning_rate": 2.9702970297029702e-05, - "loss": 1.629, + "grad_norm": 0.46484375, + "learning_rate": 4.455445544554456e-05, + "loss": 1.2192, "step": 180 }, { "epoch": 0.22881880024737167, - "grad_norm": 0.8671875, - "learning_rate": 3.052805280528053e-05, - "loss": 1.6264, + "grad_norm": 0.5078125, + "learning_rate": 4.57920792079208e-05, + "loss": 1.2274, "step": 185 }, { "epoch": 0.23500309214594928, - "grad_norm": 1.2109375, - "learning_rate": 3.135313531353136e-05, - "loss": 1.581, + "grad_norm": 0.578125, + "learning_rate": 4.702970297029703e-05, + "loss": 1.2065, "step": 190 }, { "epoch": 0.24118738404452691, - "grad_norm": 0.435546875, - "learning_rate": 3.217821782178218e-05, - "loss": 1.5598, + "grad_norm": 0.380859375, + "learning_rate": 4.826732673267327e-05, + "loss": 1.1975, "step": 195 }, { "epoch": 0.24737167594310452, - "grad_norm": 0.8828125, - "learning_rate": 3.300330033003301e-05, - "loss": 1.5259, + "grad_norm": 0.35546875, + "learning_rate": 4.950495049504951e-05, + "loss": 1.188, "step": 200 }, { "epoch": 0.2535559678416821, - "grad_norm": 0.38671875, - "learning_rate": 3.382838283828383e-05, - "loss": 1.4943, + "grad_norm": 0.376953125, + "learning_rate": 5.074257425742575e-05, + "loss": 1.1719, "step": 205 }, { "epoch": 0.2597402597402597, - "grad_norm": 0.59375, - "learning_rate": 3.465346534653465e-05, - "loss": 1.4984, + "grad_norm": 0.5078125, + "learning_rate": 5.1980198019801986e-05, + "loss": 1.187, "step": 210 }, { "epoch": 0.2659245516388373, - "grad_norm": 0.431640625, - "learning_rate": 3.5478547854785485e-05, - "loss": 1.4686, + "grad_norm": 0.439453125, + "learning_rate": 5.3217821782178224e-05, + "loss": 1.1684, "step": 215 }, { "epoch": 0.272108843537415, - "grad_norm": 0.6484375, - "learning_rate": 3.6303630363036307e-05, - "loss": 1.4699, + "grad_norm": 0.490234375, + "learning_rate": 5.445544554455446e-05, + "loss": 1.1803, "step": 220 }, { "epoch": 0.2782931354359926, - "grad_norm": 1.1484375, - "learning_rate": 3.712871287128713e-05, - "loss": 1.4291, + "grad_norm": 0.46875, + "learning_rate": 5.56930693069307e-05, + "loss": 1.1543, "step": 225 }, { "epoch": 0.2844774273345702, - "grad_norm": 0.4609375, - "learning_rate": 3.7953795379537956e-05, - "loss": 1.4235, + "grad_norm": 0.486328125, + "learning_rate": 5.693069306930693e-05, + "loss": 1.1618, "step": 230 }, { "epoch": 0.2906617192331478, - "grad_norm": 0.48828125, - "learning_rate": 3.877887788778878e-05, - "loss": 1.4041, + "grad_norm": 0.376953125, + "learning_rate": 5.8168316831683166e-05, + "loss": 1.1529, "step": 235 }, { "epoch": 0.29684601113172543, - "grad_norm": 0.40234375, - "learning_rate": 3.9603960396039605e-05, - "loss": 1.3848, + "grad_norm": 0.4765625, + "learning_rate": 5.9405940594059404e-05, + "loss": 1.1397, "step": 240 }, { "epoch": 0.30303030303030304, - "grad_norm": 0.90234375, - "learning_rate": 4.042904290429043e-05, - "loss": 1.3611, + "grad_norm": 0.462890625, + "learning_rate": 6.064356435643564e-05, + "loss": 1.1355, "step": 245 }, { "epoch": 0.30921459492888065, - "grad_norm": 0.314453125, - "learning_rate": 4.1254125412541255e-05, - "loss": 1.3558, + "grad_norm": 0.57421875, + "learning_rate": 6.188118811881188e-05, + "loss": 1.1376, "step": 250 }, { "epoch": 0.31539888682745826, - "grad_norm": 0.455078125, - "learning_rate": 4.207920792079208e-05, - "loss": 1.3451, + "grad_norm": 0.83203125, + "learning_rate": 6.311881188118812e-05, + "loss": 1.1359, "step": 255 }, { "epoch": 0.32158317872603587, - "grad_norm": 0.328125, - "learning_rate": 4.2904290429042904e-05, - "loss": 1.3326, + "grad_norm": 0.64453125, + "learning_rate": 6.435643564356436e-05, + "loss": 1.1293, "step": 260 }, { "epoch": 0.3277674706246135, - "grad_norm": 0.62890625, - "learning_rate": 4.372937293729373e-05, - "loss": 1.3084, + "grad_norm": 0.78125, + "learning_rate": 6.55940594059406e-05, + "loss": 1.117, "step": 265 }, { "epoch": 0.3339517625231911, - "grad_norm": 0.306640625, - "learning_rate": 4.455445544554456e-05, - "loss": 1.3091, + "grad_norm": 0.54296875, + "learning_rate": 6.683168316831684e-05, + "loss": 1.1217, "step": 270 }, { "epoch": 0.3401360544217687, - "grad_norm": 0.33984375, - "learning_rate": 4.537953795379538e-05, - "loss": 1.3078, + "grad_norm": 0.62109375, + "learning_rate": 6.806930693069308e-05, + "loss": 1.1266, "step": 275 }, { "epoch": 0.3463203463203463, - "grad_norm": 0.4296875, - "learning_rate": 4.62046204620462e-05, - "loss": 1.2944, + "grad_norm": 0.90234375, + "learning_rate": 6.93069306930693e-05, + "loss": 1.1214, "step": 280 }, { "epoch": 0.3525046382189239, - "grad_norm": 0.447265625, - "learning_rate": 4.702970297029703e-05, - "loss": 1.2689, + "grad_norm": 0.4921875, + "learning_rate": 7.054455445544554e-05, + "loss": 1.1032, "step": 285 }, { "epoch": 0.3586889301175015, - "grad_norm": 0.52734375, - "learning_rate": 4.785478547854786e-05, - "loss": 1.2907, + "grad_norm": 0.76171875, + "learning_rate": 7.178217821782178e-05, + "loss": 1.1243, "step": 290 }, { "epoch": 0.36487322201607914, - "grad_norm": 0.3515625, - "learning_rate": 4.867986798679868e-05, - "loss": 1.2734, + "grad_norm": 0.455078125, + "learning_rate": 7.301980198019802e-05, + "loss": 1.1129, "step": 295 }, { "epoch": 0.37105751391465674, - "grad_norm": 0.408203125, - "learning_rate": 4.950495049504951e-05, - "loss": 1.2533, + "grad_norm": 0.4609375, + "learning_rate": 7.425742574257426e-05, + "loss": 1.0952, "step": 300 }, { "epoch": 0.3772418058132344, - "grad_norm": 0.392578125, - "learning_rate": 5.0330033003300336e-05, - "loss": 1.2568, + "grad_norm": 0.453125, + "learning_rate": 7.54950495049505e-05, + "loss": 1.1083, "step": 305 }, { "epoch": 0.383426097711812, - "grad_norm": 0.48828125, - "learning_rate": 5.115511551155116e-05, - "loss": 1.2476, + "grad_norm": 0.50390625, + "learning_rate": 7.673267326732673e-05, + "loss": 1.0979, "step": 310 }, { "epoch": 0.38961038961038963, - "grad_norm": 0.5078125, - "learning_rate": 5.1980198019801986e-05, - "loss": 1.2506, + "grad_norm": 0.494140625, + "learning_rate": 7.797029702970297e-05, + "loss": 1.1026, "step": 315 }, { "epoch": 0.39579468150896724, - "grad_norm": 0.337890625, - "learning_rate": 5.28052805280528e-05, - "loss": 1.2311, + "grad_norm": 0.443359375, + "learning_rate": 7.920792079207921e-05, + "loss": 1.0854, "step": 320 }, { "epoch": 0.40197897340754485, - "grad_norm": 0.357421875, - "learning_rate": 5.3630363036303635e-05, - "loss": 1.2302, + "grad_norm": 0.55078125, + "learning_rate": 8.044554455445545e-05, + "loss": 1.0892, "step": 325 }, { "epoch": 0.40816326530612246, - "grad_norm": 0.380859375, - "learning_rate": 5.445544554455446e-05, - "loss": 1.2182, + "grad_norm": 1.1640625, + "learning_rate": 8.168316831683169e-05, + "loss": 1.0795, "step": 330 }, { "epoch": 0.41434755720470007, - "grad_norm": 0.376953125, - "learning_rate": 5.528052805280528e-05, - "loss": 1.2322, + "grad_norm": 0.67578125, + "learning_rate": 8.292079207920793e-05, + "loss": 1.099, "step": 335 }, { "epoch": 0.4205318491032777, - "grad_norm": 0.466796875, - "learning_rate": 5.610561056105611e-05, - "loss": 1.2181, + "grad_norm": 0.65625, + "learning_rate": 8.415841584158417e-05, + "loss": 1.087, "step": 340 }, { "epoch": 0.4267161410018553, - "grad_norm": 0.55078125, - "learning_rate": 5.693069306930693e-05, - "loss": 1.2136, + "grad_norm": 0.48828125, + "learning_rate": 8.53960396039604e-05, + "loss": 1.0812, "step": 345 }, { "epoch": 0.4329004329004329, - "grad_norm": 0.546875, - "learning_rate": 5.7755775577557755e-05, - "loss": 1.2089, + "grad_norm": 0.69921875, + "learning_rate": 8.663366336633664e-05, + "loss": 1.0831, "step": 350 }, { "epoch": 0.4390847247990105, - "grad_norm": 0.4375, - "learning_rate": 5.858085808580859e-05, - "loss": 1.197, + "grad_norm": 0.5390625, + "learning_rate": 8.787128712871288e-05, + "loss": 1.072, "step": 355 }, { "epoch": 0.4452690166975881, - "grad_norm": 0.55859375, - "learning_rate": 5.9405940594059404e-05, - "loss": 1.1929, + "grad_norm": 0.67578125, + "learning_rate": 8.910891089108912e-05, + "loss": 1.0684, "step": 360 }, { "epoch": 0.4514533085961657, - "grad_norm": 0.55859375, - "learning_rate": 6.023102310231023e-05, - "loss": 1.1859, + "grad_norm": 0.625, + "learning_rate": 9.034653465346536e-05, + "loss": 1.0639, "step": 365 }, { "epoch": 0.45763760049474334, - "grad_norm": 0.443359375, - "learning_rate": 6.105610561056106e-05, - "loss": 1.1969, + "grad_norm": 1.234375, + "learning_rate": 9.15841584158416e-05, + "loss": 1.0802, "step": 370 }, { "epoch": 0.46382189239332094, - "grad_norm": 0.46484375, - "learning_rate": 6.188118811881188e-05, - "loss": 1.1898, + "grad_norm": 0.78515625, + "learning_rate": 9.282178217821784e-05, + "loss": 1.0749, "step": 375 }, { "epoch": 0.47000618429189855, - "grad_norm": 0.3828125, - "learning_rate": 6.270627062706272e-05, - "loss": 1.1736, + "grad_norm": 1.03125, + "learning_rate": 9.405940594059406e-05, + "loss": 1.0647, "step": 380 }, { "epoch": 0.47619047619047616, - "grad_norm": 0.458984375, - "learning_rate": 6.353135313531354e-05, - "loss": 1.1827, + "grad_norm": 0.78515625, + "learning_rate": 9.52970297029703e-05, + "loss": 1.0707, "step": 385 }, { "epoch": 0.48237476808905383, - "grad_norm": 0.4921875, - "learning_rate": 6.435643564356436e-05, - "loss": 1.1792, + "grad_norm": 1.0703125, + "learning_rate": 9.653465346534654e-05, + "loss": 1.0695, "step": 390 }, { "epoch": 0.48855905998763144, - "grad_norm": 0.375, - "learning_rate": 6.518151815181518e-05, - "loss": 1.1674, + "grad_norm": 0.671875, + "learning_rate": 9.777227722772278e-05, + "loss": 1.0599, "step": 395 }, { "epoch": 0.49474335188620905, - "grad_norm": 0.349609375, - "learning_rate": 6.600660066006602e-05, - "loss": 1.1585, + "grad_norm": 0.62890625, + "learning_rate": 9.900990099009902e-05, + "loss": 1.0522, "step": 400 }, { "epoch": 0.5009276437847866, - "grad_norm": 0.333984375, - "learning_rate": 6.683168316831684e-05, - "loss": 1.1601, + "grad_norm": 0.4453125, + "learning_rate": 0.00010024752475247526, + "loss": 1.0553, "step": 405 }, { "epoch": 0.5071119356833642, - "grad_norm": 0.400390625, - "learning_rate": 6.765676567656766e-05, - "loss": 1.1643, + "grad_norm": 0.482421875, + "learning_rate": 0.0001014851485148515, + "loss": 1.0575, "step": 410 }, { "epoch": 0.5132962275819418, - "grad_norm": 0.404296875, - "learning_rate": 6.848184818481849e-05, - "loss": 1.1668, + "grad_norm": 0.57421875, + "learning_rate": 0.00010272277227722773, + "loss": 1.0644, "step": 415 }, { "epoch": 0.5194805194805194, - "grad_norm": 0.4609375, - "learning_rate": 6.93069306930693e-05, - "loss": 1.1485, + "grad_norm": 1.015625, + "learning_rate": 0.00010396039603960397, + "loss": 1.0479, "step": 420 }, { "epoch": 0.525664811379097, - "grad_norm": 0.376953125, - "learning_rate": 7.013201320132014e-05, - "loss": 1.1561, + "grad_norm": 0.60546875, + "learning_rate": 0.00010519801980198021, + "loss": 1.0548, "step": 425 }, { "epoch": 0.5318491032776747, - "grad_norm": 0.40625, - "learning_rate": 7.095709570957097e-05, - "loss": 1.1572, + "grad_norm": 0.56640625, + "learning_rate": 0.00010643564356435645, + "loss": 1.0569, "step": 430 }, { "epoch": 0.5380333951762524, - "grad_norm": 0.41015625, - "learning_rate": 7.178217821782178e-05, - "loss": 1.1527, + "grad_norm": 0.486328125, + "learning_rate": 0.00010767326732673269, + "loss": 1.0541, "step": 435 }, { "epoch": 0.54421768707483, - "grad_norm": 0.419921875, - "learning_rate": 7.260726072607261e-05, - "loss": 1.1559, + "grad_norm": 0.4765625, + "learning_rate": 0.00010891089108910893, + "loss": 1.0588, "step": 440 }, { "epoch": 0.5504019789734076, - "grad_norm": 0.40234375, - "learning_rate": 7.343234323432343e-05, - "loss": 1.1587, + "grad_norm": 0.55078125, + "learning_rate": 0.00011014851485148517, + "loss": 1.0609, "step": 445 }, { "epoch": 0.5565862708719852, - "grad_norm": 0.46484375, - "learning_rate": 7.425742574257426e-05, - "loss": 1.1394, + "grad_norm": 0.6171875, + "learning_rate": 0.0001113861386138614, + "loss": 1.0463, "step": 450 }, { "epoch": 0.5627705627705628, - "grad_norm": 0.384765625, - "learning_rate": 7.508250825082509e-05, - "loss": 1.1451, + "grad_norm": 0.51953125, + "learning_rate": 0.00011262376237623762, + "loss": 1.0491, "step": 455 }, { "epoch": 0.5689548546691404, - "grad_norm": 0.40625, - "learning_rate": 7.590759075907591e-05, - "loss": 1.1405, + "grad_norm": 0.455078125, + "learning_rate": 0.00011386138613861385, + "loss": 1.0472, "step": 460 }, { "epoch": 0.575139146567718, "grad_norm": 0.55859375, - "learning_rate": 7.673267326732673e-05, - "loss": 1.1397, + "learning_rate": 0.00011509900990099009, + "loss": 1.0456, "step": 465 }, { "epoch": 0.5813234384662956, - "grad_norm": 0.49609375, - "learning_rate": 7.755775577557755e-05, - "loss": 1.1213, + "grad_norm": 0.515625, + "learning_rate": 0.00011633663366336633, + "loss": 1.0289, "step": 470 }, { "epoch": 0.5875077303648732, - "grad_norm": 0.40234375, - "learning_rate": 7.838283828382839e-05, - "loss": 1.1313, + "grad_norm": 0.447265625, + "learning_rate": 0.00011757425742574257, + "loss": 1.04, "step": 475 }, { "epoch": 0.5936920222634509, - "grad_norm": 0.52734375, - "learning_rate": 7.920792079207921e-05, - "loss": 1.1268, + "grad_norm": 0.5234375, + "learning_rate": 0.00011881188118811881, + "loss": 1.0322, "step": 480 }, { "epoch": 0.5998763141620285, - "grad_norm": 0.39453125, - "learning_rate": 8.003300330033003e-05, - "loss": 1.1137, + "grad_norm": 0.4609375, + "learning_rate": 0.00012004950495049505, + "loss": 1.0225, "step": 485 }, { "epoch": 0.6060606060606061, - "grad_norm": 0.578125, - "learning_rate": 8.085808580858087e-05, - "loss": 1.1352, + "grad_norm": 0.451171875, + "learning_rate": 0.00012128712871287129, + "loss": 1.0438, "step": 490 }, { "epoch": 0.6122448979591837, - "grad_norm": 0.4140625, - "learning_rate": 8.168316831683169e-05, - "loss": 1.1321, + "grad_norm": 0.6484375, + "learning_rate": 0.00012252475247524753, + "loss": 1.0431, "step": 495 }, { "epoch": 0.6184291898577613, - "grad_norm": 0.396484375, - "learning_rate": 8.250825082508251e-05, - "loss": 1.116, + "grad_norm": 0.65234375, + "learning_rate": 0.00012376237623762376, + "loss": 1.0262, "step": 500 }, { "epoch": 0.6246134817563389, - "grad_norm": 0.5703125, - "learning_rate": 8.333333333333334e-05, - "loss": 1.1233, + "grad_norm": 0.60546875, + "learning_rate": 0.000125, + "loss": 1.0338, "step": 505 }, { "epoch": 0.6307977736549165, - "grad_norm": 0.50390625, - "learning_rate": 8.415841584158417e-05, - "loss": 1.1291, + "grad_norm": 0.65625, + "learning_rate": 0.00012623762376237624, + "loss": 1.0423, "step": 510 }, { "epoch": 0.6369820655534941, - "grad_norm": 0.69140625, - "learning_rate": 8.498349834983499e-05, - "loss": 1.1307, + "grad_norm": 0.3984375, + "learning_rate": 0.00012747524752475248, + "loss": 1.0428, "step": 515 }, { "epoch": 0.6431663574520717, - "grad_norm": 0.54296875, - "learning_rate": 8.580858085808581e-05, - "loss": 1.1188, + "grad_norm": 0.57421875, + "learning_rate": 0.00012871287128712872, + "loss": 1.0311, "step": 520 }, { "epoch": 0.6493506493506493, - "grad_norm": 0.490234375, - "learning_rate": 8.663366336633664e-05, - "loss": 1.116, + "grad_norm": 0.5859375, + "learning_rate": 0.00012995049504950496, + "loss": 1.0284, "step": 525 }, { "epoch": 0.655534941249227, - "grad_norm": 0.51953125, - "learning_rate": 8.745874587458746e-05, - "loss": 1.1128, + "grad_norm": 0.5546875, + "learning_rate": 0.0001311881188118812, + "loss": 1.0259, "step": 530 }, { "epoch": 0.6617192331478046, - "grad_norm": 0.58203125, - "learning_rate": 8.828382838283829e-05, - "loss": 1.1046, + "grad_norm": 0.6171875, + "learning_rate": 0.00013242574257425743, + "loss": 1.0183, "step": 535 }, { "epoch": 0.6679035250463822, - "grad_norm": 0.69921875, - "learning_rate": 8.910891089108912e-05, - "loss": 1.1184, + "grad_norm": 1.1015625, + "learning_rate": 0.00013366336633663367, + "loss": 1.0342, "step": 540 }, { "epoch": 0.6740878169449598, - "grad_norm": 0.55859375, - "learning_rate": 8.993399339933993e-05, - "loss": 1.1039, + "grad_norm": 0.64453125, + "learning_rate": 0.0001349009900990099, + "loss": 1.0192, "step": 545 }, { "epoch": 0.6802721088435374, - "grad_norm": 0.65234375, - "learning_rate": 9.075907590759076e-05, - "loss": 1.1097, + "grad_norm": 0.74609375, + "learning_rate": 0.00013613861386138615, + "loss": 1.0251, "step": 550 }, { "epoch": 0.686456400742115, - "grad_norm": 0.486328125, - "learning_rate": 9.15841584158416e-05, - "loss": 1.1088, + "grad_norm": 1.0078125, + "learning_rate": 0.0001373762376237624, + "loss": 1.0268, "step": 555 }, { "epoch": 0.6926406926406926, - "grad_norm": 0.85546875, - "learning_rate": 9.24092409240924e-05, - "loss": 1.0955, + "grad_norm": 0.7265625, + "learning_rate": 0.0001386138613861386, + "loss": 1.0152, "step": 560 }, { "epoch": 0.6988249845392702, - "grad_norm": 0.57421875, - "learning_rate": 9.323432343234324e-05, - "loss": 1.1105, + "grad_norm": 0.703125, + "learning_rate": 0.00013985148514851484, + "loss": 1.0283, "step": 565 }, { "epoch": 0.7050092764378478, - "grad_norm": 0.6953125, - "learning_rate": 9.405940594059406e-05, - "loss": 1.1024, + "grad_norm": 0.53125, + "learning_rate": 0.00014108910891089108, + "loss": 1.0177, "step": 570 }, { "epoch": 0.7111935683364254, - "grad_norm": 0.5078125, - "learning_rate": 9.488448844884488e-05, - "loss": 1.0922, + "grad_norm": 0.58203125, + "learning_rate": 0.00014232673267326732, + "loss": 1.0115, "step": 575 }, { "epoch": 0.717377860235003, - "grad_norm": 0.515625, - "learning_rate": 9.570957095709572e-05, - "loss": 1.0985, - "step": 580 + "grad_norm": 0.5546875, + "learning_rate": 0.00014356435643564356, + "loss": 1.0195, + "step": 580 }, { "epoch": 0.7235621521335807, - "grad_norm": 0.5390625, - "learning_rate": 9.653465346534654e-05, - "loss": 1.0908, + "grad_norm": 0.5703125, + "learning_rate": 0.0001448019801980198, + "loss": 1.0104, "step": 585 }, { "epoch": 0.7297464440321583, - "grad_norm": 0.60546875, - "learning_rate": 9.735973597359736e-05, - "loss": 1.0882, + "grad_norm": 0.6171875, + "learning_rate": 0.00014603960396039603, + "loss": 1.0054, "step": 590 }, { "epoch": 0.7359307359307359, - "grad_norm": 0.451171875, - "learning_rate": 9.818481848184818e-05, - "loss": 1.102, + "grad_norm": 0.76953125, + "learning_rate": 0.00014727722772277227, + "loss": 1.0204, "step": 595 }, { "epoch": 0.7421150278293135, - "grad_norm": 0.5625, - "learning_rate": 9.900990099009902e-05, - "loss": 1.0799, + "grad_norm": 0.447265625, + "learning_rate": 0.0001485148514851485, + "loss": 0.9982, "step": 600 }, { "epoch": 0.7482993197278912, - "grad_norm": 0.57421875, - "learning_rate": 9.983498349834984e-05, - "loss": 1.0933, + "grad_norm": 0.498046875, + "learning_rate": 0.00014975247524752475, + "loss": 1.0143, "step": 605 }, { "epoch": 0.7544836116264688, - "grad_norm": 0.4765625, - "learning_rate": 0.00010066006600660067, - "loss": 1.0881, + "grad_norm": 0.6171875, + "learning_rate": 0.000150990099009901, + "loss": 1.0077, "step": 610 }, { "epoch": 0.7606679035250464, - "grad_norm": 0.4375, - "learning_rate": 0.0001014851485148515, - "loss": 1.0926, + "grad_norm": 0.515625, + "learning_rate": 0.00015222772277227723, + "loss": 1.0123, "step": 615 }, { "epoch": 0.766852195423624, "grad_norm": 0.69921875, - "learning_rate": 0.00010231023102310232, - "loss": 1.0931, + "learning_rate": 0.00015346534653465347, + "loss": 1.0123, "step": 620 }, { "epoch": 0.7730364873222016, - "grad_norm": 0.55859375, - "learning_rate": 0.00010313531353135315, - "loss": 1.0923, + "grad_norm": 0.4765625, + "learning_rate": 0.0001547029702970297, + "loss": 1.012, "step": 625 }, { "epoch": 0.7792207792207793, - "grad_norm": 0.62109375, - "learning_rate": 0.00010396039603960397, - "loss": 1.0854, + "grad_norm": 0.423828125, + "learning_rate": 0.00015594059405940594, + "loss": 1.0055, "step": 630 }, { "epoch": 0.7854050711193569, - "grad_norm": 0.5078125, - "learning_rate": 0.00010478547854785479, - "loss": 1.0864, + "grad_norm": 0.42578125, + "learning_rate": 0.00015717821782178218, + "loss": 1.0076, "step": 635 }, { "epoch": 0.7915893630179345, - "grad_norm": 0.447265625, - "learning_rate": 0.0001056105610561056, - "loss": 1.0863, + "grad_norm": 0.89453125, + "learning_rate": 0.00015841584158415842, + "loss": 1.007, "step": 640 }, { "epoch": 0.7977736549165121, - "grad_norm": 0.73046875, - "learning_rate": 0.00010643564356435645, - "loss": 1.0843, + "grad_norm": 0.462890625, + "learning_rate": 0.00015965346534653466, + "loss": 1.004, "step": 645 }, { "epoch": 0.8039579468150897, - "grad_norm": 0.71484375, - "learning_rate": 0.00010726072607260727, - "loss": 1.0822, + "grad_norm": 0.4765625, + "learning_rate": 0.0001608910891089109, + "loss": 1.0002, "step": 650 }, { "epoch": 0.8101422387136673, - "grad_norm": 0.76953125, - "learning_rate": 0.00010808580858085808, - "loss": 1.0802, + "grad_norm": 0.412109375, + "learning_rate": 0.00016212871287128714, + "loss": 0.9985, "step": 655 }, { "epoch": 0.8163265306122449, - "grad_norm": 0.55078125, - "learning_rate": 0.00010891089108910893, - "loss": 1.0822, + "grad_norm": 0.392578125, + "learning_rate": 0.00016336633663366338, + "loss": 0.9997, "step": 660 }, { "epoch": 0.8225108225108225, - "grad_norm": 0.5546875, - "learning_rate": 0.00010973597359735975, - "loss": 1.076, + "grad_norm": 0.421875, + "learning_rate": 0.00016460396039603961, + "loss": 0.9955, "step": 665 }, { "epoch": 0.8286951144094001, - "grad_norm": 0.515625, - "learning_rate": 0.00011056105610561056, - "loss": 1.0688, + "grad_norm": 0.490234375, + "learning_rate": 0.00016584158415841585, + "loss": 0.9878, "step": 670 }, { "epoch": 0.8348794063079777, - "grad_norm": 0.515625, - "learning_rate": 0.0001113861386138614, - "loss": 1.0839, + "grad_norm": 0.439453125, + "learning_rate": 0.0001670792079207921, + "loss": 1.0049, "step": 675 }, { "epoch": 0.8410636982065554, - "grad_norm": 0.68359375, - "learning_rate": 0.00011221122112211223, - "loss": 1.0802, + "grad_norm": 0.70703125, + "learning_rate": 0.00016831683168316833, + "loss": 1.0004, "step": 680 }, { "epoch": 0.847247990105133, - "grad_norm": 0.515625, - "learning_rate": 0.00011303630363036303, - "loss": 1.0699, + "grad_norm": 0.74609375, + "learning_rate": 0.00016955445544554457, + "loss": 0.9907, "step": 685 }, { "epoch": 0.8534322820037106, - "grad_norm": 0.5546875, - "learning_rate": 0.00011386138613861385, - "loss": 1.0692, + "grad_norm": 0.6015625, + "learning_rate": 0.0001707920792079208, + "loss": 0.992, "step": 690 }, { "epoch": 0.8596165739022882, - "grad_norm": 0.470703125, - "learning_rate": 0.0001146864686468647, - "loss": 1.0756, + "grad_norm": 0.671875, + "learning_rate": 0.00017202970297029705, + "loss": 0.9973, "step": 695 }, { "epoch": 0.8658008658008658, - "grad_norm": 0.53125, - "learning_rate": 0.00011551155115511551, - "loss": 1.0819, + "grad_norm": 0.47265625, + "learning_rate": 0.00017326732673267329, + "loss": 1.0015, "step": 700 }, { "epoch": 0.8719851576994434, - "grad_norm": 0.478515625, - "learning_rate": 0.00011633663366336633, - "loss": 1.071, + "grad_norm": 0.55859375, + "learning_rate": 0.00017450495049504952, + "loss": 0.9928, "step": 705 }, { "epoch": 0.878169449598021, - "grad_norm": 0.515625, - "learning_rate": 0.00011716171617161718, - "loss": 1.069, + "grad_norm": 0.51171875, + "learning_rate": 0.00017574257425742576, + "loss": 0.9891, "step": 710 }, { "epoch": 0.8843537414965986, - "grad_norm": 0.62890625, - "learning_rate": 0.00011798679867986799, - "loss": 1.0676, + "grad_norm": 0.376953125, + "learning_rate": 0.000176980198019802, + "loss": 0.9895, "step": 715 }, { "epoch": 0.8905380333951762, - "grad_norm": 0.69140625, - "learning_rate": 0.00011881188118811881, - "loss": 1.0562, + "grad_norm": 0.53515625, + "learning_rate": 0.00017821782178217824, + "loss": 0.9783, "step": 720 }, { "epoch": 0.8967223252937538, - "grad_norm": 0.5859375, - "learning_rate": 0.00011963696369636966, - "loss": 1.0725, + "grad_norm": 0.9375, + "learning_rate": 0.00017945544554455448, + "loss": 0.9943, "step": 725 }, { "epoch": 0.9029066171923315, - "grad_norm": 0.60546875, - "learning_rate": 0.00012046204620462047, - "loss": 1.0705, + "grad_norm": 0.63671875, + "learning_rate": 0.00018069306930693072, + "loss": 0.9906, "step": 730 }, { "epoch": 0.9090909090909091, - "grad_norm": 0.5703125, - "learning_rate": 0.00012128712871287129, - "loss": 1.071, + "grad_norm": 0.416015625, + "learning_rate": 0.00018193069306930696, + "loss": 0.9906, "step": 735 }, { "epoch": 0.9152752009894867, - "grad_norm": 0.64453125, - "learning_rate": 0.00012211221122112212, - "loss": 1.0615, + "grad_norm": 0.5234375, + "learning_rate": 0.0001831683168316832, + "loss": 0.9826, "step": 740 }, { "epoch": 0.9214594928880643, - "grad_norm": 0.56640625, - "learning_rate": 0.00012293729372937296, - "loss": 1.0725, + "grad_norm": 0.470703125, + "learning_rate": 0.00018440594059405943, + "loss": 0.9912, "step": 745 }, { "epoch": 0.9276437847866419, - "grad_norm": 0.609375, - "learning_rate": 0.00012376237623762376, - "loss": 1.0583, + "grad_norm": 0.4609375, + "learning_rate": 0.00018564356435643567, + "loss": 0.9796, "step": 750 }, { "epoch": 0.9338280766852195, - "grad_norm": 0.7265625, - "learning_rate": 0.0001245874587458746, - "loss": 1.059, + "grad_norm": 0.53125, + "learning_rate": 0.0001868811881188119, + "loss": 0.9801, "step": 755 }, { "epoch": 0.9400123685837971, - "grad_norm": 0.54296875, - "learning_rate": 0.00012541254125412543, - "loss": 1.0561, + "grad_norm": 0.5234375, + "learning_rate": 0.00018811881188118812, + "loss": 0.9776, "step": 760 }, { "epoch": 0.9461966604823747, - "grad_norm": 0.7578125, - "learning_rate": 0.00012623762376237624, - "loss": 1.0576, + "grad_norm": 0.380859375, + "learning_rate": 0.00018935643564356436, + "loss": 0.9785, "step": 765 }, { "epoch": 0.9523809523809523, - "grad_norm": 0.5859375, - "learning_rate": 0.00012706270627062708, - "loss": 1.056, + "grad_norm": 0.40625, + "learning_rate": 0.0001905940594059406, + "loss": 0.9771, "step": 770 }, { "epoch": 0.95856524427953, - "grad_norm": 0.58984375, - "learning_rate": 0.0001278877887788779, - "loss": 1.0583, + "grad_norm": 0.44140625, + "learning_rate": 0.00019183168316831684, + "loss": 0.9786, "step": 775 }, { "epoch": 0.9647495361781077, - "grad_norm": 0.53515625, - "learning_rate": 0.00012871287128712872, - "loss": 1.0501, + "grad_norm": 0.443359375, + "learning_rate": 0.00019306930693069308, + "loss": 0.9706, "step": 780 }, { "epoch": 0.9709338280766853, - "grad_norm": 0.578125, - "learning_rate": 0.00012953795379537955, - "loss": 1.057, + "grad_norm": 0.4296875, + "learning_rate": 0.00019430693069306932, + "loss": 0.9776, "step": 785 }, { "epoch": 0.9771181199752629, - "grad_norm": 0.55078125, - "learning_rate": 0.00013036303630363036, - "loss": 1.0529, + "grad_norm": 0.486328125, + "learning_rate": 0.00019554455445544556, + "loss": 0.9755, "step": 790 }, { "epoch": 0.9833024118738405, - "grad_norm": 0.482421875, - "learning_rate": 0.0001311881188118812, - "loss": 1.0481, + "grad_norm": 0.42578125, + "learning_rate": 0.0001967821782178218, + "loss": 0.9723, "step": 795 }, { "epoch": 0.9894867037724181, - "grad_norm": 0.66796875, - "learning_rate": 0.00013201320132013203, - "loss": 1.0671, + "grad_norm": 0.609375, + "learning_rate": 0.00019801980198019803, + "loss": 0.9878, "step": 800 }, { "epoch": 0.9956709956709957, - "grad_norm": 0.546875, - "learning_rate": 0.00013283828382838284, - "loss": 1.0494, + "grad_norm": 0.4609375, + "learning_rate": 0.00019925742574257427, + "loss": 0.9714, "step": 805 }, { "epoch": 0.9993815708101422, - "eval_loss": 2.4662458896636963, - "eval_runtime": 0.6569, - "eval_samples_per_second": 15.223, - "eval_steps_per_second": 1.522, + "eval_loss": 2.4534544944763184, + "eval_runtime": 0.806, + "eval_samples_per_second": 12.408, + "eval_steps_per_second": 1.241, "step": 808 }, { "epoch": 1.0018552875695732, - "grad_norm": 0.6953125, - "learning_rate": 0.00013366336633663367, - "loss": 1.0467, + "grad_norm": 0.423828125, + "learning_rate": 0.0001999999626730957, + "loss": 0.9646, "step": 810 }, { "epoch": 1.008039579468151, - "grad_norm": 0.7578125, - "learning_rate": 0.0001344884488448845, - "loss": 1.0493, + "grad_norm": 0.5, + "learning_rate": 0.000199999542745742, + "loss": 0.9632, "step": 815 }, { "epoch": 1.0142238713667284, - "grad_norm": 0.70703125, - "learning_rate": 0.00013531353135313532, - "loss": 1.0417, + "grad_norm": 0.5390625, + "learning_rate": 0.00019999865623437013, + "loss": 0.9571, "step": 820 }, { "epoch": 1.0204081632653061, - "grad_norm": 0.640625, - "learning_rate": 0.00013613861386138615, - "loss": 1.0413, + "grad_norm": 0.4609375, + "learning_rate": 0.00019999730314311637, + "loss": 0.9559, "step": 825 }, { "epoch": 1.0265924551638836, - "grad_norm": 0.8046875, - "learning_rate": 0.00013696369636963699, - "loss": 1.07, + "grad_norm": 0.4140625, + "learning_rate": 0.0001999954834782941, + "loss": 0.9832, "step": 830 }, { "epoch": 1.0327767470624614, - "grad_norm": 0.6015625, - "learning_rate": 0.0001377887788778878, - "loss": 1.0425, + "grad_norm": 0.431640625, + "learning_rate": 0.00019999319724839354, + "loss": 0.9583, "step": 835 }, { "epoch": 1.0389610389610389, - "grad_norm": 0.54296875, - "learning_rate": 0.0001386138613861386, - "loss": 1.0537, + "grad_norm": 0.486328125, + "learning_rate": 0.000199990444464082, + "loss": 0.9696, "step": 840 }, { "epoch": 1.0451453308596166, - "grad_norm": 0.5625, - "learning_rate": 0.00013943894389438946, - "loss": 1.0425, + "grad_norm": 0.73046875, + "learning_rate": 0.0001999872251382036, + "loss": 0.9661, "step": 845 }, { "epoch": 1.051329622758194, - "grad_norm": 0.6171875, - "learning_rate": 0.00014026402640264027, - "loss": 1.0478, + "grad_norm": 0.54296875, + "learning_rate": 0.00019998353928577919, + "loss": 0.9668, "step": 850 }, { "epoch": 1.0575139146567718, - "grad_norm": 0.48046875, - "learning_rate": 0.00014108910891089108, - "loss": 1.0427, + "grad_norm": 0.5078125, + "learning_rate": 0.00019997938692400648, + "loss": 0.9591, "step": 855 }, { "epoch": 1.0636982065553493, - "grad_norm": 0.55078125, - "learning_rate": 0.00014191419141914194, - "loss": 1.0406, + "grad_norm": 0.34375, + "learning_rate": 0.00019997476807225985, + "loss": 0.9564, "step": 860 }, { "epoch": 1.069882498453927, - "grad_norm": 0.466796875, - "learning_rate": 0.00014273927392739275, - "loss": 1.0384, + "grad_norm": 0.369140625, + "learning_rate": 0.0001999696827520902, + "loss": 0.9541, "step": 865 }, { "epoch": 1.0760667903525047, - "grad_norm": 0.61328125, - "learning_rate": 0.00014356435643564356, - "loss": 1.031, + "grad_norm": 0.375, + "learning_rate": 0.00019996413098722493, + "loss": 0.9466, "step": 870 }, { "epoch": 1.0822510822510822, - "grad_norm": 0.6484375, - "learning_rate": 0.00014438943894389442, - "loss": 1.0371, + "grad_norm": 0.38671875, + "learning_rate": 0.00019995811280356778, + "loss": 0.9512, "step": 875 }, { "epoch": 1.08843537414966, - "grad_norm": 0.482421875, - "learning_rate": 0.00014521452145214523, - "loss": 1.0256, + "grad_norm": 0.37109375, + "learning_rate": 0.00019995162822919883, + "loss": 0.9392, "step": 880 }, { "epoch": 1.0946196660482375, - "grad_norm": 0.6640625, - "learning_rate": 0.00014603960396039603, - "loss": 1.049, + "grad_norm": 0.412109375, + "learning_rate": 0.00019994467729437412, + "loss": 0.9599, "step": 885 }, { "epoch": 1.1008039579468152, - "grad_norm": 0.84765625, - "learning_rate": 0.00014686468646864687, - "loss": 1.0373, + "grad_norm": 0.439453125, + "learning_rate": 0.00019993726003152582, + "loss": 0.9506, "step": 890 }, { "epoch": 1.1069882498453927, - "grad_norm": 0.48828125, - "learning_rate": 0.0001476897689768977, - "loss": 1.0404, + "grad_norm": 0.412109375, + "learning_rate": 0.0001999293764752618, + "loss": 0.9553, "step": 895 }, { "epoch": 1.1131725417439704, - "grad_norm": 0.54296875, - "learning_rate": 0.0001485148514851485, - "loss": 1.0224, + "grad_norm": 0.453125, + "learning_rate": 0.00019992102666236566, + "loss": 0.9369, "step": 900 }, { "epoch": 1.119356833642548, - "grad_norm": 0.455078125, - "learning_rate": 0.00014933993399339935, - "loss": 1.035, + "grad_norm": 0.451171875, + "learning_rate": 0.00019991221063179652, + "loss": 0.9516, "step": 905 }, { "epoch": 1.1255411255411256, - "grad_norm": 0.609375, - "learning_rate": 0.00015016501650165018, - "loss": 1.0376, + "grad_norm": 0.40625, + "learning_rate": 0.00019990292842468868, + "loss": 0.954, "step": 910 }, { "epoch": 1.1317254174397031, - "grad_norm": 0.55078125, - "learning_rate": 0.000150990099009901, - "loss": 1.0351, + "grad_norm": 0.396484375, + "learning_rate": 0.00019989318008435165, + "loss": 0.9495, "step": 915 }, { "epoch": 1.1379097093382808, - "grad_norm": 0.65625, - "learning_rate": 0.00015181518151815182, - "loss": 1.041, + "grad_norm": 0.484375, + "learning_rate": 0.00019988296565626987, + "loss": 0.957, "step": 920 }, { "epoch": 1.1440940012368583, - "grad_norm": 0.66015625, - "learning_rate": 0.00015264026402640266, - "loss": 1.0341, + "grad_norm": 0.53515625, + "learning_rate": 0.00019987228518810244, + "loss": 0.9508, "step": 925 }, { "epoch": 1.150278293135436, - "grad_norm": 0.58203125, - "learning_rate": 0.00015346534653465347, - "loss": 1.0292, + "grad_norm": 0.455078125, + "learning_rate": 0.0001998611387296829, + "loss": 0.9433, "step": 930 }, { "epoch": 1.1564625850340136, - "grad_norm": 0.5625, - "learning_rate": 0.0001542904290429043, - "loss": 1.0407, + "grad_norm": 0.36328125, + "learning_rate": 0.00019984952633301915, + "loss": 0.9576, "step": 935 }, { "epoch": 1.1626468769325913, - "grad_norm": 0.6015625, - "learning_rate": 0.0001551155115511551, - "loss": 1.0252, + "grad_norm": 0.408203125, + "learning_rate": 0.00019983744805229296, + "loss": 0.9433, "step": 940 }, { "epoch": 1.1688311688311688, - "grad_norm": 0.6328125, - "learning_rate": 0.00015594059405940594, - "loss": 1.0396, + "grad_norm": 0.435546875, + "learning_rate": 0.00019982490394385995, + "loss": 0.9551, "step": 945 }, { "epoch": 1.1750154607297465, - "grad_norm": 0.63671875, - "learning_rate": 0.00015676567656765678, - "loss": 1.0375, + "grad_norm": 0.4921875, + "learning_rate": 0.00019981189406624922, + "loss": 0.953, "step": 950 }, { "epoch": 1.181199752628324, - "grad_norm": 0.51953125, - "learning_rate": 0.00015759075907590759, - "loss": 1.0245, + "grad_norm": 0.384765625, + "learning_rate": 0.00019979841848016298, + "loss": 0.9393, "step": 955 }, { "epoch": 1.1873840445269017, - "grad_norm": 0.49609375, - "learning_rate": 0.00015841584158415842, - "loss": 1.0366, + "grad_norm": 0.392578125, + "learning_rate": 0.00019978447724847652, + "loss": 0.9523, "step": 960 }, { "epoch": 1.1935683364254792, - "grad_norm": 0.60546875, - "learning_rate": 0.00015924092409240926, - "loss": 1.0336, + "grad_norm": 0.49609375, + "learning_rate": 0.0001997700704362377, + "loss": 0.9481, "step": 965 }, { "epoch": 1.199752628324057, - "grad_norm": 0.52734375, - "learning_rate": 0.00016006600660066006, - "loss": 1.0353, + "grad_norm": 0.400390625, + "learning_rate": 0.00019975519811066663, + "loss": 0.95, "step": 970 }, { "epoch": 1.2059369202226344, - "grad_norm": 0.58203125, - "learning_rate": 0.0001608910891089109, - "loss": 1.0294, + "grad_norm": 0.46875, + "learning_rate": 0.0001997398603411556, + "loss": 0.9416, "step": 975 }, { "epoch": 1.2121212121212122, - "grad_norm": 0.6796875, - "learning_rate": 0.00016171617161716173, - "loss": 1.0297, + "grad_norm": 0.388671875, + "learning_rate": 0.0001997240571992685, + "loss": 0.9443, "step": 980 }, { "epoch": 1.2183055040197897, - "grad_norm": 0.494140625, - "learning_rate": 0.00016254125412541254, - "loss": 1.0302, + "grad_norm": 0.4140625, + "learning_rate": 0.00019970778875874056, + "loss": 0.9414, "step": 985 }, { "epoch": 1.2244897959183674, - "grad_norm": 0.59375, - "learning_rate": 0.00016336633663366338, - "loss": 1.0242, + "grad_norm": 0.416015625, + "learning_rate": 0.00019969105509547812, + "loss": 0.939, "step": 990 }, { "epoch": 1.2306740878169449, - "grad_norm": 0.765625, - "learning_rate": 0.0001641914191419142, - "loss": 1.0314, + "grad_norm": 0.45703125, + "learning_rate": 0.00019967385628755812, + "loss": 0.9453, "step": 995 }, { "epoch": 1.2368583797155226, - "grad_norm": 0.5859375, - "learning_rate": 0.00016501650165016502, - "loss": 1.0194, + "grad_norm": 0.3828125, + "learning_rate": 0.0001996561924152278, + "loss": 0.935, "step": 1000 }, { "epoch": 1.2430426716141, - "grad_norm": 0.5546875, - "learning_rate": 0.00016584158415841585, - "loss": 1.0178, + "grad_norm": 0.345703125, + "learning_rate": 0.0001996380635609044, + "loss": 0.9314, "step": 1005 }, { "epoch": 1.2492269635126778, - "grad_norm": 0.59765625, - "learning_rate": 0.0001666666666666667, - "loss": 1.0241, + "grad_norm": 0.4765625, + "learning_rate": 0.00019961946980917456, + "loss": 0.9376, "step": 1010 }, { "epoch": 1.2554112554112553, - "grad_norm": 0.68359375, - "learning_rate": 0.0001674917491749175, - "loss": 1.0271, + "grad_norm": 0.40234375, + "learning_rate": 0.00019960041124679421, + "loss": 0.941, "step": 1015 }, { "epoch": 1.261595547309833, - "grad_norm": 0.515625, - "learning_rate": 0.00016831683168316833, - "loss": 1.0307, + "grad_norm": 0.443359375, + "learning_rate": 0.00019958088796268793, + "loss": 0.9455, "step": 1020 }, { "epoch": 1.2677798392084108, - "grad_norm": 0.57421875, - "learning_rate": 0.00016914191419141917, - "loss": 1.0366, + "grad_norm": 3.625, + "learning_rate": 0.00019956090004794868, + "loss": 0.9533, "step": 1025 }, { "epoch": 1.2739641311069883, - "grad_norm": 0.86328125, - "learning_rate": 0.00016996699669966997, - "loss": 1.0276, + "grad_norm": 0.99609375, + "learning_rate": 0.0001995404475958373, + "loss": 0.9513, "step": 1030 }, { "epoch": 1.2801484230055657, - "grad_norm": 0.8125, - "learning_rate": 0.0001707920792079208, - "loss": 1.0177, + "grad_norm": 0.765625, + "learning_rate": 0.00019951953070178208, + "loss": 0.9381, "step": 1035 }, { "epoch": 1.2863327149041435, - "grad_norm": 0.51953125, - "learning_rate": 0.00017161716171617162, - "loss": 1.0301, + "grad_norm": 0.447265625, + "learning_rate": 0.00019949814946337838, + "loss": 0.9484, "step": 1040 }, { "epoch": 1.2925170068027212, - "grad_norm": 0.5546875, - "learning_rate": 0.00017244224422442245, - "loss": 1.0193, + "grad_norm": 0.609375, + "learning_rate": 0.00019947630398038811, + "loss": 0.9378, "step": 1045 }, { "epoch": 1.2987012987012987, - "grad_norm": 0.5078125, - "learning_rate": 0.00017326732673267329, - "loss": 1.0295, + "grad_norm": 0.384765625, + "learning_rate": 0.00019945399435473922, + "loss": 0.9462, "step": 1050 }, { "epoch": 1.3048855905998762, - "grad_norm": 0.490234375, - "learning_rate": 0.0001740924092409241, - "loss": 1.0248, + "grad_norm": 0.373046875, + "learning_rate": 0.00019943122069052534, + "loss": 0.9385, "step": 1055 }, { "epoch": 1.311069882498454, - "grad_norm": 0.6328125, - "learning_rate": 0.00017491749174917493, - "loss": 1.0218, + "grad_norm": 0.4765625, + "learning_rate": 0.00019940798309400526, + "loss": 0.9374, "step": 1060 }, { "epoch": 1.3172541743970316, - "grad_norm": 0.6171875, - "learning_rate": 0.00017574257425742576, - "loss": 1.0141, + "grad_norm": 0.4140625, + "learning_rate": 0.0001993842816736024, + "loss": 0.9305, "step": 1065 }, { "epoch": 1.3234384662956091, - "grad_norm": 0.5546875, - "learning_rate": 0.00017656765676567657, - "loss": 1.0232, + "grad_norm": 0.419921875, + "learning_rate": 0.00019936011653990426, + "loss": 0.9379, "step": 1070 }, { "epoch": 1.3296227581941866, - "grad_norm": 0.56640625, - "learning_rate": 0.0001773927392739274, - "loss": 1.0198, + "grad_norm": 0.3359375, + "learning_rate": 0.00019933548780566202, + "loss": 0.9347, "step": 1075 }, { "epoch": 1.3358070500927643, - "grad_norm": 0.490234375, - "learning_rate": 0.00017821782178217824, - "loss": 1.0236, + "grad_norm": 0.39453125, + "learning_rate": 0.00019931039558578997, + "loss": 0.9397, "step": 1080 }, { "epoch": 1.341991341991342, - "grad_norm": 0.5625, - "learning_rate": 0.00017904290429042905, - "loss": 1.031, + "grad_norm": 0.55859375, + "learning_rate": 0.00019928483999736492, + "loss": 0.946, "step": 1085 }, { "epoch": 1.3481756338899196, - "grad_norm": 0.51171875, - "learning_rate": 0.00017986798679867986, - "loss": 1.0143, + "grad_norm": 0.349609375, + "learning_rate": 0.00019925882115962568, + "loss": 0.9304, "step": 1090 }, { "epoch": 1.3543599257884973, - "grad_norm": 0.74609375, - "learning_rate": 0.00018069306930693072, - "loss": 1.0273, + "grad_norm": 0.58984375, + "learning_rate": 0.00019923233919397258, + "loss": 0.9405, "step": 1095 }, { "epoch": 1.3605442176870748, - "grad_norm": 0.59765625, - "learning_rate": 0.00018151815181518153, - "loss": 1.0228, + "grad_norm": 0.365234375, + "learning_rate": 0.0001992053942239668, + "loss": 0.9394, "step": 1100 }, { "epoch": 1.3667285095856525, - "grad_norm": 0.6171875, - "learning_rate": 0.00018234323432343233, - "loss": 1.0276, + "grad_norm": 0.41796875, + "learning_rate": 0.0001991779863753298, + "loss": 0.9427, "step": 1105 }, { "epoch": 1.37291280148423, - "grad_norm": 0.5703125, - "learning_rate": 0.0001831683168316832, - "loss": 1.0196, + "grad_norm": 0.375, + "learning_rate": 0.00019915011577594286, + "loss": 0.935, "step": 1110 }, { "epoch": 1.3790970933828077, - "grad_norm": 0.466796875, - "learning_rate": 0.000183993399339934, - "loss": 1.0158, + "grad_norm": 0.380859375, + "learning_rate": 0.00019912178255584632, + "loss": 0.9321, "step": 1115 }, { "epoch": 1.3852813852813852, - "grad_norm": 0.5859375, - "learning_rate": 0.0001848184818481848, - "loss": 1.0063, + "grad_norm": 0.349609375, + "learning_rate": 0.00019909298684723904, + "loss": 0.9222, "step": 1120 }, { "epoch": 1.391465677179963, - "grad_norm": 0.6484375, - "learning_rate": 0.00018564356435643567, - "loss": 1.0192, + "grad_norm": 0.412109375, + "learning_rate": 0.00019906372878447784, + "loss": 0.9359, "step": 1125 }, { "epoch": 1.3976499690785404, - "grad_norm": 0.84765625, - "learning_rate": 0.00018646864686468648, - "loss": 1.0265, + "grad_norm": 0.466796875, + "learning_rate": 0.00019903400850407676, + "loss": 0.94, "step": 1130 }, { "epoch": 1.4038342609771182, - "grad_norm": 0.52734375, - "learning_rate": 0.0001872937293729373, - "loss": 1.0139, + "grad_norm": 0.369140625, + "learning_rate": 0.00019900382614470652, + "loss": 0.9289, "step": 1135 }, { "epoch": 1.4100185528756957, - "grad_norm": 0.54296875, - "learning_rate": 0.00018811881188118812, - "loss": 1.0196, + "grad_norm": 0.361328125, + "learning_rate": 0.00019897318184719385, + "loss": 0.9355, "step": 1140 }, { "epoch": 1.4162028447742734, - "grad_norm": 0.72265625, - "learning_rate": 0.00018894389438943896, - "loss": 1.0174, + "grad_norm": 0.34375, + "learning_rate": 0.00019894207575452076, + "loss": 0.9321, "step": 1145 }, { "epoch": 1.4223871366728509, - "grad_norm": 0.80859375, - "learning_rate": 0.00018976897689768977, - "loss": 1.0146, + "grad_norm": 0.369140625, + "learning_rate": 0.000198910508011824, + "loss": 0.9298, "step": 1150 }, { "epoch": 1.4285714285714286, - "grad_norm": 0.6953125, - "learning_rate": 0.0001905940594059406, - "loss": 1.0182, + "grad_norm": 0.427734375, + "learning_rate": 0.0001988784787663943, + "loss": 0.9316, "step": 1155 }, { "epoch": 1.434755720470006, - "grad_norm": 0.47265625, - "learning_rate": 0.00019141914191419144, - "loss": 1.0137, + "grad_norm": 0.41015625, + "learning_rate": 0.00019884598816767563, + "loss": 0.9288, "step": 1160 }, { "epoch": 1.4409400123685838, - "grad_norm": 0.5234375, - "learning_rate": 0.00019224422442244224, - "loss": 1.0134, + "grad_norm": 0.380859375, + "learning_rate": 0.00019881303636726466, + "loss": 0.927, "step": 1165 }, { "epoch": 1.4471243042671613, - "grad_norm": 0.7109375, - "learning_rate": 0.00019306930693069308, - "loss": 1.0123, + "grad_norm": 0.38671875, + "learning_rate": 0.00019877962351890993, + "loss": 0.9287, "step": 1170 }, { "epoch": 1.453308596165739, - "grad_norm": 0.54296875, - "learning_rate": 0.0001938943894389439, - "loss": 1.011, + "grad_norm": 0.369140625, + "learning_rate": 0.0001987457497785112, + "loss": 0.927, "step": 1175 }, { "epoch": 1.4594928880643168, - "grad_norm": 0.75, - "learning_rate": 0.00019471947194719472, - "loss": 0.9988, + "grad_norm": 0.369140625, + "learning_rate": 0.00019871141530411853, + "loss": 0.9161, "step": 1180 }, { "epoch": 1.4656771799628943, - "grad_norm": 0.60546875, - "learning_rate": 0.00019554455445544556, - "loss": 1.0111, + "grad_norm": 0.357421875, + "learning_rate": 0.00019867662025593194, + "loss": 0.9287, "step": 1185 }, { "epoch": 1.4718614718614718, - "grad_norm": 0.51953125, - "learning_rate": 0.00019636963696369636, - "loss": 1.0151, + "grad_norm": 0.369140625, + "learning_rate": 0.0001986413647963003, + "loss": 0.93, "step": 1190 }, { "epoch": 1.4780457637600495, - "grad_norm": 0.6171875, - "learning_rate": 0.0001971947194719472, - "loss": 1.0119, + "grad_norm": 0.33203125, + "learning_rate": 0.00019860564908972064, + "loss": 0.9287, "step": 1195 }, { "epoch": 1.4842300556586272, - "grad_norm": 0.71484375, - "learning_rate": 0.00019801980198019803, - "loss": 1.0085, + "grad_norm": 0.3828125, + "learning_rate": 0.00019856947330283752, + "loss": 0.9222, "step": 1200 }, { "epoch": 1.4904143475572047, - "grad_norm": 0.7421875, - "learning_rate": 0.00019884488448844884, - "loss": 1.0071, + "grad_norm": 0.359375, + "learning_rate": 0.0001985328376044422, + "loss": 0.9219, "step": 1205 }, { "epoch": 1.4965986394557822, - "grad_norm": 0.6171875, - "learning_rate": 0.00019966996699669968, - "loss": 1.0071, + "grad_norm": 0.349609375, + "learning_rate": 0.00019849574216547171, + "loss": 0.9223, "step": 1210 }, { "epoch": 1.50278293135436, - "grad_norm": 0.51953125, - "learning_rate": 0.0001999999626730957, - "loss": 1.0041, + "grad_norm": 0.333984375, + "learning_rate": 0.00019845818715900822, + "loss": 0.9213, "step": 1215 }, { "epoch": 1.5089672232529376, - "grad_norm": 0.73046875, - "learning_rate": 0.00019999973456433681, - "loss": 1.0051, + "grad_norm": 0.431640625, + "learning_rate": 0.00019842017276027832, + "loss": 0.921, "step": 1220 }, { "epoch": 1.5151515151515151, - "grad_norm": 0.68359375, - "learning_rate": 0.00019999929908446061, - "loss": 1.0011, + "grad_norm": 0.396484375, + "learning_rate": 0.00019838169914665178, + "loss": 0.9168, "step": 1225 }, { "epoch": 1.5213358070500926, - "grad_norm": 0.59375, - "learning_rate": 0.00019999865623437013, - "loss": 1.007, + "grad_norm": 0.416015625, + "learning_rate": 0.00019834276649764124, + "loss": 0.9215, "step": 1230 }, { "epoch": 1.5275200989486704, - "grad_norm": 0.73828125, - "learning_rate": 0.00019999780601539848, - "loss": 0.9903, + "grad_norm": 0.380859375, + "learning_rate": 0.000198303374994901, + "loss": 0.9073, "step": 1235 }, { "epoch": 1.533704390847248, - "grad_norm": 0.67578125, - "learning_rate": 0.00019999674842930876, - "loss": 1.0055, + "grad_norm": 0.333984375, + "learning_rate": 0.00019826352482222638, + "loss": 0.9226, "step": 1240 }, { "epoch": 1.5398886827458256, - "grad_norm": 0.859375, - "learning_rate": 0.0001999954834782941, - "loss": 0.9958, + "grad_norm": 0.3515625, + "learning_rate": 0.00019822321616555277, + "loss": 0.9128, "step": 1245 }, { "epoch": 1.546072974644403, - "grad_norm": 0.6328125, - "learning_rate": 0.00019999401116497763, - "loss": 1.0027, + "grad_norm": 0.361328125, + "learning_rate": 0.0001981824492129548, + "loss": 0.918, "step": 1250 }, { "epoch": 1.5522572665429808, - "grad_norm": 0.61328125, - "learning_rate": 0.00019999233149241253, - "loss": 1.0078, + "grad_norm": 0.38671875, + "learning_rate": 0.00019814122415464535, + "loss": 0.924, "step": 1255 }, { "epoch": 1.5584415584415585, - "grad_norm": 0.609375, - "learning_rate": 0.000199990444464082, - "loss": 1.0108, + "grad_norm": 0.365234375, + "learning_rate": 0.0001980995411829749, + "loss": 0.9257, "step": 1260 }, { "epoch": 1.564625850340136, - "grad_norm": 0.5390625, - "learning_rate": 0.0001999883500838992, - "loss": 1.0173, + "grad_norm": 0.3515625, + "learning_rate": 0.00019805740049243042, + "loss": 0.9307, "step": 1265 }, { "epoch": 1.5708101422387135, - "grad_norm": 0.546875, - "learning_rate": 0.00019998604835620717, - "loss": 1.0167, + "grad_norm": 0.333984375, + "learning_rate": 0.0001980148022796345, + "loss": 0.9338, "step": 1270 }, { "epoch": 1.5769944341372912, - "grad_norm": 0.59375, - "learning_rate": 0.00019998353928577919, - "loss": 1.0032, + "grad_norm": 0.3515625, + "learning_rate": 0.0001979717467433446, + "loss": 0.9194, "step": 1275 }, { "epoch": 1.583178726035869, - "grad_norm": 0.482421875, - "learning_rate": 0.00019998082287781826, - "loss": 1.0133, + "grad_norm": 0.34375, + "learning_rate": 0.00019792823408445174, + "loss": 0.9285, "step": 1280 }, { "epoch": 1.5893630179344465, - "grad_norm": 0.474609375, - "learning_rate": 0.00019997789913795747, - "loss": 0.9938, + "grad_norm": 0.359375, + "learning_rate": 0.00019788426450598006, + "loss": 0.9142, "step": 1285 }, { "epoch": 1.595547309833024, - "grad_norm": 0.53515625, - "learning_rate": 0.00019997476807225985, - "loss": 0.9967, + "grad_norm": 0.37109375, + "learning_rate": 0.0001978398382130855, + "loss": 0.9154, "step": 1290 }, { "epoch": 1.601731601731602, - "grad_norm": 0.6953125, - "learning_rate": 0.00019997142968721833, - "loss": 1.0054, + "grad_norm": 0.46484375, + "learning_rate": 0.00019779495541305498, + "loss": 0.9218, "step": 1295 }, { "epoch": 1.6079158936301794, - "grad_norm": 0.54296875, - "learning_rate": 0.00019996788398975578, - "loss": 1.0033, + "grad_norm": 0.365234375, + "learning_rate": 0.00019774961631530545, + "loss": 0.9219, "step": 1300 }, { "epoch": 1.614100185528757, - "grad_norm": 0.47265625, - "learning_rate": 0.00019996413098722493, - "loss": 0.9963, + "grad_norm": 0.365234375, + "learning_rate": 0.00019770382113138283, + "loss": 0.9121, "step": 1305 }, { "epoch": 1.6202844774273346, - "grad_norm": 0.6640625, - "learning_rate": 0.0001999601706874085, - "loss": 1.0035, + "grad_norm": 0.359375, + "learning_rate": 0.00019765757007496115, + "loss": 0.9229, "step": 1310 }, { "epoch": 1.6264687693259123, - "grad_norm": 0.482421875, - "learning_rate": 0.000199956003098519, - "loss": 0.9968, + "grad_norm": 0.328125, + "learning_rate": 0.0001976108633618414, + "loss": 0.9164, "step": 1315 }, { "epoch": 1.6326530612244898, - "grad_norm": 0.77734375, - "learning_rate": 0.00019995162822919883, - "loss": 0.9884, + "grad_norm": 0.4765625, + "learning_rate": 0.00019756370120995066, + "loss": 0.9068, "step": 1320 }, { "epoch": 1.6388373531230673, - "grad_norm": 0.9140625, - "learning_rate": 0.00019994704608852022, - "loss": 1.0013, + "grad_norm": 0.36328125, + "learning_rate": 0.00019751608383934097, + "loss": 0.9179, "step": 1325 }, { "epoch": 1.645021645021645, - "grad_norm": 0.7734375, - "learning_rate": 0.00019994225668598526, - "loss": 1.0153, + "grad_norm": 0.52734375, + "learning_rate": 0.00019746801147218842, + "loss": 0.9315, "step": 1330 }, { "epoch": 1.6512059369202228, - "grad_norm": 0.5625, - "learning_rate": 0.00019993726003152582, - "loss": 1.0002, + "grad_norm": 0.365234375, + "learning_rate": 0.00019741948433279197, + "loss": 0.9172, "step": 1335 }, { "epoch": 1.6573902288188003, - "grad_norm": 0.490234375, - "learning_rate": 0.0001999320561355035, - "loss": 1.0107, + "grad_norm": 0.376953125, + "learning_rate": 0.0001973705026475726, + "loss": 0.9281, "step": 1340 }, { "epoch": 1.6635745207173778, - "grad_norm": 0.486328125, - "learning_rate": 0.00019992664500870976, - "loss": 1.0183, + "grad_norm": 0.353515625, + "learning_rate": 0.00019732106664507203, + "loss": 0.9371, "step": 1345 }, { "epoch": 1.6697588126159555, - "grad_norm": 0.578125, - "learning_rate": 0.00019992102666236566, - "loss": 1.0067, + "grad_norm": 0.43359375, + "learning_rate": 0.0001972711765559518, + "loss": 0.9252, "step": 1350 }, { "epoch": 1.6759431045145332, - "grad_norm": 0.478515625, - "learning_rate": 0.00019991520110812215, - "loss": 1.0032, + "grad_norm": 0.34375, + "learning_rate": 0.00019722083261299216, + "loss": 0.9219, "step": 1355 }, { "epoch": 1.6821273964131107, - "grad_norm": 0.51171875, - "learning_rate": 0.00019990916835805974, - "loss": 0.994, + "grad_norm": 0.33203125, + "learning_rate": 0.00019717003505109095, + "loss": 0.9137, "step": 1360 }, { "epoch": 1.6883116883116882, - "grad_norm": 0.451171875, - "learning_rate": 0.00019990292842468868, - "loss": 0.9995, + "grad_norm": 0.330078125, + "learning_rate": 0.00019711878410726263, + "loss": 0.9163, "step": 1365 }, { "epoch": 1.694495980210266, - "grad_norm": 0.47265625, - "learning_rate": 0.00019989648132094873, - "loss": 0.9878, + "grad_norm": 0.361328125, + "learning_rate": 0.00019706708002063694, + "loss": 0.9065, "step": 1370 }, { "epoch": 1.7006802721088436, - "grad_norm": 0.66015625, - "learning_rate": 0.00019988982706020946, - "loss": 1.002, + "grad_norm": 0.3828125, + "learning_rate": 0.00019701492303245802, + "loss": 0.9216, "step": 1375 }, { "epoch": 1.7068645640074211, - "grad_norm": 0.65234375, - "learning_rate": 0.00019988296565626987, - "loss": 0.9913, + "grad_norm": 0.431640625, + "learning_rate": 0.00019696231338608316, + "loss": 0.9122, "step": 1380 }, { "epoch": 1.7130488559059986, - "grad_norm": 0.55078125, - "learning_rate": 0.00019987589712335856, - "loss": 1.0095, + "grad_norm": 0.421875, + "learning_rate": 0.00019690925132698165, + "loss": 0.9267, "step": 1385 }, { "epoch": 1.7192331478045764, - "grad_norm": 0.65625, - "learning_rate": 0.0001998686214761337, - "loss": 0.9962, + "grad_norm": 0.49609375, + "learning_rate": 0.00019685573710273376, + "loss": 0.9166, "step": 1390 }, { "epoch": 1.725417439703154, - "grad_norm": 0.6875, - "learning_rate": 0.0001998611387296829, - "loss": 0.9979, + "grad_norm": 0.447265625, + "learning_rate": 0.0001968017709630294, + "loss": 0.9188, "step": 1395 }, { "epoch": 1.7316017316017316, - "grad_norm": 0.55859375, - "learning_rate": 0.00019985344889952327, - "loss": 1.0057, + "grad_norm": 0.38671875, + "learning_rate": 0.0001967473531596671, + "loss": 0.923, "step": 1400 }, { "epoch": 1.737786023500309, - "grad_norm": 0.55078125, - "learning_rate": 0.00019984555200160128, - "loss": 0.9932, + "grad_norm": 0.345703125, + "learning_rate": 0.00019669248394655283, + "loss": 0.9151, "step": 1405 }, { "epoch": 1.7439703153988868, - "grad_norm": 0.546875, - "learning_rate": 0.00019983744805229296, - "loss": 0.9973, + "grad_norm": 0.345703125, + "learning_rate": 0.00019663716357969874, + "loss": 0.9187, "step": 1410 }, { "epoch": 1.7501546072974645, - "grad_norm": 0.59375, - "learning_rate": 0.00019982913706840353, - "loss": 0.9919, + "grad_norm": 0.392578125, + "learning_rate": 0.00019658139231722198, + "loss": 0.9111, "step": 1415 }, { "epoch": 1.756338899196042, - "grad_norm": 0.443359375, - "learning_rate": 0.00019982061906716764, - "loss": 0.9875, + "grad_norm": 0.353515625, + "learning_rate": 0.00019652517041934356, + "loss": 0.9076, "step": 1420 }, { "epoch": 1.7625231910946195, - "grad_norm": 0.5234375, - "learning_rate": 0.00019981189406624922, - "loss": 1.0013, + "grad_norm": 0.322265625, + "learning_rate": 0.00019646849814838706, + "loss": 0.9199, "step": 1425 }, { "epoch": 1.7687074829931972, - "grad_norm": 0.59765625, - "learning_rate": 0.00019980296208374143, - "loss": 0.9961, + "grad_norm": 0.46875, + "learning_rate": 0.00019641137576877744, + "loss": 0.9179, "step": 1430 }, { "epoch": 1.774891774891775, - "grad_norm": 0.5, - "learning_rate": 0.00019979382313816668, - "loss": 0.9928, + "grad_norm": 0.337890625, + "learning_rate": 0.0001963538035470398, + "loss": 0.9118, "step": 1435 }, { "epoch": 1.7810760667903525, - "grad_norm": 0.462890625, - "learning_rate": 0.00019978447724847652, - "loss": 0.9939, + "grad_norm": 0.359375, + "learning_rate": 0.0001962957817517982, + "loss": 0.917, "step": 1440 }, { "epoch": 1.78726035868893, - "grad_norm": 0.5390625, - "learning_rate": 0.00019977492443405174, - "loss": 0.987, + "grad_norm": 0.306640625, + "learning_rate": 0.00019623731065377426, + "loss": 0.9092, "step": 1445 }, { "epoch": 1.7934446505875077, - "grad_norm": 0.486328125, - "learning_rate": 0.00019976516471470216, - "loss": 1.0033, + "grad_norm": 0.349609375, + "learning_rate": 0.00019617839052578603, + "loss": 0.924, "step": 1450 }, { "epoch": 1.7996289424860854, - "grad_norm": 0.5625, - "learning_rate": 0.00019975519811066663, - "loss": 0.9884, + "grad_norm": 0.3515625, + "learning_rate": 0.0001961190216427467, + "loss": 0.9108, "step": 1455 }, { "epoch": 1.805813234384663, - "grad_norm": 0.66796875, - "learning_rate": 0.0001997450246426131, - "loss": 0.9823, + "grad_norm": 0.404296875, + "learning_rate": 0.00019605920428166323, + "loss": 0.9035, "step": 1460 }, { "epoch": 1.8119975262832406, - "grad_norm": 0.5546875, - "learning_rate": 0.00019973464433163844, - "loss": 0.9838, + "grad_norm": 0.341796875, + "learning_rate": 0.00019599893872163514, + "loss": 0.906, "step": 1465 }, { "epoch": 1.8181818181818183, - "grad_norm": 0.66796875, - "learning_rate": 0.0001997240571992685, - "loss": 0.993, + "grad_norm": 0.33203125, + "learning_rate": 0.00019593822524385316, + "loss": 0.9139, "step": 1470 }, { "epoch": 1.8243661100803958, - "grad_norm": 0.5625, - "learning_rate": 0.00019971326326745793, - "loss": 0.9835, + "grad_norm": 0.337890625, + "learning_rate": 0.00019587706413159804, + "loss": 0.9043, "step": 1475 }, { "epoch": 1.8305504019789733, - "grad_norm": 0.44140625, - "learning_rate": 0.00019970226255859038, - "loss": 0.99, + "grad_norm": 0.4140625, + "learning_rate": 0.000195815455670239, + "loss": 0.9131, "step": 1480 }, { "epoch": 1.836734693877551, - "grad_norm": 0.59375, - "learning_rate": 0.00019969105509547812, - "loss": 0.9923, + "grad_norm": 0.365234375, + "learning_rate": 0.00019575340014723263, + "loss": 0.914, "step": 1485 }, { "epoch": 1.8429189857761288, - "grad_norm": 0.55859375, - "learning_rate": 0.0001996796409013623, - "loss": 0.998, + "grad_norm": 0.40234375, + "learning_rate": 0.0001956908978521214, + "loss": 0.9206, "step": 1490 }, { "epoch": 1.8491032776747063, - "grad_norm": 0.62890625, - "learning_rate": 0.0001996680199999127, - "loss": 0.9902, + "grad_norm": 0.380859375, + "learning_rate": 0.00019562794907653235, + "loss": 0.9122, "step": 1495 }, { "epoch": 1.8552875695732838, - "grad_norm": 0.6875, - "learning_rate": 0.0001996561924152278, - "loss": 0.9845, + "grad_norm": 0.328125, + "learning_rate": 0.00019556455411417573, + "loss": 0.9059, "step": 1500 }, { "epoch": 1.8614718614718615, - "grad_norm": 0.703125, - "learning_rate": 0.0001996441581718347, - "loss": 0.9722, + "grad_norm": 0.3515625, + "learning_rate": 0.00019550071326084368, + "loss": 0.8951, "step": 1505 }, { "epoch": 1.8676561533704392, - "grad_norm": 0.53515625, - "learning_rate": 0.00019963191729468888, - "loss": 0.9983, - "step": 1510 + "grad_norm": 0.3828125, + "learning_rate": 0.0001954364268144088, + "loss": 0.9186, + "step": 1510 }, { "epoch": 1.8738404452690167, - "grad_norm": 0.5, - "learning_rate": 0.00019961946980917456, - "loss": 0.9817, + "grad_norm": 0.310546875, + "learning_rate": 0.0001953716950748227, + "loss": 0.9037, "step": 1515 }, { "epoch": 1.8800247371675942, - "grad_norm": 0.59375, - "learning_rate": 0.00019960681574110426, - "loss": 0.9803, + "grad_norm": 0.326171875, + "learning_rate": 0.00019530651834411474, + "loss": 0.9029, "step": 1520 }, { "epoch": 1.886209029066172, - "grad_norm": 0.498046875, - "learning_rate": 0.00019959395511671898, - "loss": 0.9839, + "grad_norm": 0.310546875, + "learning_rate": 0.00019524089692639053, + "loss": 0.9071, "step": 1525 }, { "epoch": 1.8923933209647497, - "grad_norm": 0.55078125, - "learning_rate": 0.00019958088796268793, - "loss": 0.9842, + "grad_norm": 0.345703125, + "learning_rate": 0.00019517483112783054, + "loss": 0.9062, "step": 1530 }, { "epoch": 1.8985776128633272, - "grad_norm": 0.546875, - "learning_rate": 0.00019956761430610874, - "loss": 0.9782, + "grad_norm": 0.33984375, + "learning_rate": 0.00019510832125668853, + "loss": 0.9028, "step": 1535 }, { "epoch": 1.9047619047619047, - "grad_norm": 0.462890625, - "learning_rate": 0.0001995541341745072, - "loss": 0.9888, + "grad_norm": 0.33984375, + "learning_rate": 0.00019504136762329047, + "loss": 0.9107, "step": 1540 }, { "epoch": 1.9109461966604824, - "grad_norm": 0.451171875, - "learning_rate": 0.0001995404475958373, - "loss": 0.9859, + "grad_norm": 0.421875, + "learning_rate": 0.00019497397054003265, + "loss": 0.9097, "step": 1545 }, { "epoch": 1.91713048855906, - "grad_norm": 0.5078125, - "learning_rate": 0.0001995265545984811, - "loss": 0.9986, + "grad_norm": 0.400390625, + "learning_rate": 0.00019490613032138062, + "loss": 0.9222, "step": 1550 }, { "epoch": 1.9233147804576376, - "grad_norm": 0.51953125, - "learning_rate": 0.00019951245521124874, - "loss": 0.9737, + "grad_norm": 0.3359375, + "learning_rate": 0.00019483784728386737, + "loss": 0.8979, "step": 1555 }, { "epoch": 1.929499072356215, - "grad_norm": 0.47265625, - "learning_rate": 0.00019949814946337838, - "loss": 0.977, + "grad_norm": 0.369140625, + "learning_rate": 0.0001947691217460921, + "loss": 0.9016, "step": 1560 }, { "epoch": 1.9356833642547928, - "grad_norm": 0.6015625, - "learning_rate": 0.00019948363738453607, - "loss": 0.9834, + "grad_norm": 0.349609375, + "learning_rate": 0.0001946999540287187, + "loss": 0.9059, "step": 1565 }, { "epoch": 1.9418676561533705, - "grad_norm": 0.6015625, - "learning_rate": 0.00019946891900481578, - "loss": 0.9903, + "grad_norm": 0.365234375, + "learning_rate": 0.0001946303444544741, + "loss": 0.9139, "step": 1570 }, { "epoch": 1.948051948051948, - "grad_norm": 0.484375, - "learning_rate": 0.00019945399435473922, - "loss": 0.9813, + "grad_norm": 0.34375, + "learning_rate": 0.000194560293348147, + "loss": 0.9062, "step": 1575 }, { "epoch": 1.9542362399505255, - "grad_norm": 0.5234375, - "learning_rate": 0.0001994388634652559, - "loss": 0.9774, + "grad_norm": 0.44921875, + "learning_rate": 0.00019448980103658613, + "loss": 0.9007, "step": 1580 }, { "epoch": 1.9604205318491033, - "grad_norm": 0.625, - "learning_rate": 0.00019942352636774296, - "loss": 0.9785, + "grad_norm": 0.314453125, + "learning_rate": 0.00019441886784869885, + "loss": 0.9022, "step": 1585 }, { "epoch": 1.966604823747681, - "grad_norm": 0.80859375, - "learning_rate": 0.00019940798309400526, - "loss": 0.9799, + "grad_norm": 0.427734375, + "learning_rate": 0.00019434749411544958, + "loss": 0.9035, "step": 1590 }, { "epoch": 1.9727891156462585, - "grad_norm": 0.64453125, - "learning_rate": 0.0001993922336762751, - "loss": 0.969, + "grad_norm": 0.4765625, + "learning_rate": 0.00019427568016985828, + "loss": 0.8953, "step": 1595 }, { "epoch": 1.978973407544836, - "grad_norm": 0.6484375, - "learning_rate": 0.00019937627814721237, - "loss": 0.9758, + "grad_norm": 0.515625, + "learning_rate": 0.0001942034263469989, + "loss": 0.901, "step": 1600 }, { "epoch": 1.9851576994434137, - "grad_norm": 0.49609375, - "learning_rate": 0.00019936011653990426, - "loss": 0.9819, + "grad_norm": 0.41015625, + "learning_rate": 0.00019413073298399778, + "loss": 0.9065, "step": 1605 }, { "epoch": 1.9913419913419914, - "grad_norm": 0.447265625, - "learning_rate": 0.00019934374888786537, - "loss": 0.9826, + "grad_norm": 0.3515625, + "learning_rate": 0.00019405760042003203, + "loss": 0.9068, "step": 1610 }, { "epoch": 1.997526283240569, - "grad_norm": 0.515625, - "learning_rate": 0.0001993271752250376, - "loss": 0.9661, + "grad_norm": 0.39453125, + "learning_rate": 0.00019398402899632812, + "loss": 0.8916, "step": 1615 }, { "epoch": 2.0, - "eval_loss": 2.4507155418395996, - "eval_runtime": 0.5376, - "eval_samples_per_second": 18.6, - "eval_steps_per_second": 1.86, + "eval_loss": 2.4784626960754395, + "eval_runtime": 0.5361, + "eval_samples_per_second": 18.652, + "eval_steps_per_second": 1.865, "step": 1617 }, { "epoch": 2.0037105751391464, - "grad_norm": 0.44921875, - "learning_rate": 0.00019931039558578997, - "loss": 0.9843, + "grad_norm": 0.361328125, + "learning_rate": 0.0001939100190561601, + "loss": 0.9017, "step": 1620 }, { "epoch": 2.0098948670377244, - "grad_norm": 0.55859375, - "learning_rate": 0.00019929341000491876, - "loss": 0.9676, + "grad_norm": 0.3359375, + "learning_rate": 0.00019383557094484807, + "loss": 0.8759, "step": 1625 }, { "epoch": 2.016079158936302, - "grad_norm": 0.458984375, - "learning_rate": 0.00019927621851764725, - "loss": 0.9727, + "grad_norm": 0.32421875, + "learning_rate": 0.00019376068500975667, + "loss": 0.8808, "step": 1630 }, { "epoch": 2.0222634508348794, - "grad_norm": 0.46484375, - "learning_rate": 0.00019925882115962568, - "loss": 0.9764, + "grad_norm": 0.3203125, + "learning_rate": 0.00019368536160029327, + "loss": 0.8838, "step": 1635 }, { "epoch": 2.028447742733457, - "grad_norm": 0.49609375, - "learning_rate": 0.00019924121796693127, - "loss": 0.9708, + "grad_norm": 0.357421875, + "learning_rate": 0.00019360960106790643, + "loss": 0.8802, "step": 1640 }, { "epoch": 2.034632034632035, - "grad_norm": 0.61328125, - "learning_rate": 0.00019922340897606805, - "loss": 0.9728, + "grad_norm": 0.349609375, + "learning_rate": 0.0001935334037660844, + "loss": 0.8811, "step": 1645 }, { "epoch": 2.0408163265306123, - "grad_norm": 0.486328125, - "learning_rate": 0.0001992053942239668, - "loss": 0.9724, + "grad_norm": 0.369140625, + "learning_rate": 0.00019345677005035315, + "loss": 0.8814, "step": 1650 }, { "epoch": 2.04700061842919, - "grad_norm": 0.65625, - "learning_rate": 0.00019918717374798502, - "loss": 0.9669, + "grad_norm": 0.427734375, + "learning_rate": 0.00019337970027827504, + "loss": 0.8767, "step": 1655 }, { "epoch": 2.0531849103277673, - "grad_norm": 0.61328125, - "learning_rate": 0.00019916874758590684, - "loss": 0.9634, + "grad_norm": 0.44140625, + "learning_rate": 0.00019330219480944694, + "loss": 0.875, "step": 1660 }, { "epoch": 2.0593692022263452, - "grad_norm": 0.546875, - "learning_rate": 0.00019915011577594286, - "loss": 0.9761, + "grad_norm": 0.400390625, + "learning_rate": 0.0001932242540054986, + "loss": 0.8844, "step": 1665 }, { "epoch": 2.0655534941249227, - "grad_norm": 0.546875, - "learning_rate": 0.00019913127835673023, - "loss": 0.9667, + "grad_norm": 0.333984375, + "learning_rate": 0.00019314587823009103, + "loss": 0.8775, "step": 1670 }, { "epoch": 2.0717377860235002, - "grad_norm": 0.6484375, - "learning_rate": 0.00019911223536733235, - "loss": 0.9747, + "grad_norm": 0.345703125, + "learning_rate": 0.00019306706784891477, + "loss": 0.8856, "step": 1675 }, { "epoch": 2.0779220779220777, - "grad_norm": 0.69140625, - "learning_rate": 0.00019909298684723904, - "loss": 0.9682, + "grad_norm": 0.361328125, + "learning_rate": 0.00019298782322968815, + "loss": 0.8767, "step": 1680 }, { "epoch": 2.0841063698206557, - "grad_norm": 0.69921875, - "learning_rate": 0.00019907353283636628, - "loss": 0.9606, + "grad_norm": 0.388671875, + "learning_rate": 0.00019290814474215556, + "loss": 0.8707, "step": 1685 }, { "epoch": 2.090290661719233, - "grad_norm": 0.51953125, - "learning_rate": 0.00019905387337505612, - "loss": 0.9635, + "grad_norm": 0.380859375, + "learning_rate": 0.0001928280327580858, + "loss": 0.8751, "step": 1690 }, { "epoch": 2.0964749536178107, - "grad_norm": 0.5234375, - "learning_rate": 0.00019903400850407676, - "loss": 0.9612, + "grad_norm": 0.357421875, + "learning_rate": 0.00019274748765127028, + "loss": 0.8709, "step": 1695 }, { "epoch": 2.102659245516388, - "grad_norm": 0.5078125, - "learning_rate": 0.0001990139382646223, - "loss": 0.9682, + "grad_norm": 0.337890625, + "learning_rate": 0.00019266650979752136, + "loss": 0.8805, "step": 1700 }, { "epoch": 2.108843537414966, - "grad_norm": 0.62109375, - "learning_rate": 0.00019899366269831274, - "loss": 0.9599, + "grad_norm": 0.4453125, + "learning_rate": 0.00019258509957467042, + "loss": 0.8732, "step": 1705 }, { "epoch": 2.1150278293135436, - "grad_norm": 0.45703125, - "learning_rate": 0.00019897318184719385, - "loss": 0.9769, + "grad_norm": 0.439453125, + "learning_rate": 0.00019250325736256633, + "loss": 0.8902, "step": 1710 }, { "epoch": 2.121212121212121, - "grad_norm": 0.59765625, - "learning_rate": 0.00019895249575373712, - "loss": 0.9683, + "grad_norm": 0.32421875, + "learning_rate": 0.00019242098354307354, + "loss": 0.8804, "step": 1715 }, { "epoch": 2.1273964131106986, - "grad_norm": 0.72265625, - "learning_rate": 0.00019893160446083963, - "loss": 0.9577, + "grad_norm": 0.34765625, + "learning_rate": 0.00019233827850007027, + "loss": 0.8706, "step": 1720 }, { "epoch": 2.1335807050092765, - "grad_norm": 0.5703125, - "learning_rate": 0.000198910508011824, - "loss": 0.9543, + "grad_norm": 0.357421875, + "learning_rate": 0.00019225514261944678, + "loss": 0.8682, "step": 1725 }, { "epoch": 2.139764996907854, - "grad_norm": 0.42578125, - "learning_rate": 0.00019888920645043831, - "loss": 0.9643, + "grad_norm": 0.345703125, + "learning_rate": 0.0001921715762891036, + "loss": 0.8767, "step": 1730 }, { "epoch": 2.1459492888064315, - "grad_norm": 0.55078125, - "learning_rate": 0.00019886769982085597, - "loss": 0.9651, + "grad_norm": 0.38671875, + "learning_rate": 0.00019208757989894965, + "loss": 0.8747, "step": 1735 }, { "epoch": 2.1521335807050095, - "grad_norm": 0.48046875, - "learning_rate": 0.00019884598816767563, - "loss": 0.9451, + "grad_norm": 0.384765625, + "learning_rate": 0.00019200315384090044, + "loss": 0.8601, "step": 1740 }, { "epoch": 2.158317872603587, - "grad_norm": 0.53515625, - "learning_rate": 0.00019882407153592107, - "loss": 0.9691, + "grad_norm": 0.43359375, + "learning_rate": 0.0001919182985088763, + "loss": 0.8821, "step": 1745 }, { "epoch": 2.1645021645021645, - "grad_norm": 0.5, - "learning_rate": 0.00019880194997104123, - "loss": 0.9687, + "grad_norm": 0.390625, + "learning_rate": 0.00019183301429880043, + "loss": 0.8812, "step": 1750 }, { "epoch": 2.170686456400742, - "grad_norm": 0.75390625, - "learning_rate": 0.00019877962351890993, - "loss": 0.9574, + "grad_norm": 0.330078125, + "learning_rate": 0.00019174730160859715, + "loss": 0.8707, "step": 1755 }, { "epoch": 2.17687074829932, - "grad_norm": 0.6328125, - "learning_rate": 0.00019875709222582594, - "loss": 0.9632, + "grad_norm": 0.306640625, + "learning_rate": 0.00019166116083819002, + "loss": 0.8756, "step": 1760 }, { "epoch": 2.1830550401978974, - "grad_norm": 0.478515625, - "learning_rate": 0.00019873435613851275, - "loss": 0.9643, + "grad_norm": 0.31640625, + "learning_rate": 0.00019157459238949991, + "loss": 0.8796, "step": 1765 }, { "epoch": 2.189239332096475, - "grad_norm": 0.435546875, - "learning_rate": 0.00019871141530411853, - "loss": 0.9664, + "grad_norm": 0.345703125, + "learning_rate": 0.00019148759666644325, + "loss": 0.8795, "step": 1770 }, { "epoch": 2.1954236239950524, - "grad_norm": 0.431640625, - "learning_rate": 0.00019868826977021615, - "loss": 0.9668, + "grad_norm": 0.35546875, + "learning_rate": 0.00019140017407493, + "loss": 0.8804, "step": 1775 }, { "epoch": 2.2016079158936304, - "grad_norm": 0.54296875, - "learning_rate": 0.00019866491958480284, - "loss": 0.9619, + "grad_norm": 0.345703125, + "learning_rate": 0.00019131232502286188, + "loss": 0.8748, "step": 1780 }, { "epoch": 2.207792207792208, - "grad_norm": 0.55859375, - "learning_rate": 0.0001986413647963003, - "loss": 0.9564, + "grad_norm": 0.431640625, + "learning_rate": 0.00019122404992013043, + "loss": 0.8694, "step": 1785 }, { "epoch": 2.2139764996907854, - "grad_norm": 0.478515625, - "learning_rate": 0.00019861760545355442, - "loss": 0.9649, + "grad_norm": 0.30859375, + "learning_rate": 0.00019113534917861502, + "loss": 0.8806, "step": 1790 }, { "epoch": 2.220160791589363, - "grad_norm": 0.458984375, - "learning_rate": 0.00019859364160583544, - "loss": 0.9453, + "grad_norm": 0.361328125, + "learning_rate": 0.00019104622321218105, + "loss": 0.8618, "step": 1795 }, { "epoch": 2.226345083487941, - "grad_norm": 0.578125, - "learning_rate": 0.00019856947330283752, - "loss": 0.9669, + "grad_norm": 0.390625, + "learning_rate": 0.0001909566724366779, + "loss": 0.88, "step": 1800 }, { "epoch": 2.2325293753865183, - "grad_norm": 0.43359375, - "learning_rate": 0.0001985451005946789, - "loss": 0.9721, + "grad_norm": 0.3828125, + "learning_rate": 0.0001908666972699371, + "loss": 0.887, "step": 1805 }, { "epoch": 2.238713667285096, - "grad_norm": 0.478515625, - "learning_rate": 0.00019852052353190166, - "loss": 0.9657, + "grad_norm": 0.36328125, + "learning_rate": 0.00019077629813177036, + "loss": 0.8812, "step": 1810 }, { "epoch": 2.2448979591836733, - "grad_norm": 0.53515625, - "learning_rate": 0.00019849574216547171, - "loss": 0.9648, + "grad_norm": 0.337890625, + "learning_rate": 0.00019068547544396754, + "loss": 0.8777, "step": 1815 }, { "epoch": 2.2510822510822512, - "grad_norm": 0.4765625, - "learning_rate": 0.0001984707565467785, - "loss": 0.9544, + "grad_norm": 0.32421875, + "learning_rate": 0.00019059422963029464, + "loss": 0.8711, "step": 1820 }, { "epoch": 2.2572665429808287, - "grad_norm": 0.427734375, - "learning_rate": 0.0001984455667276352, - "loss": 0.9737, + "grad_norm": 0.353515625, + "learning_rate": 0.00019050256111649206, + "loss": 0.8878, "step": 1825 }, { "epoch": 2.2634508348794062, - "grad_norm": 0.50390625, - "learning_rate": 0.00019842017276027832, - "loss": 0.9538, + "grad_norm": 0.33203125, + "learning_rate": 0.00019041047033027236, + "loss": 0.8706, "step": 1830 }, { "epoch": 2.2696351267779837, - "grad_norm": 0.47265625, - "learning_rate": 0.00019839457469736775, - "loss": 0.9745, + "grad_norm": 0.333984375, + "learning_rate": 0.0001903179577013184, + "loss": 0.8893, "step": 1835 }, { "epoch": 2.2758194186765617, - "grad_norm": 0.486328125, - "learning_rate": 0.00019836877259198662, - "loss": 0.958, + "grad_norm": 0.337890625, + "learning_rate": 0.00019022502366128135, + "loss": 0.8747, "step": 1840 }, { "epoch": 2.282003710575139, - "grad_norm": 0.51953125, - "learning_rate": 0.00019834276649764124, - "loss": 0.9637, + "grad_norm": 0.33203125, + "learning_rate": 0.00019013166864377851, + "loss": 0.8791, "step": 1845 }, { "epoch": 2.2881880024737167, - "grad_norm": 0.490234375, - "learning_rate": 0.0001983165564682608, - "loss": 0.9706, + "grad_norm": 0.3359375, + "learning_rate": 0.00019003789308439148, + "loss": 0.8871, "step": 1850 }, { "epoch": 2.2943722943722946, - "grad_norm": 0.52734375, - "learning_rate": 0.00019829014255819753, - "loss": 0.9671, + "grad_norm": 0.33203125, + "learning_rate": 0.00018994369742066403, + "loss": 0.8811, "step": 1855 }, { "epoch": 2.300556586270872, - "grad_norm": 0.61328125, - "learning_rate": 0.00019826352482222638, - "loss": 0.9571, + "grad_norm": 0.375, + "learning_rate": 0.0001898490820921001, + "loss": 0.8721, "step": 1860 }, { "epoch": 2.3067408781694496, - "grad_norm": 0.49609375, - "learning_rate": 0.000198236703315545, - "loss": 0.9581, + "grad_norm": 0.3359375, + "learning_rate": 0.00018975404754016165, + "loss": 0.8729, "step": 1865 }, { "epoch": 2.312925170068027, - "grad_norm": 0.57421875, - "learning_rate": 0.00019820967809377357, - "loss": 0.9676, + "grad_norm": 0.392578125, + "learning_rate": 0.00018965859420826684, + "loss": 0.8826, "step": 1870 }, { "epoch": 2.3191094619666046, - "grad_norm": 0.546875, - "learning_rate": 0.0001981824492129548, - "loss": 0.9489, + "grad_norm": 0.369140625, + "learning_rate": 0.00018956272254178763, + "loss": 0.8646, "step": 1875 }, { "epoch": 2.3252937538651826, - "grad_norm": 0.6640625, - "learning_rate": 0.00019815501672955358, - "loss": 0.9609, + "grad_norm": 0.375, + "learning_rate": 0.00018946643298804793, + "loss": 0.8784, "step": 1880 }, { "epoch": 2.33147804576376, - "grad_norm": 0.4921875, - "learning_rate": 0.0001981273807004572, - "loss": 0.9569, + "grad_norm": 0.396484375, + "learning_rate": 0.00018936972599632151, + "loss": 0.8744, "step": 1885 }, { "epoch": 2.3376623376623376, - "grad_norm": 0.5, - "learning_rate": 0.0001980995411829749, - "loss": 0.959, + "grad_norm": 0.39453125, + "learning_rate": 0.00018927260201782978, + "loss": 0.878, "step": 1890 }, { "epoch": 2.3438466295609155, - "grad_norm": 0.458984375, - "learning_rate": 0.00019807149823483798, - "loss": 0.9614, + "grad_norm": 0.349609375, + "learning_rate": 0.00018917506150573977, + "loss": 0.8791, "step": 1895 }, { "epoch": 2.350030921459493, - "grad_norm": 0.50390625, - "learning_rate": 0.00019804325191419956, - "loss": 0.9687, + "grad_norm": 0.380859375, + "learning_rate": 0.00018907710491516199, + "loss": 0.8831, "step": 1900 }, { "epoch": 2.3562152133580705, - "grad_norm": 0.43359375, - "learning_rate": 0.0001980148022796345, - "loss": 0.9598, + "grad_norm": 0.34375, + "learning_rate": 0.0001889787327031483, + "loss": 0.8786, "step": 1905 }, { "epoch": 2.362399505256648, - "grad_norm": 0.5390625, - "learning_rate": 0.00019798614939013932, - "loss": 0.9572, + "grad_norm": 0.486328125, + "learning_rate": 0.0001888799453286899, + "loss": 0.8768, "step": 1910 }, { "epoch": 2.3685837971552255, - "grad_norm": 0.50390625, - "learning_rate": 0.00019795729330513196, - "loss": 0.9586, + "grad_norm": 0.4453125, + "learning_rate": 0.00018878074325271498, + "loss": 0.8767, "step": 1915 }, { "epoch": 2.3747680890538034, - "grad_norm": 0.515625, - "learning_rate": 0.00019792823408445174, - "loss": 0.9532, + "grad_norm": 0.482421875, + "learning_rate": 0.00018868112693808665, + "loss": 0.8725, "step": 1920 }, { "epoch": 2.380952380952381, - "grad_norm": 0.6171875, - "learning_rate": 0.00019789897178835926, - "loss": 0.9444, + "grad_norm": 0.333984375, + "learning_rate": 0.00018858109684960082, + "loss": 0.8594, "step": 1925 }, { "epoch": 2.3871366728509584, - "grad_norm": 0.50390625, - "learning_rate": 0.0001978695064775363, - "loss": 0.9584, + "grad_norm": 0.34765625, + "learning_rate": 0.0001884806534539841, + "loss": 0.8748, "step": 1930 }, { "epoch": 2.3933209647495364, - "grad_norm": 0.5078125, - "learning_rate": 0.0001978398382130855, - "loss": 0.9628, + "grad_norm": 0.32421875, + "learning_rate": 0.0001883797972198914, + "loss": 0.8782, "step": 1935 }, { "epoch": 2.399505256648114, - "grad_norm": 0.6015625, - "learning_rate": 0.00019780996705653044, - "loss": 0.9545, + "grad_norm": 0.330078125, + "learning_rate": 0.00018827852861790398, + "loss": 0.8716, "step": 1940 }, { "epoch": 2.4056895485466914, - "grad_norm": 0.431640625, - "learning_rate": 0.00019777989306981542, - "loss": 0.9496, + "grad_norm": 0.3359375, + "learning_rate": 0.00018817684812052712, + "loss": 0.8684, "step": 1945 }, { "epoch": 2.411873840445269, - "grad_norm": 0.59765625, - "learning_rate": 0.00019774961631530545, - "loss": 0.9541, + "grad_norm": 0.333984375, + "learning_rate": 0.00018807475620218788, + "loss": 0.8726, "step": 1950 }, { "epoch": 2.418058132343847, - "grad_norm": 0.75, - "learning_rate": 0.00019771913685578585, - "loss": 0.9581, + "grad_norm": 0.427734375, + "learning_rate": 0.0001879722533392331, + "loss": 0.8738, "step": 1955 }, { "epoch": 2.4242424242424243, - "grad_norm": 0.74609375, - "learning_rate": 0.0001976884547544624, - "loss": 0.9535, + "grad_norm": 0.412109375, + "learning_rate": 0.00018786934000992688, + "loss": 0.872, "step": 1960 }, { "epoch": 2.430426716141002, - "grad_norm": 0.640625, - "learning_rate": 0.00019765757007496115, - "loss": 0.9595, + "grad_norm": 0.384765625, + "learning_rate": 0.0001877660166944486, + "loss": 0.8765, "step": 1965 }, { "epoch": 2.4366110080395793, - "grad_norm": 0.54296875, - "learning_rate": 0.0001976264828813281, - "loss": 0.9693, + "grad_norm": 0.33203125, + "learning_rate": 0.00018766228387489048, + "loss": 0.8849, "step": 1970 }, { "epoch": 2.4427952999381572, - "grad_norm": 0.62890625, - "learning_rate": 0.00019759519323802932, - "loss": 0.9582, + "grad_norm": 0.404296875, + "learning_rate": 0.0001875581420352556, + "loss": 0.8766, "step": 1975 }, { "epoch": 2.4489795918367347, - "grad_norm": 0.462890625, - "learning_rate": 0.00019756370120995066, - "loss": 0.9525, + "grad_norm": 0.341796875, + "learning_rate": 0.00018745359166145523, + "loss": 0.8724, "step": 1980 }, { "epoch": 2.4551638837353122, - "grad_norm": 0.474609375, - "learning_rate": 0.00019753200686239763, - "loss": 0.95, + "grad_norm": 0.376953125, + "learning_rate": 0.00018734863324130702, + "loss": 0.8675, "step": 1985 }, { "epoch": 2.4613481756338897, - "grad_norm": 0.62890625, - "learning_rate": 0.0001975001102610954, - "loss": 0.9582, + "grad_norm": 0.408203125, + "learning_rate": 0.00018724326726453244, + "loss": 0.8771, "step": 1990 }, { "epoch": 2.4675324675324677, - "grad_norm": 0.54296875, - "learning_rate": 0.00019746801147218842, - "loss": 0.9541, + "grad_norm": 0.4140625, + "learning_rate": 0.00018713749422275447, + "loss": 0.8745, "step": 1995 }, { "epoch": 2.473716759431045, - "grad_norm": 0.58984375, - "learning_rate": 0.0001974357105622405, - "loss": 0.9529, + "grad_norm": 0.3984375, + "learning_rate": 0.00018703131460949554, + "loss": 0.8707, "step": 2000 }, { "epoch": 2.4799010513296227, - "grad_norm": 0.5078125, - "learning_rate": 0.00019740320759823458, - "loss": 0.9606, + "grad_norm": 0.376953125, + "learning_rate": 0.000186924728920175, + "loss": 0.8764, "step": 2005 }, { "epoch": 2.4860853432282, - "grad_norm": 0.46875, - "learning_rate": 0.0001973705026475726, - "loss": 0.9632, + "grad_norm": 0.349609375, + "learning_rate": 0.0001868177376521069, + "loss": 0.8817, "step": 2010 }, { "epoch": 2.492269635126778, - "grad_norm": 0.56640625, - "learning_rate": 0.00019733759577807538, - "loss": 0.9567, + "grad_norm": 0.404296875, + "learning_rate": 0.0001867103413044977, + "loss": 0.8771, "step": 2015 }, { "epoch": 2.4984539270253556, - "grad_norm": 0.439453125, - "learning_rate": 0.00019730448705798239, - "loss": 0.9492, + "grad_norm": 0.314453125, + "learning_rate": 0.00018660254037844388, + "loss": 0.8693, "step": 2020 }, { "epoch": 2.504638218923933, - "grad_norm": 0.498046875, - "learning_rate": 0.0001972711765559518, - "loss": 0.96, + "grad_norm": 0.396484375, + "learning_rate": 0.00018649433537692964, + "loss": 0.8803, "step": 2025 }, { "epoch": 2.5108225108225106, - "grad_norm": 0.64453125, - "learning_rate": 0.0001972376643410601, - "loss": 0.9524, + "grad_norm": 0.392578125, + "learning_rate": 0.00018638572680482448, + "loss": 0.8728, "step": 2030 }, { "epoch": 2.5170068027210886, - "grad_norm": 0.470703125, - "learning_rate": 0.00019720395048280215, - "loss": 0.9538, + "grad_norm": 0.36328125, + "learning_rate": 0.00018627671516888104, + "loss": 0.8724, "step": 2035 }, { "epoch": 2.523191094619666, - "grad_norm": 0.478515625, - "learning_rate": 0.00019717003505109095, - "loss": 0.9492, + "grad_norm": 0.380859375, + "learning_rate": 0.0001861673009777325, + "loss": 0.8683, "step": 2040 }, { "epoch": 2.5293753865182436, - "grad_norm": 0.453125, - "learning_rate": 0.0001971359181162575, - "loss": 0.9496, + "grad_norm": 0.32421875, + "learning_rate": 0.0001860574847418903, + "loss": 0.8693, "step": 2045 }, { "epoch": 2.5355596784168215, - "grad_norm": 0.443359375, - "learning_rate": 0.00019710159974905064, - "loss": 0.9625, + "grad_norm": 0.326171875, + "learning_rate": 0.00018594726697374175, + "loss": 0.8809, "step": 2050 }, { "epoch": 2.541743970315399, - "grad_norm": 0.482421875, - "learning_rate": 0.00019706708002063694, - "loss": 0.9555, + "grad_norm": 0.328125, + "learning_rate": 0.00018583664818754776, + "loss": 0.8744, "step": 2055 }, { "epoch": 2.5479282622139765, - "grad_norm": 0.4453125, - "learning_rate": 0.00019703235900260055, - "loss": 0.9655, + "grad_norm": 0.326171875, + "learning_rate": 0.0001857256288994402, + "loss": 0.8833, "step": 2060 }, { "epoch": 2.554112554112554, - "grad_norm": 0.443359375, - "learning_rate": 0.00019699743676694303, - "loss": 0.9554, + "grad_norm": 0.30859375, + "learning_rate": 0.00018561420962741977, + "loss": 0.8742, "step": 2065 }, { "epoch": 2.5602968460111315, - "grad_norm": 0.474609375, - "learning_rate": 0.00019696231338608316, - "loss": 0.9522, + "grad_norm": 0.333984375, + "learning_rate": 0.00018550239089135334, + "loss": 0.8714, "step": 2070 }, { "epoch": 2.5664811379097094, - "grad_norm": 0.60546875, - "learning_rate": 0.00019692698893285693, - "loss": 0.952, + "grad_norm": 0.46484375, + "learning_rate": 0.00018539017321297162, + "loss": 0.8716, "step": 2075 }, { "epoch": 2.572665429808287, - "grad_norm": 0.43359375, - "learning_rate": 0.00019689146348051719, - "loss": 0.9524, + "grad_norm": 0.34765625, + "learning_rate": 0.00018527755711586678, + "loss": 0.8731, "step": 2080 }, { "epoch": 2.5788497217068644, - "grad_norm": 0.5078125, - "learning_rate": 0.00019685573710273376, - "loss": 0.9523, + "grad_norm": 0.408203125, + "learning_rate": 0.00018516454312548995, + "loss": 0.8722, "step": 2085 }, { "epoch": 2.5850340136054424, - "grad_norm": 0.51953125, - "learning_rate": 0.0001968198098735929, - "loss": 0.9491, + "grad_norm": 0.3671875, + "learning_rate": 0.0001850511317691487, + "loss": 0.8711, "step": 2090 }, { "epoch": 2.59121830550402, - "grad_norm": 0.40234375, - "learning_rate": 0.0001967836818675976, - "loss": 0.9496, + "grad_norm": 0.322265625, + "learning_rate": 0.00018493732357600478, + "loss": 0.8695, "step": 2095 }, { "epoch": 2.5974025974025974, - "grad_norm": 0.470703125, - "learning_rate": 0.0001967473531596671, - "loss": 0.9408, + "grad_norm": 0.3671875, + "learning_rate": 0.0001848231190770714, + "loss": 0.8641, "step": 2100 }, { "epoch": 2.603586889301175, - "grad_norm": 0.45703125, - "learning_rate": 0.0001967108238251368, - "loss": 0.9526, + "grad_norm": 0.3203125, + "learning_rate": 0.00018470851880521098, + "loss": 0.8726, "step": 2105 }, { "epoch": 2.6097711811997524, - "grad_norm": 0.462890625, - "learning_rate": 0.00019667409393975822, - "loss": 0.9476, + "grad_norm": 0.31640625, + "learning_rate": 0.0001845935232951325, + "loss": 0.8671, "step": 2110 }, { "epoch": 2.6159554730983303, - "grad_norm": 0.60546875, - "learning_rate": 0.00019663716357969874, - "loss": 0.9495, + "grad_norm": 0.392578125, + "learning_rate": 0.00018447813308338908, + "loss": 0.8691, "step": 2115 }, { "epoch": 2.622139764996908, - "grad_norm": 0.51953125, - "learning_rate": 0.00019660003282154147, - "loss": 0.945, + "grad_norm": 0.34375, + "learning_rate": 0.00018436234870837547, + "loss": 0.8645, "step": 2120 }, { "epoch": 2.6283240568954853, - "grad_norm": 0.421875, - "learning_rate": 0.00019656270174228503, - "loss": 0.9506, + "grad_norm": 0.353515625, + "learning_rate": 0.00018424617071032557, + "loss": 0.8724, "step": 2125 }, { "epoch": 2.6345083487940633, - "grad_norm": 0.427734375, - "learning_rate": 0.00019652517041934356, - "loss": 0.9483, + "grad_norm": 0.328125, + "learning_rate": 0.00018412959963130975, + "loss": 0.8703, "step": 2130 }, { "epoch": 2.6406926406926408, - "grad_norm": 0.458984375, - "learning_rate": 0.0001964874389305464, - "loss": 0.948, + "grad_norm": 0.353515625, + "learning_rate": 0.00018401263601523259, + "loss": 0.868, "step": 2135 }, { "epoch": 2.6468769325912183, - "grad_norm": 0.625, - "learning_rate": 0.00019644950735413788, - "loss": 0.9464, + "grad_norm": 0.33984375, + "learning_rate": 0.00018389528040783012, + "loss": 0.8662, "step": 2140 }, { "epoch": 2.6530612244897958, - "grad_norm": 0.66015625, - "learning_rate": 0.00019641137576877744, - "loss": 0.944, + "grad_norm": 0.314453125, + "learning_rate": 0.00018377753335666733, + "loss": 0.8641, "step": 2145 }, { "epoch": 2.6592455163883733, - "grad_norm": 0.54296875, - "learning_rate": 0.00019637304425353916, - "loss": 0.9437, + "grad_norm": 0.3359375, + "learning_rate": 0.00018365939541113566, + "loss": 0.8635, "step": 2150 }, { "epoch": 2.665429808286951, - "grad_norm": 0.56640625, - "learning_rate": 0.00019633451288791166, - "loss": 0.9494, + "grad_norm": 0.34375, + "learning_rate": 0.0001835408671224504, + "loss": 0.8702, "step": 2155 }, { "epoch": 2.6716141001855287, - "grad_norm": 0.6328125, - "learning_rate": 0.0001962957817517982, - "loss": 0.9497, + "grad_norm": 0.37109375, + "learning_rate": 0.00018342194904364813, + "loss": 0.8679, "step": 2160 }, { "epoch": 2.6777983920841066, - "grad_norm": 0.470703125, - "learning_rate": 0.00019625685092551612, - "loss": 0.9436, + "grad_norm": 0.31640625, + "learning_rate": 0.00018330264172958415, + "loss": 0.8634, "step": 2165 }, { "epoch": 2.683982683982684, - "grad_norm": 0.5390625, - "learning_rate": 0.0001962177204897969, - "loss": 0.9509, + "grad_norm": 0.337890625, + "learning_rate": 0.00018318294573692985, + "loss": 0.8745, "step": 2170 }, { "epoch": 2.6901669758812616, - "grad_norm": 0.515625, - "learning_rate": 0.00019617839052578603, - "loss": 0.9474, + "grad_norm": 0.3359375, + "learning_rate": 0.00018306286162417015, + "loss": 0.8697, "step": 2175 }, { "epoch": 2.696351267779839, - "grad_norm": 0.62890625, - "learning_rate": 0.0001961388611150427, - "loss": 0.942, + "grad_norm": 0.359375, + "learning_rate": 0.00018294238995160094, + "loss": 0.8625, "step": 2180 }, { "epoch": 2.7025355596784166, - "grad_norm": 0.58203125, - "learning_rate": 0.00019609913233953967, - "loss": 0.9558, + "grad_norm": 0.47265625, + "learning_rate": 0.00018282153128132628, + "loss": 0.8762, "step": 2185 }, { "epoch": 2.7087198515769946, - "grad_norm": 0.453125, - "learning_rate": 0.00019605920428166323, - "loss": 0.9616, + "grad_norm": 0.361328125, + "learning_rate": 0.00018270028617725607, + "loss": 0.883, "step": 2190 }, { "epoch": 2.714904143475572, - "grad_norm": 0.4453125, - "learning_rate": 0.0001960190770242128, - "loss": 0.9594, + "grad_norm": 0.3828125, + "learning_rate": 0.00018257865520510312, + "loss": 0.8819, "step": 2195 }, { "epoch": 2.7210884353741496, - "grad_norm": 0.44140625, - "learning_rate": 0.00019597875065040094, - "loss": 0.9455, + "grad_norm": 0.345703125, + "learning_rate": 0.00018245663893238075, + "loss": 0.8659, "step": 2200 }, { "epoch": 2.7272727272727275, - "grad_norm": 0.48828125, - "learning_rate": 0.00019593822524385316, - "loss": 0.9458, + "grad_norm": 0.384765625, + "learning_rate": 0.00018233423792839992, + "loss": 0.868, "step": 2205 }, { "epoch": 2.733457019171305, - "grad_norm": 0.435546875, - "learning_rate": 0.00019589750088860766, - "loss": 0.9468, + "grad_norm": 0.314453125, + "learning_rate": 0.00018221145276426683, + "loss": 0.8671, "step": 2210 }, { "epoch": 2.7396413110698825, - "grad_norm": 0.50390625, - "learning_rate": 0.00019585657766911524, - "loss": 0.9448, + "grad_norm": 0.30859375, + "learning_rate": 0.00018208828401288004, + "loss": 0.8668, "step": 2215 }, { "epoch": 2.74582560296846, - "grad_norm": 0.58203125, - "learning_rate": 0.000195815455670239, - "loss": 0.9415, + "grad_norm": 0.373046875, + "learning_rate": 0.00018196473224892784, + "loss": 0.8662, "step": 2220 }, { "epoch": 2.7520098948670375, - "grad_norm": 0.419921875, - "learning_rate": 0.00019577413497725438, - "loss": 0.9419, + "grad_norm": 0.33984375, + "learning_rate": 0.00018184079804888572, + "loss": 0.8663, "step": 2225 }, { "epoch": 2.7581941867656155, - "grad_norm": 0.5078125, - "learning_rate": 0.00019573261567584874, - "loss": 0.9409, + "grad_norm": 0.328125, + "learning_rate": 0.00018171648199101346, + "loss": 0.8639, "step": 2230 }, { "epoch": 2.764378478664193, - "grad_norm": 0.46875, - "learning_rate": 0.0001956908978521214, - "loss": 0.9544, + "grad_norm": 0.330078125, + "learning_rate": 0.00018159178465535256, + "loss": 0.8757, "step": 2235 }, { "epoch": 2.7705627705627704, - "grad_norm": 0.71875, - "learning_rate": 0.00019564898159258324, - "loss": 0.9327, + "grad_norm": 0.392578125, + "learning_rate": 0.00018146670662372354, + "loss": 0.8566, "step": 2240 }, { "epoch": 2.7767470624613484, - "grad_norm": 0.578125, - "learning_rate": 0.00019560686698415677, - "loss": 0.9478, + "grad_norm": 0.3046875, + "learning_rate": 0.00018134124847972316, + "loss": 0.8673, "step": 2245 }, { "epoch": 2.782931354359926, - "grad_norm": 0.462890625, - "learning_rate": 0.00019556455411417573, - "loss": 0.9384, + "grad_norm": 0.32421875, + "learning_rate": 0.00018121541080872176, + "loss": 0.8619, "step": 2250 }, { "epoch": 2.7891156462585034, - "grad_norm": 0.48046875, - "learning_rate": 0.00019552204307038502, - "loss": 0.9451, + "grad_norm": 0.298828125, + "learning_rate": 0.00018108919419786046, + "loss": 0.8684, "step": 2255 }, { "epoch": 2.795299938157081, - "grad_norm": 0.51953125, - "learning_rate": 0.0001954793339409405, - "loss": 0.9485, + "grad_norm": 0.345703125, + "learning_rate": 0.0001809625992360485, + "loss": 0.8708, "step": 2260 }, { "epoch": 2.8014842300556584, - "grad_norm": 0.5625, - "learning_rate": 0.0001954364268144088, - "loss": 0.9571, + "grad_norm": 0.345703125, + "learning_rate": 0.0001808356265139605, + "loss": 0.8784, "step": 2265 }, { "epoch": 2.8076685219542363, - "grad_norm": 0.46875, - "learning_rate": 0.00019539332177976714, - "loss": 0.9504, + "grad_norm": 0.31640625, + "learning_rate": 0.00018070827662403349, + "loss": 0.8718, "step": 2270 }, { "epoch": 2.813852813852814, - "grad_norm": 0.50390625, - "learning_rate": 0.00019535001892640317, - "loss": 0.9422, + "grad_norm": 0.33984375, + "learning_rate": 0.0001805805501604645, + "loss": 0.8626, "step": 2275 }, { "epoch": 2.8200371057513913, - "grad_norm": 0.45703125, - "learning_rate": 0.00019530651834411474, - "loss": 0.9473, + "grad_norm": 0.310546875, + "learning_rate": 0.0001804524477192075, + "loss": 0.8692, "step": 2280 }, { "epoch": 2.8262213976499693, - "grad_norm": 0.4375, - "learning_rate": 0.00019526282012310975, - "loss": 0.9467, + "grad_norm": 0.310546875, + "learning_rate": 0.00018032396989797072, + "loss": 0.8676, "step": 2285 }, { "epoch": 2.8324056895485468, - "grad_norm": 0.455078125, - "learning_rate": 0.00019521892435400587, - "loss": 0.955, + "grad_norm": 0.333984375, + "learning_rate": 0.0001801951172962139, + "loss": 0.8754, "step": 2290 }, { "epoch": 2.8385899814471243, - "grad_norm": 0.462890625, - "learning_rate": 0.00019517483112783054, - "loss": 0.9507, + "grad_norm": 0.298828125, + "learning_rate": 0.0001800658905151454, + "loss": 0.8706, "step": 2295 }, { "epoch": 2.8447742733457018, - "grad_norm": 0.47265625, - "learning_rate": 0.00019513054053602055, - "loss": 0.9447, + "grad_norm": 0.3203125, + "learning_rate": 0.0001799362901577196, + "loss": 0.8658, "step": 2300 }, { "epoch": 2.8509585652442793, - "grad_norm": 0.46484375, - "learning_rate": 0.00019508605267042214, - "loss": 0.9553, + "grad_norm": 0.326171875, + "learning_rate": 0.0001798063168286337, + "loss": 0.8768, "step": 2305 }, { "epoch": 2.857142857142857, - "grad_norm": 0.44140625, - "learning_rate": 0.00019504136762329047, - "loss": 0.9454, + "grad_norm": 0.3203125, + "learning_rate": 0.0001796759711343253, + "loss": 0.8665, "step": 2310 }, { "epoch": 2.8633271490414347, - "grad_norm": 0.55859375, - "learning_rate": 0.00019499648548728965, - "loss": 0.9529, + "grad_norm": 0.359375, + "learning_rate": 0.00017954525368296933, + "loss": 0.8761, "step": 2315 }, { "epoch": 2.869511440940012, - "grad_norm": 0.57421875, - "learning_rate": 0.00019495140635549261, - "loss": 0.9455, + "grad_norm": 0.392578125, + "learning_rate": 0.00017941416508447536, + "loss": 0.8671, "step": 2320 }, { "epoch": 2.87569573283859, - "grad_norm": 0.431640625, - "learning_rate": 0.00019490613032138062, - "loss": 0.9468, + "grad_norm": 0.333984375, + "learning_rate": 0.0001792827059504846, + "loss": 0.8687, "step": 2325 }, { "epoch": 2.8818800247371676, - "grad_norm": 0.515625, - "learning_rate": 0.00019486065747884333, - "loss": 0.9527, + "grad_norm": 0.376953125, + "learning_rate": 0.0001791508768943672, + "loss": 0.874, "step": 2330 }, { "epoch": 2.888064316635745, - "grad_norm": 0.451171875, - "learning_rate": 0.0001948149879221786, - "loss": 0.9526, + "grad_norm": 0.32421875, + "learning_rate": 0.00017901867853121925, + "loss": 0.8737, "step": 2335 }, { "epoch": 2.8942486085343226, - "grad_norm": 0.48046875, - "learning_rate": 0.0001947691217460921, - "loss": 0.9494, + "grad_norm": 0.33203125, + "learning_rate": 0.00017888611147786002, + "loss": 0.871, "step": 2340 }, { "epoch": 2.9004329004329006, - "grad_norm": 0.52734375, - "learning_rate": 0.00019472305904569729, - "loss": 0.9309, + "grad_norm": 0.365234375, + "learning_rate": 0.000178753176352829, + "loss": 0.8551, "step": 2345 }, { "epoch": 2.906617192331478, - "grad_norm": 0.44921875, - "learning_rate": 0.0001946767999165152, - "loss": 0.9505, + "grad_norm": 0.34765625, + "learning_rate": 0.00017861987377638312, + "loss": 0.8738, "step": 2350 }, { "epoch": 2.9128014842300556, - "grad_norm": 0.5, - "learning_rate": 0.0001946303444544741, - "loss": 0.9355, + "grad_norm": 0.357421875, + "learning_rate": 0.0001784862043704937, + "loss": 0.8611, "step": 2355 }, { "epoch": 2.9189857761286335, - "grad_norm": 0.4140625, - "learning_rate": 0.00019458369275590954, - "loss": 0.9423, + "grad_norm": 0.318359375, + "learning_rate": 0.00017835216875884368, + "loss": 0.8659, "step": 2360 }, { "epoch": 2.925170068027211, - "grad_norm": 0.470703125, - "learning_rate": 0.00019453684491756382, - "loss": 0.9392, + "grad_norm": 0.349609375, + "learning_rate": 0.0001782177675668247, + "loss": 0.8627, "step": 2365 }, { "epoch": 2.9313543599257885, - "grad_norm": 0.4609375, - "learning_rate": 0.00019448980103658613, - "loss": 0.9435, + "grad_norm": 0.296875, + "learning_rate": 0.00017808300142153406, + "loss": 0.8658, "step": 2370 }, { "epoch": 2.937538651824366, - "grad_norm": 0.6328125, - "learning_rate": 0.00019444256121053217, - "loss": 0.9505, + "grad_norm": 0.294921875, + "learning_rate": 0.00017794787095177196, + "loss": 0.8727, "step": 2375 }, { "epoch": 2.9437229437229435, - "grad_norm": 0.58984375, - "learning_rate": 0.00019439512553736394, - "loss": 0.945, + "grad_norm": 0.3203125, + "learning_rate": 0.00017781237678803847, + "loss": 0.868, "step": 2380 }, { "epoch": 2.9499072356215215, - "grad_norm": 0.5, - "learning_rate": 0.00019434749411544958, - "loss": 0.9408, + "grad_norm": 0.322265625, + "learning_rate": 0.00017767651956253054, + "loss": 0.8638, "step": 2385 }, { "epoch": 2.956091527520099, - "grad_norm": 0.48046875, - "learning_rate": 0.0001942996670435632, - "loss": 0.9521, + "grad_norm": 0.318359375, + "learning_rate": 0.00017754029990913926, + "loss": 0.8746, "step": 2390 }, { "epoch": 2.9622758194186765, - "grad_norm": 0.5625, - "learning_rate": 0.00019425164442088451, - "loss": 0.9453, + "grad_norm": 0.330078125, + "learning_rate": 0.00017740371846344655, + "loss": 0.8681, "step": 2395 }, { "epoch": 2.9684601113172544, - "grad_norm": 0.466796875, - "learning_rate": 0.0001942034263469989, - "loss": 0.9481, + "grad_norm": 0.384765625, + "learning_rate": 0.00017726677586272263, + "loss": 0.8708, "step": 2400 }, { "epoch": 2.974644403215832, - "grad_norm": 0.478515625, - "learning_rate": 0.000194155012921897, - "loss": 0.9466, + "grad_norm": 0.296875, + "learning_rate": 0.00017712947274592267, + "loss": 0.8712, "step": 2405 }, { "epoch": 2.9808286951144094, - "grad_norm": 0.474609375, - "learning_rate": 0.0001941064042459745, - "loss": 0.9376, + "grad_norm": 0.341796875, + "learning_rate": 0.00017699180975368396, + "loss": 0.8614, "step": 2410 }, { "epoch": 2.987012987012987, - "grad_norm": 0.49609375, - "learning_rate": 0.00019405760042003203, - "loss": 0.9422, + "grad_norm": 0.3125, + "learning_rate": 0.00017685378752832305, + "loss": 0.8642, "step": 2415 }, { "epoch": 2.9931972789115644, - "grad_norm": 0.51171875, - "learning_rate": 0.00019400860154527493, - "loss": 0.9541, + "grad_norm": 0.31640625, + "learning_rate": 0.00017671540671383243, + "loss": 0.8755, "step": 2420 }, { "epoch": 2.9993815708101423, - "grad_norm": 0.484375, - "learning_rate": 0.0001939594077233129, - "loss": 0.9526, + "grad_norm": 0.3203125, + "learning_rate": 0.00017657666795587788, + "loss": 0.8752, "step": 2425 }, { "epoch": 2.9993815708101423, - "eval_loss": 2.465909481048584, - "eval_runtime": 0.6411, - "eval_samples_per_second": 15.599, - "eval_steps_per_second": 1.56, + "eval_loss": 2.514435291290283, + "eval_runtime": 0.9539, + "eval_samples_per_second": 10.484, + "eval_steps_per_second": 1.048, "step": 2425 }, { "epoch": 3.00556586270872, - "grad_norm": 0.462890625, - "learning_rate": 0.0001939100190561601, - "loss": 0.935, + "grad_norm": 0.330078125, + "learning_rate": 0.00017643757190179523, + "loss": 0.8414, "step": 2430 }, { "epoch": 3.0117501546072973, - "grad_norm": 0.484375, - "learning_rate": 0.00019386043564623452, - "loss": 0.9371, + "grad_norm": 0.322265625, + "learning_rate": 0.00017629811920058733, + "loss": 0.8423, "step": 2435 }, { "epoch": 3.0179344465058753, - "grad_norm": 0.5234375, - "learning_rate": 0.00019381065759635822, - "loss": 0.932, + "grad_norm": 0.310546875, + "learning_rate": 0.0001761583105029213, + "loss": 0.8393, "step": 2440 }, { "epoch": 3.0241187384044528, - "grad_norm": 0.435546875, - "learning_rate": 0.00019376068500975667, - "loss": 0.93, + "grad_norm": 0.328125, + "learning_rate": 0.00017601814646112506, + "loss": 0.838, "step": 2445 }, { "epoch": 3.0303030303030303, - "grad_norm": 0.4765625, - "learning_rate": 0.0001937105179900589, - "loss": 0.9412, + "grad_norm": 0.318359375, + "learning_rate": 0.00017587762772918467, + "loss": 0.8458, "step": 2450 }, { "epoch": 3.0364873222016078, - "grad_norm": 0.53125, - "learning_rate": 0.00019366015664129714, - "loss": 0.9272, + "grad_norm": 0.333984375, + "learning_rate": 0.00017573675496274102, + "loss": 0.8339, "step": 2455 }, { "epoch": 3.0426716141001857, - "grad_norm": 0.5, - "learning_rate": 0.00019360960106790643, - "loss": 0.9351, + "grad_norm": 0.3125, + "learning_rate": 0.00017559552881908695, + "loss": 0.8413, "step": 2460 }, { "epoch": 3.048855905998763, - "grad_norm": 0.412109375, - "learning_rate": 0.00019355885137472488, - "loss": 0.9307, + "grad_norm": 0.337890625, + "learning_rate": 0.00017545394995716418, + "loss": 0.8363, "step": 2465 }, { "epoch": 3.0550401978973407, - "grad_norm": 0.5703125, - "learning_rate": 0.00019350790766699282, - "loss": 0.9332, + "grad_norm": 0.357421875, + "learning_rate": 0.00017531201903755994, + "loss": 0.8377, "step": 2470 }, { "epoch": 3.061224489795918, - "grad_norm": 0.6484375, - "learning_rate": 0.00019345677005035315, - "loss": 0.9351, + "grad_norm": 0.330078125, + "learning_rate": 0.00017516973672250432, + "loss": 0.8415, "step": 2475 }, { "epoch": 3.067408781694496, - "grad_norm": 0.50390625, - "learning_rate": 0.0001934054386308508, - "loss": 0.9241, + "grad_norm": 0.37890625, + "learning_rate": 0.00017502710367586687, + "loss": 0.8313, "step": 2480 }, { "epoch": 3.0735930735930737, - "grad_norm": 0.49609375, - "learning_rate": 0.00019335391351493257, - "loss": 0.9182, + "grad_norm": 0.365234375, + "learning_rate": 0.0001748841205631537, + "loss": 0.8271, "step": 2485 }, { "epoch": 3.079777365491651, - "grad_norm": 0.43359375, - "learning_rate": 0.00019330219480944694, - "loss": 0.9133, + "grad_norm": 0.34765625, + "learning_rate": 0.0001747407880515041, + "loss": 0.8222, "step": 2490 }, { "epoch": 3.0859616573902287, - "grad_norm": 0.4609375, - "learning_rate": 0.00019325028262164384, - "loss": 0.9386, + "grad_norm": 0.357421875, + "learning_rate": 0.0001745971068096878, + "loss": 0.8444, "step": 2495 }, { "epoch": 3.0921459492888066, - "grad_norm": 0.439453125, - "learning_rate": 0.0001931981770591745, - "loss": 0.9302, + "grad_norm": 0.306640625, + "learning_rate": 0.0001744530775081015, + "loss": 0.8366, "step": 2500 }, { "epoch": 3.098330241187384, - "grad_norm": 0.421875, - "learning_rate": 0.00019314587823009103, - "loss": 0.9421, + "grad_norm": 0.341796875, + "learning_rate": 0.0001743087008187661, + "loss": 0.8482, "step": 2505 }, { "epoch": 3.1045145330859616, - "grad_norm": 0.486328125, - "learning_rate": 0.00019309338624284644, - "loss": 0.9286, + "grad_norm": 0.33203125, + "learning_rate": 0.00017416397741532315, + "loss": 0.8387, "step": 2510 }, { "epoch": 3.110698824984539, - "grad_norm": 0.50390625, - "learning_rate": 0.0001930407012062942, - "loss": 0.9291, + "grad_norm": 0.35546875, + "learning_rate": 0.00017401890797303206, + "loss": 0.8376, "step": 2515 }, { "epoch": 3.116883116883117, - "grad_norm": 0.431640625, - "learning_rate": 0.00019298782322968815, - "loss": 0.9332, + "grad_norm": 0.3671875, + "learning_rate": 0.00017387349316876666, + "loss": 0.8405, "step": 2520 }, { "epoch": 3.1230674087816945, - "grad_norm": 0.43359375, - "learning_rate": 0.00019293475242268223, - "loss": 0.9427, + "grad_norm": 0.326171875, + "learning_rate": 0.0001737277336810124, + "loss": 0.8484, "step": 2525 }, { "epoch": 3.129251700680272, - "grad_norm": 0.5625, - "learning_rate": 0.0001928814888953303, - "loss": 0.9277, + "grad_norm": 0.3203125, + "learning_rate": 0.00017358163018986282, + "loss": 0.8368, "step": 2530 }, { "epoch": 3.1354359925788495, - "grad_norm": 0.4140625, - "learning_rate": 0.0001928280327580858, - "loss": 0.9286, + "grad_norm": 0.3515625, + "learning_rate": 0.00017343518337701658, + "loss": 0.8367, "step": 2535 }, { "epoch": 3.1416202844774275, - "grad_norm": 0.431640625, - "learning_rate": 0.0001927743841218016, - "loss": 0.9214, + "grad_norm": 0.330078125, + "learning_rate": 0.0001732883939257742, + "loss": 0.8324, "step": 2540 }, { "epoch": 3.147804576376005, - "grad_norm": 0.66796875, - "learning_rate": 0.00019272054309772978, - "loss": 0.9328, + "grad_norm": 0.318359375, + "learning_rate": 0.000173141262521035, + "loss": 0.8393, "step": 2545 }, { "epoch": 3.1539888682745825, - "grad_norm": 0.5390625, - "learning_rate": 0.00019266650979752136, - "loss": 0.9427, + "grad_norm": 0.353515625, + "learning_rate": 0.00017299378984929366, + "loss": 0.8502, "step": 2550 }, { "epoch": 3.16017316017316, - "grad_norm": 0.69921875, - "learning_rate": 0.0001926122843332261, - "loss": 0.9285, + "grad_norm": 0.33203125, + "learning_rate": 0.0001728459765986373, + "loss": 0.8363, "step": 2555 }, { "epoch": 3.166357452071738, - "grad_norm": 0.58984375, - "learning_rate": 0.00019255786681729225, - "loss": 0.9344, + "grad_norm": 0.3359375, + "learning_rate": 0.00017269782345874203, + "loss": 0.8427, "step": 2560 }, { "epoch": 3.1725417439703154, - "grad_norm": 0.56640625, - "learning_rate": 0.00019250325736256633, - "loss": 0.9332, + "grad_norm": 0.337890625, + "learning_rate": 0.00017254933112086996, + "loss": 0.8413, "step": 2565 }, { "epoch": 3.178726035868893, - "grad_norm": 0.51171875, - "learning_rate": 0.00019244845608229293, - "loss": 0.9357, + "grad_norm": 0.369140625, + "learning_rate": 0.0001724005002778657, + "loss": 0.8452, "step": 2570 }, { "epoch": 3.1849103277674704, - "grad_norm": 0.4765625, - "learning_rate": 0.00019239346309011426, - "loss": 0.937, + "grad_norm": 0.326171875, + "learning_rate": 0.00017225133162415338, + "loss": 0.8458, "step": 2575 }, { "epoch": 3.1910946196660483, - "grad_norm": 0.46875, - "learning_rate": 0.00019233827850007027, - "loss": 0.9332, + "grad_norm": 0.3203125, + "learning_rate": 0.00017210182585573327, + "loss": 0.8419, "step": 2580 }, { "epoch": 3.197278911564626, - "grad_norm": 0.55859375, - "learning_rate": 0.00019228290242659816, - "loss": 0.937, + "grad_norm": 0.337890625, + "learning_rate": 0.00017195198367017862, + "loss": 0.8457, "step": 2585 }, { "epoch": 3.2034632034632033, - "grad_norm": 0.498046875, - "learning_rate": 0.00019222733498453222, - "loss": 0.9214, + "grad_norm": 0.380859375, + "learning_rate": 0.00017180180576663228, + "loss": 0.8353, "step": 2590 }, { "epoch": 3.2096474953617813, - "grad_norm": 0.546875, - "learning_rate": 0.0001921715762891036, - "loss": 0.9329, + "grad_norm": 0.37890625, + "learning_rate": 0.00017165129284580353, + "loss": 0.8453, "step": 2595 }, { "epoch": 3.215831787260359, - "grad_norm": 0.53515625, - "learning_rate": 0.00019211562645594002, - "loss": 0.9317, + "grad_norm": 0.330078125, + "learning_rate": 0.00017150044560996488, + "loss": 0.8424, "step": 2600 }, { "epoch": 3.2220160791589363, - "grad_norm": 0.498046875, - "learning_rate": 0.00019205948560106556, - "loss": 0.9232, + "grad_norm": 0.345703125, + "learning_rate": 0.0001713492647629486, + "loss": 0.836, "step": 2605 }, { "epoch": 3.228200371057514, - "grad_norm": 0.51953125, - "learning_rate": 0.00019200315384090044, - "loss": 0.9325, + "grad_norm": 0.34375, + "learning_rate": 0.00017119775101014358, + "loss": 0.8443, "step": 2610 }, { "epoch": 3.2343846629560917, - "grad_norm": 0.51953125, - "learning_rate": 0.00019194663129226084, - "loss": 0.9319, + "grad_norm": 0.32421875, + "learning_rate": 0.00017104590505849206, + "loss": 0.8454, "step": 2615 }, { "epoch": 3.2405689548546692, - "grad_norm": 0.52734375, - "learning_rate": 0.00019188991807235844, - "loss": 0.9368, + "grad_norm": 0.333984375, + "learning_rate": 0.00017089372761648616, + "loss": 0.8446, "step": 2620 }, { "epoch": 3.2467532467532467, - "grad_norm": 0.51171875, - "learning_rate": 0.00019183301429880043, - "loss": 0.9295, + "grad_norm": 0.36328125, + "learning_rate": 0.00017074121939416478, + "loss": 0.8403, "step": 2625 }, { "epoch": 3.2529375386518242, - "grad_norm": 0.43359375, - "learning_rate": 0.0001917759200895891, - "loss": 0.9421, + "grad_norm": 0.330078125, + "learning_rate": 0.00017058838110311017, + "loss": 0.8529, "step": 2630 }, { "epoch": 3.259121830550402, - "grad_norm": 0.462890625, - "learning_rate": 0.00019171863556312167, - "loss": 0.9335, + "grad_norm": 0.318359375, + "learning_rate": 0.0001704352134564446, + "loss": 0.8456, "step": 2635 }, { "epoch": 3.2653061224489797, - "grad_norm": 0.5234375, - "learning_rate": 0.00019166116083819002, - "loss": 0.9317, + "grad_norm": 0.318359375, + "learning_rate": 0.00017028171716882714, + "loss": 0.842, "step": 2640 }, { "epoch": 3.271490414347557, - "grad_norm": 0.4609375, - "learning_rate": 0.00019160349603398043, - "loss": 0.9297, + "grad_norm": 0.3125, + "learning_rate": 0.00017012789295645016, + "loss": 0.841, "step": 2645 }, { "epoch": 3.2776747062461347, - "grad_norm": 0.55078125, - "learning_rate": 0.00019154564127007336, - "loss": 0.9371, + "grad_norm": 0.330078125, + "learning_rate": 0.00016997374153703625, + "loss": 0.8483, "step": 2650 }, { "epoch": 3.2838589981447126, - "grad_norm": 0.59375, - "learning_rate": 0.00019148759666644325, - "loss": 0.9286, + "grad_norm": 0.349609375, + "learning_rate": 0.00016981926362983442, + "loss": 0.8406, "step": 2655 }, { "epoch": 3.29004329004329, - "grad_norm": 0.396484375, - "learning_rate": 0.0001914293623434581, - "loss": 0.9404, + "grad_norm": 0.33203125, + "learning_rate": 0.00016966445995561727, + "loss": 0.8497, "step": 2660 }, { "epoch": 3.2962275819418676, - "grad_norm": 0.5234375, - "learning_rate": 0.00019137093842187944, - "loss": 0.9292, + "grad_norm": 0.3359375, + "learning_rate": 0.00016950933123667733, + "loss": 0.8411, "step": 2665 }, { "epoch": 3.302411873840445, - "grad_norm": 0.439453125, - "learning_rate": 0.00019131232502286188, - "loss": 0.9224, + "grad_norm": 0.318359375, + "learning_rate": 0.00016935387819682376, + "loss": 0.8346, "step": 2670 }, { "epoch": 3.308596165739023, - "grad_norm": 0.4375, - "learning_rate": 0.00019125352226795307, - "loss": 0.9313, + "grad_norm": 0.36328125, + "learning_rate": 0.0001691981015613788, + "loss": 0.8417, "step": 2675 }, { "epoch": 3.3147804576376005, - "grad_norm": 0.4921875, - "learning_rate": 0.00019119453027909323, - "loss": 0.928, + "grad_norm": 0.34375, + "learning_rate": 0.0001690420020571747, + "loss": 0.839, "step": 2680 }, { "epoch": 3.320964749536178, - "grad_norm": 0.412109375, - "learning_rate": 0.00019113534917861502, - "loss": 0.9357, + "grad_norm": 0.31640625, + "learning_rate": 0.00016888558041255015, + "loss": 0.8469, "step": 2685 }, { "epoch": 3.3271490414347555, - "grad_norm": 0.421875, - "learning_rate": 0.0001910759790892433, - "loss": 0.9339, + "grad_norm": 0.341796875, + "learning_rate": 0.0001687288373573469, + "loss": 0.8449, "step": 2690 }, { "epoch": 3.3333333333333335, - "grad_norm": 0.470703125, - "learning_rate": 0.0001910164201340948, - "loss": 0.9342, + "grad_norm": 0.3359375, + "learning_rate": 0.00016857177362290625, + "loss": 0.8458, "step": 2695 }, { "epoch": 3.339517625231911, - "grad_norm": 0.41796875, - "learning_rate": 0.0001909566724366779, - "loss": 0.9217, + "grad_norm": 0.330078125, + "learning_rate": 0.00016841438994206595, + "loss": 0.8357, "step": 2700 }, { "epoch": 3.3457019171304885, - "grad_norm": 0.5, - "learning_rate": 0.00019089673612089243, - "loss": 0.9239, + "grad_norm": 0.3515625, + "learning_rate": 0.00016825668704915643, + "loss": 0.8378, "step": 2705 }, { "epoch": 3.3518862090290664, - "grad_norm": 0.546875, - "learning_rate": 0.00019083661131102933, - "loss": 0.9289, + "grad_norm": 0.33203125, + "learning_rate": 0.0001680986656799975, + "loss": 0.8388, "step": 2710 }, { "epoch": 3.358070500927644, - "grad_norm": 0.49609375, - "learning_rate": 0.00019077629813177036, - "loss": 0.925, + "grad_norm": 0.32421875, + "learning_rate": 0.00016794032657189504, + "loss": 0.838, "step": 2715 }, { "epoch": 3.3642547928262214, - "grad_norm": 0.43359375, - "learning_rate": 0.00019071579670818808, - "loss": 0.9251, + "grad_norm": 0.33203125, + "learning_rate": 0.00016778167046363734, + "loss": 0.8372, "step": 2720 }, { "epoch": 3.370439084724799, - "grad_norm": 0.52734375, - "learning_rate": 0.00019065510716574516, - "loss": 0.9272, + "grad_norm": 0.310546875, + "learning_rate": 0.00016762269809549184, + "loss": 0.8398, "step": 2725 }, { "epoch": 3.3766233766233764, - "grad_norm": 0.4375, - "learning_rate": 0.00019059422963029464, - "loss": 0.9264, + "grad_norm": 0.306640625, + "learning_rate": 0.00016746341020920167, + "loss": 0.8402, "step": 2730 }, { "epoch": 3.3828076685219544, - "grad_norm": 0.53515625, - "learning_rate": 0.00019053316422807922, - "loss": 0.9169, + "grad_norm": 0.341796875, + "learning_rate": 0.00016730380754798198, + "loss": 0.8319, "step": 2735 }, { "epoch": 3.388991960420532, - "grad_norm": 0.4609375, - "learning_rate": 0.00019047191108573125, - "loss": 0.9299, + "grad_norm": 0.318359375, + "learning_rate": 0.0001671438908565167, + "loss": 0.8434, "step": 2740 }, { "epoch": 3.3951762523191094, - "grad_norm": 0.439453125, - "learning_rate": 0.00019041047033027236, - "loss": 0.9293, + "grad_norm": 0.3125, + "learning_rate": 0.000166983660880955, + "loss": 0.8417, "step": 2745 }, { "epoch": 3.4013605442176873, - "grad_norm": 0.4921875, - "learning_rate": 0.00019034884208911335, - "loss": 0.9163, + "grad_norm": 0.33984375, + "learning_rate": 0.00016682311836890766, + "loss": 0.8294, "step": 2750 }, { "epoch": 3.407544836116265, - "grad_norm": 0.466796875, - "learning_rate": 0.00019028702649005364, - "loss": 0.927, + "grad_norm": 0.384765625, + "learning_rate": 0.00016666226406944395, + "loss": 0.84, "step": 2755 }, { "epoch": 3.4137291280148423, - "grad_norm": 0.45703125, - "learning_rate": 0.00019022502366128135, - "loss": 0.9231, + "grad_norm": 0.388671875, + "learning_rate": 0.00016650109873308765, + "loss": 0.8378, "step": 2760 }, { "epoch": 3.41991341991342, - "grad_norm": 0.427734375, - "learning_rate": 0.00019016283373137274, - "loss": 0.9216, + "grad_norm": 0.33984375, + "learning_rate": 0.0001663396231118139, + "loss": 0.8374, "step": 2765 }, { "epoch": 3.4260977118119973, - "grad_norm": 0.56640625, - "learning_rate": 0.00019010045682929213, - "loss": 0.9191, + "grad_norm": 0.40234375, + "learning_rate": 0.00016617783795904565, + "loss": 0.8331, "step": 2770 }, { "epoch": 3.4322820037105752, - "grad_norm": 0.53125, - "learning_rate": 0.00019003789308439148, - "loss": 0.9167, + "grad_norm": 0.349609375, + "learning_rate": 0.00016601574402965, + "loss": 0.8289, "step": 2775 }, { "epoch": 3.4384662956091527, - "grad_norm": 0.46875, - "learning_rate": 0.00018997514262641035, - "loss": 0.9203, + "grad_norm": 0.326171875, + "learning_rate": 0.00016585334207993476, + "loss": 0.8351, "step": 2780 }, { "epoch": 3.4446505875077302, - "grad_norm": 0.48046875, - "learning_rate": 0.00018991220558547533, - "loss": 0.9316, + "grad_norm": 0.3203125, + "learning_rate": 0.0001656906328676449, + "loss": 0.8474, "step": 2785 }, { "epoch": 3.450834879406308, - "grad_norm": 0.51953125, - "learning_rate": 0.0001898490820921001, - "loss": 0.9255, + "grad_norm": 0.333984375, + "learning_rate": 0.00016552761715195918, + "loss": 0.8373, "step": 2790 }, { "epoch": 3.4570191713048857, - "grad_norm": 0.474609375, - "learning_rate": 0.00018978577227718484, - "loss": 0.9166, + "grad_norm": 0.32421875, + "learning_rate": 0.00016536429569348623, + "loss": 0.8314, "step": 2795 }, { "epoch": 3.463203463203463, - "grad_norm": 0.40234375, - "learning_rate": 0.00018972227627201617, - "loss": 0.928, + "grad_norm": 0.3046875, + "learning_rate": 0.00016520066925426144, + "loss": 0.8397, "step": 2800 }, { "epoch": 3.4693877551020407, - "grad_norm": 0.45703125, - "learning_rate": 0.00018965859420826684, - "loss": 0.9247, + "grad_norm": 0.310546875, + "learning_rate": 0.0001650367385977431, + "loss": 0.8393, "step": 2805 }, { "epoch": 3.4755720470006186, - "grad_norm": 0.40625, - "learning_rate": 0.0001895947262179954, - "loss": 0.9221, + "grad_norm": 0.296875, + "learning_rate": 0.00016487250448880893, + "loss": 0.8369, "step": 2810 }, { "epoch": 3.481756338899196, - "grad_norm": 0.45703125, - "learning_rate": 0.00018953067243364594, - "loss": 0.9183, + "grad_norm": 0.330078125, + "learning_rate": 0.00016470796769375257, + "loss": 0.8336, "step": 2815 }, { "epoch": 3.4879406307977736, - "grad_norm": 0.46484375, - "learning_rate": 0.00018946643298804793, - "loss": 0.925, + "grad_norm": 0.31640625, + "learning_rate": 0.0001645431289802799, + "loss": 0.8415, "step": 2820 }, { "epoch": 3.494124922696351, - "grad_norm": 0.46484375, - "learning_rate": 0.0001894020080144157, - "loss": 0.9341, + "grad_norm": 0.330078125, + "learning_rate": 0.0001643779891175055, + "loss": 0.8482, "step": 2825 }, { "epoch": 3.500309214594929, - "grad_norm": 0.59765625, - "learning_rate": 0.00018933739764634847, - "loss": 0.9188, + "grad_norm": 0.400390625, + "learning_rate": 0.00016421254887594917, + "loss": 0.8354, "step": 2830 }, { "epoch": 3.5064935064935066, - "grad_norm": 0.62890625, - "learning_rate": 0.00018927260201782978, - "loss": 0.9297, + "grad_norm": 0.447265625, + "learning_rate": 0.00016404680902753214, + "loss": 0.8423, "step": 2835 }, { "epoch": 3.512677798392084, - "grad_norm": 0.6171875, - "learning_rate": 0.0001892076212632274, - "loss": 0.9197, + "grad_norm": 0.33203125, + "learning_rate": 0.00016388077034557355, + "loss": 0.8334, "step": 2840 }, { "epoch": 3.5188620902906615, - "grad_norm": 0.56640625, - "learning_rate": 0.00018914245551729305, - "loss": 0.9139, + "grad_norm": 0.373046875, + "learning_rate": 0.00016371443360478692, + "loss": 0.8298, "step": 2845 }, { "epoch": 3.5250463821892395, - "grad_norm": 0.61328125, - "learning_rate": 0.00018907710491516199, - "loss": 0.9252, + "grad_norm": 0.369140625, + "learning_rate": 0.0001635477995812765, + "loss": 0.839, "step": 2850 }, { "epoch": 3.531230674087817, - "grad_norm": 0.482421875, - "learning_rate": 0.00018901156959235285, - "loss": 0.926, + "grad_norm": 0.349609375, + "learning_rate": 0.0001633808690525335, + "loss": 0.8398, "step": 2855 }, { "epoch": 3.5374149659863945, - "grad_norm": 0.478515625, - "learning_rate": 0.00018894584968476733, - "loss": 0.926, + "grad_norm": 0.345703125, + "learning_rate": 0.00016321364279743266, + "loss": 0.8429, "step": 2860 }, { "epoch": 3.5435992578849724, - "grad_norm": 0.5625, - "learning_rate": 0.0001888799453286899, - "loss": 0.9311, + "grad_norm": 0.3671875, + "learning_rate": 0.00016304612159622855, + "loss": 0.8458, "step": 2865 }, { "epoch": 3.54978354978355, - "grad_norm": 0.50390625, - "learning_rate": 0.00018881385666078755, - "loss": 0.9264, + "grad_norm": 0.390625, + "learning_rate": 0.00016287830623055188, + "loss": 0.8421, "step": 2870 }, { "epoch": 3.5559678416821274, - "grad_norm": 0.490234375, - "learning_rate": 0.00018874758381810943, - "loss": 0.9353, + "grad_norm": 0.3203125, + "learning_rate": 0.0001627101974834059, + "loss": 0.8496, "step": 2875 }, { "epoch": 3.562152133580705, - "grad_norm": 0.490234375, - "learning_rate": 0.00018868112693808665, - "loss": 0.931, + "grad_norm": 0.3359375, + "learning_rate": 0.00016254179613916278, + "loss": 0.8466, "step": 2880 }, { "epoch": 3.5683364254792824, - "grad_norm": 0.4921875, - "learning_rate": 0.0001886144861585319, - "loss": 0.9156, + "grad_norm": 0.322265625, + "learning_rate": 0.00016237310298355986, + "loss": 0.8342, "step": 2885 }, { "epoch": 3.5745207173778604, - "grad_norm": 0.455078125, - "learning_rate": 0.00018854766161763932, - "loss": 0.933, + "grad_norm": 0.318359375, + "learning_rate": 0.00016220411880369601, + "loss": 0.8486, "step": 2890 }, { "epoch": 3.580705009276438, - "grad_norm": 0.60546875, - "learning_rate": 0.0001884806534539841, - "loss": 0.9268, + "grad_norm": 0.341796875, + "learning_rate": 0.00016203484438802806, + "loss": 0.8414, "step": 2895 }, { "epoch": 3.5868893011750154, - "grad_norm": 0.60546875, - "learning_rate": 0.00018841346180652213, - "loss": 0.92, + "grad_norm": 0.310546875, + "learning_rate": 0.00016186528052636692, + "loss": 0.8345, "step": 2900 }, { "epoch": 3.5930735930735933, - "grad_norm": 0.49609375, - "learning_rate": 0.00018834608681458988, - "loss": 0.9125, + "grad_norm": 0.31640625, + "learning_rate": 0.00016169542800987418, + "loss": 0.8275, "step": 2905 }, { "epoch": 3.599257884972171, - "grad_norm": 0.5390625, - "learning_rate": 0.00018827852861790398, - "loss": 0.9187, + "grad_norm": 0.365234375, + "learning_rate": 0.0001615252876310581, + "loss": 0.8339, "step": 2910 }, { "epoch": 3.6054421768707483, - "grad_norm": 0.50390625, - "learning_rate": 0.00018821078735656101, - "loss": 0.9253, + "grad_norm": 0.349609375, + "learning_rate": 0.00016135486018377008, + "loss": 0.8399, "step": 2915 }, { "epoch": 3.611626468769326, - "grad_norm": 0.455078125, - "learning_rate": 0.00018814286317103714, - "loss": 0.9273, + "grad_norm": 0.388671875, + "learning_rate": 0.0001611841464632011, + "loss": 0.8415, "step": 2920 }, { "epoch": 3.6178107606679033, - "grad_norm": 0.46484375, - "learning_rate": 0.00018807475620218788, - "loss": 0.9167, + "grad_norm": 0.3125, + "learning_rate": 0.0001610131472658777, + "loss": 0.8321, "step": 2925 }, { "epoch": 3.6239950525664812, - "grad_norm": 0.4609375, - "learning_rate": 0.00018800646659124782, - "loss": 0.9192, + "grad_norm": 0.345703125, + "learning_rate": 0.00016084186338965843, + "loss": 0.8377, "step": 2930 }, { "epoch": 3.6301793444650587, - "grad_norm": 0.419921875, - "learning_rate": 0.00018793799447983025, - "loss": 0.9288, + "grad_norm": 0.345703125, + "learning_rate": 0.00016067029563373013, + "loss": 0.8448, "step": 2935 }, { "epoch": 3.6363636363636362, - "grad_norm": 0.4375, - "learning_rate": 0.00018786934000992688, - "loss": 0.9198, + "grad_norm": 0.33203125, + "learning_rate": 0.00016049844479860422, + "loss": 0.8353, "step": 2940 }, { "epoch": 3.642547928262214, - "grad_norm": 0.6875, - "learning_rate": 0.00018780050332390768, - "loss": 0.9157, + "grad_norm": 0.3046875, + "learning_rate": 0.00016032631168611284, + "loss": 0.8304, "step": 2945 }, { "epoch": 3.6487322201607917, - "grad_norm": 0.47265625, - "learning_rate": 0.00018773148456452046, - "loss": 0.9139, + "grad_norm": 0.333984375, + "learning_rate": 0.00016015389709940538, + "loss": 0.8326, "step": 2950 }, { "epoch": 3.654916512059369, - "grad_norm": 0.50390625, - "learning_rate": 0.00018766228387489048, - "loss": 0.9218, + "grad_norm": 0.34765625, + "learning_rate": 0.0001599812018429443, + "loss": 0.8412, "step": 2955 }, { "epoch": 3.6611008039579467, - "grad_norm": 0.48828125, - "learning_rate": 0.00018759290139852048, - "loss": 0.9315, + "grad_norm": 0.314453125, + "learning_rate": 0.0001598082267225018, + "loss": 0.8449, "step": 2960 }, { "epoch": 3.667285095856524, - "grad_norm": 0.53515625, - "learning_rate": 0.00018752333727928993, - "loss": 0.9291, + "grad_norm": 0.36328125, + "learning_rate": 0.00015963497254515581, + "loss": 0.8423, "step": 2965 }, { "epoch": 3.673469387755102, - "grad_norm": 0.443359375, - "learning_rate": 0.00018745359166145523, - "loss": 0.915, + "grad_norm": 0.3359375, + "learning_rate": 0.00015946144011928638, + "loss": 0.8301, "step": 2970 }, { "epoch": 3.6796536796536796, - "grad_norm": 0.47265625, - "learning_rate": 0.00018738366468964898, - "loss": 0.9188, + "grad_norm": 0.3359375, + "learning_rate": 0.0001592876302545718, + "loss": 0.8345, "step": 2975 }, { "epoch": 3.685837971552257, - "grad_norm": 0.5234375, - "learning_rate": 0.00018731355650887985, - "loss": 0.917, + "grad_norm": 0.326171875, + "learning_rate": 0.0001591135437619847, + "loss": 0.8339, "step": 2980 }, { "epoch": 3.692022263450835, - "grad_norm": 0.490234375, - "learning_rate": 0.00018724326726453244, - "loss": 0.9351, + "grad_norm": 0.306640625, + "learning_rate": 0.00015893918145378866, + "loss": 0.8481, "step": 2985 }, { "epoch": 3.6982065553494126, - "grad_norm": 0.5078125, - "learning_rate": 0.00018717279710236666, - "loss": 0.9178, + "grad_norm": 0.314453125, + "learning_rate": 0.000158764544143534, + "loss": 0.8328, "step": 2990 }, { "epoch": 3.70439084724799, - "grad_norm": 0.4765625, - "learning_rate": 0.0001871021461685177, - "loss": 0.9245, + "grad_norm": 0.3125, + "learning_rate": 0.0001585896326460543, + "loss": 0.8415, "step": 2995 }, { "epoch": 3.7105751391465676, - "grad_norm": 0.41796875, - "learning_rate": 0.00018703131460949554, - "loss": 0.9176, + "grad_norm": 0.314453125, + "learning_rate": 0.0001584144477774623, + "loss": 0.8354, "step": 3000 }, { "epoch": 3.716759431045145, - "grad_norm": 0.5859375, - "learning_rate": 0.0001869603025721848, - "loss": 0.9334, + "grad_norm": 0.33203125, + "learning_rate": 0.00015823899035514639, + "loss": 0.8496, "step": 3005 }, { "epoch": 3.722943722943723, - "grad_norm": 0.482421875, - "learning_rate": 0.00018688911020384432, - "loss": 0.9213, + "grad_norm": 0.3671875, + "learning_rate": 0.00015806326119776663, + "loss": 0.8391, "step": 3010 }, { "epoch": 3.7291280148423005, - "grad_norm": 0.431640625, - "learning_rate": 0.0001868177376521069, - "loss": 0.9271, + "grad_norm": 0.333984375, + "learning_rate": 0.00015788726112525085, + "loss": 0.8435, "step": 3015 }, { "epoch": 3.7353123067408784, - "grad_norm": 0.48046875, - "learning_rate": 0.000186746185064979, - "loss": 0.9239, + "grad_norm": 0.328125, + "learning_rate": 0.00015771099095879108, + "loss": 0.8401, "step": 3020 }, { "epoch": 3.741496598639456, - "grad_norm": 0.4453125, - "learning_rate": 0.00018667445259084036, - "loss": 0.9201, + "grad_norm": 0.33203125, + "learning_rate": 0.0001575344515208395, + "loss": 0.8375, "step": 3025 }, { "epoch": 3.7476808905380334, - "grad_norm": 0.412109375, - "learning_rate": 0.00018660254037844388, - "loss": 0.9236, + "grad_norm": 0.322265625, + "learning_rate": 0.0001573576436351046, + "loss": 0.8404, "step": 3030 }, { "epoch": 3.753865182436611, - "grad_norm": 0.40234375, - "learning_rate": 0.00018653044857691508, - "loss": 0.9212, + "grad_norm": 0.30859375, + "learning_rate": 0.00015718056812654763, + "loss": 0.8402, "step": 3035 }, { "epoch": 3.7600494743351884, - "grad_norm": 0.466796875, - "learning_rate": 0.00018645817733575193, - "loss": 0.9157, + "grad_norm": 0.328125, + "learning_rate": 0.00015700322582137827, + "loss": 0.8328, "step": 3040 }, { "epoch": 3.7662337662337664, - "grad_norm": 0.5078125, - "learning_rate": 0.00018638572680482448, - "loss": 0.9181, + "grad_norm": 0.33984375, + "learning_rate": 0.00015682561754705123, + "loss": 0.8342, "step": 3045 }, { "epoch": 3.772418058132344, - "grad_norm": 0.5546875, - "learning_rate": 0.00018631309713437467, - "loss": 0.9329, + "grad_norm": 0.349609375, + "learning_rate": 0.0001566477441322621, + "loss": 0.8472, "step": 3050 }, { "epoch": 3.7786023500309214, - "grad_norm": 0.6171875, - "learning_rate": 0.00018624028847501585, - "loss": 0.9188, + "grad_norm": 0.361328125, + "learning_rate": 0.0001564696064069436, + "loss": 0.8341, "step": 3055 }, { "epoch": 3.7847866419294993, - "grad_norm": 0.64453125, - "learning_rate": 0.0001861673009777325, - "loss": 0.9255, + "grad_norm": 0.3515625, + "learning_rate": 0.00015629120520226165, + "loss": 0.8415, "step": 3060 }, { "epoch": 3.790970933828077, - "grad_norm": 0.83203125, - "learning_rate": 0.00018609413479388003, - "loss": 0.921, + "grad_norm": 0.42578125, + "learning_rate": 0.0001561125413506116, + "loss": 0.8399, "step": 3065 }, { "epoch": 3.7971552257266543, - "grad_norm": 0.61328125, - "learning_rate": 0.00018602079007518438, - "loss": 0.9226, + "grad_norm": 0.361328125, + "learning_rate": 0.00015593361568561428, + "loss": 0.8374, "step": 3070 }, { "epoch": 3.803339517625232, - "grad_norm": 0.5625, - "learning_rate": 0.00018594726697374175, - "loss": 0.9174, + "grad_norm": 0.330078125, + "learning_rate": 0.000155754429042112, + "loss": 0.8355, "step": 3075 }, { "epoch": 3.8095238095238093, - "grad_norm": 0.55859375, - "learning_rate": 0.00018587356564201817, - "loss": 0.9104, + "grad_norm": 0.345703125, + "learning_rate": 0.00015557498225616487, + "loss": 0.8246, "step": 3080 }, { "epoch": 3.8157081014223873, - "grad_norm": 0.4609375, - "learning_rate": 0.00018579968623284933, - "loss": 0.9279, + "grad_norm": 0.349609375, + "learning_rate": 0.0001553952761650467, + "loss": 0.8422, "step": 3085 }, { "epoch": 3.8218923933209648, - "grad_norm": 0.5390625, - "learning_rate": 0.0001857256288994402, - "loss": 0.9153, + "grad_norm": 0.35546875, + "learning_rate": 0.00015521531160724126, + "loss": 0.8327, "step": 3090 }, { "epoch": 3.8280766852195423, - "grad_norm": 0.478515625, - "learning_rate": 0.00018565139379536473, - "loss": 0.9225, + "grad_norm": 0.341796875, + "learning_rate": 0.0001550350894224382, + "loss": 0.8387, "step": 3095 }, { "epoch": 3.83426097711812, - "grad_norm": 0.423828125, - "learning_rate": 0.00018557698107456549, - "loss": 0.9158, + "grad_norm": 0.345703125, + "learning_rate": 0.0001548546104515294, + "loss": 0.8329, "step": 3100 }, { "epoch": 3.8404452690166977, - "grad_norm": 0.4296875, - "learning_rate": 0.00018550239089135334, - "loss": 0.924, + "grad_norm": 0.337890625, + "learning_rate": 0.0001546738755366046, + "loss": 0.8404, "step": 3105 }, { "epoch": 3.846629560915275, - "grad_norm": 0.40625, - "learning_rate": 0.00018542762340040722, - "loss": 0.92, + "grad_norm": 0.318359375, + "learning_rate": 0.00015449288552094796, + "loss": 0.8376, "step": 3110 }, { "epoch": 3.8528138528138527, - "grad_norm": 0.4765625, - "learning_rate": 0.00018535267875677372, - "loss": 0.9123, + "grad_norm": 0.34375, + "learning_rate": 0.00015431164124903382, + "loss": 0.8304, "step": 3115 }, { "epoch": 3.85899814471243, - "grad_norm": 0.462890625, - "learning_rate": 0.00018527755711586678, - "loss": 0.9152, + "grad_norm": 0.30859375, + "learning_rate": 0.00015413014356652286, + "loss": 0.833, "step": 3120 }, { "epoch": 3.865182436611008, - "grad_norm": 0.54296875, - "learning_rate": 0.00018520225863346743, - "loss": 0.9285, + "grad_norm": 0.33984375, + "learning_rate": 0.00015394839332025811, + "loss": 0.8451, "step": 3125 }, { "epoch": 3.8713667285095856, - "grad_norm": 0.58984375, - "learning_rate": 0.00018512678346572337, - "loss": 0.9203, + "grad_norm": 0.33203125, + "learning_rate": 0.00015376639135826107, + "loss": 0.8373, "step": 3130 }, { "epoch": 3.877551020408163, - "grad_norm": 0.73828125, - "learning_rate": 0.0001850511317691487, - "loss": 0.9214, + "grad_norm": 0.328125, + "learning_rate": 0.00015358413852972766, + "loss": 0.8392, "step": 3135 }, { "epoch": 3.883735312306741, - "grad_norm": 0.66796875, - "learning_rate": 0.00018497530370062363, - "loss": 0.9185, + "grad_norm": 0.31640625, + "learning_rate": 0.0001534016356850244, + "loss": 0.8352, "step": 3140 }, { "epoch": 3.8899196042053186, - "grad_norm": 0.455078125, - "learning_rate": 0.00018489929941739407, - "loss": 0.917, + "grad_norm": 0.38671875, + "learning_rate": 0.00015321888367568422, + "loss": 0.8344, "step": 3145 }, { "epoch": 3.896103896103896, - "grad_norm": 0.4375, - "learning_rate": 0.0001848231190770714, - "loss": 0.9221, + "grad_norm": 0.341796875, + "learning_rate": 0.00015303588335440274, + "loss": 0.8408, "step": 3150 }, { "epoch": 3.9022881880024736, - "grad_norm": 0.45703125, - "learning_rate": 0.00018474676283763205, - "loss": 0.9148, + "grad_norm": 0.3359375, + "learning_rate": 0.00015285263557503407, + "loss": 0.8328, "step": 3155 }, { "epoch": 3.908472479901051, - "grad_norm": 0.462890625, - "learning_rate": 0.00018467023085741717, - "loss": 0.9258, + "grad_norm": 0.345703125, + "learning_rate": 0.000152669141192587, + "loss": 0.8454, "step": 3160 }, { "epoch": 3.914656771799629, - "grad_norm": 0.404296875, - "learning_rate": 0.0001845935232951325, - "loss": 0.9044, + "grad_norm": 0.326171875, + "learning_rate": 0.00015248540106322094, + "loss": 0.8234, "step": 3165 }, { "epoch": 3.9208410636982065, - "grad_norm": 0.4375, - "learning_rate": 0.00018451664030984773, - "loss": 0.9217, + "grad_norm": 0.330078125, + "learning_rate": 0.00015230141604424181, + "loss": 0.8391, "step": 3170 }, { "epoch": 3.927025355596784, - "grad_norm": 0.462890625, - "learning_rate": 0.0001844395820609964, - "loss": 0.9186, + "grad_norm": 0.408203125, + "learning_rate": 0.0001521171869940983, + "loss": 0.8347, "step": 3175 }, { "epoch": 3.933209647495362, - "grad_norm": 0.455078125, - "learning_rate": 0.00018436234870837547, - "loss": 0.9087, + "grad_norm": 0.330078125, + "learning_rate": 0.0001519327147723776, + "loss": 0.8289, "step": 3180 }, { "epoch": 3.9393939393939394, - "grad_norm": 0.43359375, - "learning_rate": 0.00018428494041214507, - "loss": 0.9143, + "grad_norm": 0.3125, + "learning_rate": 0.0001517480002398016, + "loss": 0.8342, "step": 3185 }, { "epoch": 3.945578231292517, - "grad_norm": 0.48828125, - "learning_rate": 0.00018420735733282807, - "loss": 0.923, + "grad_norm": 0.3125, + "learning_rate": 0.00015156304425822267, + "loss": 0.8406, "step": 3190 }, { "epoch": 3.9517625231910944, - "grad_norm": 0.4296875, - "learning_rate": 0.00018412959963130975, - "loss": 0.9181, + "grad_norm": 0.322265625, + "learning_rate": 0.00015137784769061986, + "loss": 0.8361, "step": 3195 }, { "epoch": 3.9579468150896724, - "grad_norm": 0.423828125, - "learning_rate": 0.00018405166746883762, - "loss": 0.9207, + "grad_norm": 0.34375, + "learning_rate": 0.00015119241140109467, + "loss": 0.8391, "step": 3200 }, { "epoch": 3.96413110698825, - "grad_norm": 0.47265625, - "learning_rate": 0.00018397356100702085, - "loss": 0.9107, + "grad_norm": 0.373046875, + "learning_rate": 0.00015100673625486716, + "loss": 0.8312, "step": 3205 }, { "epoch": 3.9703153988868274, - "grad_norm": 0.419921875, - "learning_rate": 0.00018389528040783012, - "loss": 0.9207, + "grad_norm": 0.3203125, + "learning_rate": 0.00015082082311827183, + "loss": 0.8381, "step": 3210 }, { "epoch": 3.9764996907854053, - "grad_norm": 0.45703125, - "learning_rate": 0.00018381682583359723, - "loss": 0.9226, + "grad_norm": 0.345703125, + "learning_rate": 0.00015063467285875365, + "loss": 0.8402, "step": 3215 }, { "epoch": 3.982683982683983, - "grad_norm": 0.431640625, - "learning_rate": 0.00018373819744701476, - "loss": 0.9065, + "grad_norm": 0.32421875, + "learning_rate": 0.000150448286344864, + "loss": 0.827, "step": 3220 }, { "epoch": 3.9888682745825603, - "grad_norm": 0.46875, - "learning_rate": 0.00018365939541113566, - "loss": 0.9145, + "grad_norm": 0.326171875, + "learning_rate": 0.00015026166444625646, + "loss": 0.8326, "step": 3225 }, { "epoch": 3.995052566481138, - "grad_norm": 0.447265625, - "learning_rate": 0.00018358041988937305, - "loss": 0.925, + "grad_norm": 0.359375, + "learning_rate": 0.000150074808033683, + "loss": 0.8424, "step": 3230 }, { "epoch": 4.0, - "eval_loss": 2.475017786026001, - "eval_runtime": 0.5414, - "eval_samples_per_second": 18.47, - "eval_steps_per_second": 1.847, + "eval_loss": 2.5590126514434814, + "eval_runtime": 0.5375, + "eval_samples_per_second": 18.605, + "eval_steps_per_second": 1.861, "step": 3234 }, { "epoch": 4.001236858379715, - "grad_norm": 0.4921875, - "learning_rate": 0.00018350127104549977, - "loss": 0.9148, + "grad_norm": 0.3671875, + "learning_rate": 0.00014988771797898976, + "loss": 0.8301, "step": 3235 }, { "epoch": 4.007421150278293, - "grad_norm": 0.55078125, - "learning_rate": 0.00018342194904364813, - "loss": 0.9175, + "grad_norm": 0.322265625, + "learning_rate": 0.00014970039515511304, + "loss": 0.8174, "step": 3240 }, { "epoch": 4.01360544217687, - "grad_norm": 0.51953125, - "learning_rate": 0.00018334245404830944, - "loss": 0.8931, + "grad_norm": 0.486328125, + "learning_rate": 0.00014951284043607517, + "loss": 0.7991, "step": 3245 }, { "epoch": 4.019789734075449, - "grad_norm": 0.44140625, - "learning_rate": 0.00018326278622433386, - "loss": 0.9231, + "grad_norm": 0.353515625, + "learning_rate": 0.00014932505469698052, + "loss": 0.8198, "step": 3250 }, { "epoch": 4.025974025974026, - "grad_norm": 0.498046875, - "learning_rate": 0.00018318294573692985, - "loss": 0.9013, + "grad_norm": 0.33984375, + "learning_rate": 0.00014913703881401134, + "loss": 0.8031, "step": 3255 }, { "epoch": 4.032158317872604, - "grad_norm": 0.443359375, - "learning_rate": 0.00018310293275166392, - "loss": 0.9067, + "grad_norm": 0.376953125, + "learning_rate": 0.0001489487936644237, + "loss": 0.8086, "step": 3260 }, { "epoch": 4.038342609771181, - "grad_norm": 0.5390625, - "learning_rate": 0.00018302274743446043, - "loss": 0.9111, + "grad_norm": 0.33203125, + "learning_rate": 0.00014876032012654336, + "loss": 0.8114, "step": 3265 }, { "epoch": 4.044526901669759, - "grad_norm": 0.4296875, - "learning_rate": 0.00018294238995160094, - "loss": 0.9146, + "grad_norm": 0.39453125, + "learning_rate": 0.00014857161907976183, + "loss": 0.8151, "step": 3270 }, { "epoch": 4.050711193568336, - "grad_norm": 0.5, - "learning_rate": 0.0001828618604697241, - "loss": 0.8986, + "grad_norm": 0.318359375, + "learning_rate": 0.00014838269140453198, + "loss": 0.8019, "step": 3275 }, { "epoch": 4.056895485466914, - "grad_norm": 0.52734375, - "learning_rate": 0.00018278115915582526, - "loss": 0.8951, + "grad_norm": 0.3671875, + "learning_rate": 0.00014819353798236427, + "loss": 0.7974, "step": 3280 }, { "epoch": 4.063079777365492, - "grad_norm": 0.45703125, - "learning_rate": 0.00018270028617725607, - "loss": 0.9153, + "grad_norm": 0.337890625, + "learning_rate": 0.00014800415969582227, + "loss": 0.8166, "step": 3285 }, { "epoch": 4.06926406926407, - "grad_norm": 0.4609375, - "learning_rate": 0.0001826192417017242, - "loss": 0.9018, + "grad_norm": 0.38671875, + "learning_rate": 0.00014781455742851892, + "loss": 0.8066, "step": 3290 }, { "epoch": 4.075448361162647, - "grad_norm": 0.421875, - "learning_rate": 0.0001825380258972929, - "loss": 0.9111, + "grad_norm": 0.37109375, + "learning_rate": 0.00014762473206511207, + "loss": 0.8136, "step": 3295 }, { "epoch": 4.081632653061225, - "grad_norm": 0.44921875, - "learning_rate": 0.00018245663893238075, - "loss": 0.9133, + "grad_norm": 0.357421875, + "learning_rate": 0.00014743468449130063, + "loss": 0.8167, "step": 3300 }, { "epoch": 4.087816944959802, - "grad_norm": 0.462890625, - "learning_rate": 0.00018237508097576123, - "loss": 0.9063, + "grad_norm": 0.32421875, + "learning_rate": 0.00014724441559382028, + "loss": 0.809, "step": 3305 }, { "epoch": 4.09400123685838, - "grad_norm": 0.447265625, - "learning_rate": 0.0001822933521965625, - "loss": 0.8988, + "grad_norm": 0.408203125, + "learning_rate": 0.0001470539262604393, + "loss": 0.8024, "step": 3310 }, { "epoch": 4.100185528756957, - "grad_norm": 0.47265625, - "learning_rate": 0.00018221145276426683, - "loss": 0.9129, + "grad_norm": 0.337890625, + "learning_rate": 0.00014686321737995454, + "loss": 0.8148, "step": 3315 }, { "epoch": 4.106369820655535, - "grad_norm": 0.412109375, - "learning_rate": 0.00018212938284871047, - "loss": 0.9105, + "grad_norm": 0.322265625, + "learning_rate": 0.0001466722898421873, + "loss": 0.8127, "step": 3320 }, { "epoch": 4.112554112554113, - "grad_norm": 0.44921875, - "learning_rate": 0.00018204714262008316, - "loss": 0.8963, + "grad_norm": 0.357421875, + "learning_rate": 0.0001464811445379789, + "loss": 0.8008, "step": 3325 }, { "epoch": 4.1187384044526905, - "grad_norm": 0.46484375, - "learning_rate": 0.00018196473224892784, - "loss": 0.9094, + "grad_norm": 0.3125, + "learning_rate": 0.00014628978235918695, + "loss": 0.8145, "step": 3330 }, { "epoch": 4.124922696351268, - "grad_norm": 0.44921875, - "learning_rate": 0.00018188215190614027, - "loss": 0.8976, + "grad_norm": 0.3515625, + "learning_rate": 0.0001460982041986809, + "loss": 0.8009, "step": 3335 }, { "epoch": 4.1311069882498455, - "grad_norm": 0.50390625, - "learning_rate": 0.0001817994017629687, - "loss": 0.9048, + "grad_norm": 0.34375, + "learning_rate": 0.00014590641095033787, + "loss": 0.8056, "step": 3340 }, { "epoch": 4.137291280148423, - "grad_norm": 0.53515625, - "learning_rate": 0.00018171648199101346, - "loss": 0.9181, + "grad_norm": 0.34375, + "learning_rate": 0.00014571440350903857, + "loss": 0.8189, "step": 3345 }, { "epoch": 4.1434755720470005, - "grad_norm": 0.515625, - "learning_rate": 0.00018163339276222666, - "loss": 0.9077, + "grad_norm": 0.359375, + "learning_rate": 0.00014552218277066314, + "loss": 0.8103, "step": 3350 }, { "epoch": 4.149659863945578, - "grad_norm": 0.59765625, - "learning_rate": 0.00018155013424891184, - "loss": 0.9036, + "grad_norm": 0.3359375, + "learning_rate": 0.00014532974963208704, + "loss": 0.8073, "step": 3355 }, { "epoch": 4.1558441558441555, - "grad_norm": 0.54296875, - "learning_rate": 0.00018146670662372354, - "loss": 0.9122, + "grad_norm": 0.330078125, + "learning_rate": 0.00014513710499117647, + "loss": 0.8158, "step": 3360 }, { "epoch": 4.162028447742734, - "grad_norm": 0.5546875, - "learning_rate": 0.00018138311005966705, - "loss": 0.9029, + "grad_norm": 0.365234375, + "learning_rate": 0.00014494424974678476, + "loss": 0.8058, "step": 3365 }, { "epoch": 4.168212739641311, - "grad_norm": 0.6953125, - "learning_rate": 0.0001812993447300979, - "loss": 0.9063, + "grad_norm": 0.43359375, + "learning_rate": 0.00014475118479874774, + "loss": 0.8117, "step": 3370 }, { "epoch": 4.174397031539889, - "grad_norm": 0.62109375, - "learning_rate": 0.00018121541080872176, - "loss": 0.9096, + "grad_norm": 0.353515625, + "learning_rate": 0.00014455791104787976, + "loss": 0.814, "step": 3375 }, { "epoch": 4.180581323438466, - "grad_norm": 0.52734375, - "learning_rate": 0.00018113130846959368, - "loss": 0.9121, + "grad_norm": 0.33984375, + "learning_rate": 0.0001443644293959693, + "loss": 0.8154, "step": 3380 }, { "epoch": 4.186765615337044, - "grad_norm": 0.470703125, - "learning_rate": 0.00018104703788711816, - "loss": 0.9073, + "grad_norm": 0.326171875, + "learning_rate": 0.00014417074074577502, + "loss": 0.8108, "step": 3385 }, { "epoch": 4.192949907235621, - "grad_norm": 0.408203125, - "learning_rate": 0.0001809625992360485, - "loss": 0.9113, + "grad_norm": 0.318359375, + "learning_rate": 0.0001439768460010213, + "loss": 0.8166, "step": 3390 }, { "epoch": 4.199134199134199, - "grad_norm": 0.478515625, - "learning_rate": 0.00018087799269148654, - "loss": 0.8939, + "grad_norm": 0.322265625, + "learning_rate": 0.00014378274606639422, + "loss": 0.8002, "step": 3395 }, { "epoch": 4.205318491032776, - "grad_norm": 0.466796875, - "learning_rate": 0.00018079321842888227, - "loss": 0.8998, + "grad_norm": 0.3125, + "learning_rate": 0.00014358844184753712, + "loss": 0.8058, "step": 3400 }, { "epoch": 4.211502782931355, - "grad_norm": 0.484375, - "learning_rate": 0.00018070827662403349, - "loss": 0.897, + "grad_norm": 0.330078125, + "learning_rate": 0.00014339393425104663, + "loss": 0.8011, "step": 3405 }, { "epoch": 4.217687074829932, - "grad_norm": 0.55859375, - "learning_rate": 0.00018062316745308542, - "loss": 0.9122, + "grad_norm": 0.37109375, + "learning_rate": 0.00014319922418446824, + "loss": 0.8159, "step": 3410 }, { "epoch": 4.22387136672851, - "grad_norm": 0.62890625, - "learning_rate": 0.00018053789109253042, - "loss": 0.9, + "grad_norm": 0.37109375, + "learning_rate": 0.0001430043125562922, + "loss": 0.8065, "step": 3415 }, { "epoch": 4.230055658627087, - "grad_norm": 0.478515625, - "learning_rate": 0.0001804524477192075, - "loss": 0.9108, + "grad_norm": 0.337890625, + "learning_rate": 0.00014280920027594907, + "loss": 0.8171, "step": 3420 }, { "epoch": 4.236239950525665, - "grad_norm": 0.5234375, - "learning_rate": 0.00018036683751030194, - "loss": 0.9214, + "grad_norm": 0.3984375, + "learning_rate": 0.00014261388825380586, + "loss": 0.8245, "step": 3425 }, { "epoch": 4.242424242424242, - "grad_norm": 0.51171875, - "learning_rate": 0.0001802810606433451, - "loss": 0.8972, + "grad_norm": 0.33984375, + "learning_rate": 0.00014241837740116132, + "loss": 0.8015, "step": 3430 }, { "epoch": 4.24860853432282, - "grad_norm": 0.4375, - "learning_rate": 0.0001801951172962139, - "loss": 0.9142, + "grad_norm": 0.341796875, + "learning_rate": 0.00014222266863024206, + "loss": 0.8169, "step": 3435 }, { "epoch": 4.254792826221397, - "grad_norm": 0.451171875, - "learning_rate": 0.00018010900764713048, - "loss": 0.8953, + "grad_norm": 0.349609375, + "learning_rate": 0.00014202676285419812, + "loss": 0.8034, "step": 3440 }, { "epoch": 4.260977118119976, - "grad_norm": 0.453125, - "learning_rate": 0.0001800227318746619, - "loss": 0.9033, + "grad_norm": 0.3359375, + "learning_rate": 0.00014183066098709865, + "loss": 0.8083, "step": 3445 }, { "epoch": 4.267161410018553, - "grad_norm": 0.453125, - "learning_rate": 0.0001799362901577196, - "loss": 0.896, + "grad_norm": 0.330078125, + "learning_rate": 0.00014163436394392786, + "loss": 0.801, "step": 3450 }, { "epoch": 4.273345701917131, - "grad_norm": 0.5390625, - "learning_rate": 0.00017984968267555925, - "loss": 0.8991, + "grad_norm": 0.3359375, + "learning_rate": 0.00014143787264058055, + "loss": 0.8035, "step": 3455 }, { "epoch": 4.279529993815708, - "grad_norm": 0.421875, - "learning_rate": 0.00017976290960778024, - "loss": 0.9085, + "grad_norm": 0.353515625, + "learning_rate": 0.00014124118799385796, + "loss": 0.8152, "step": 3460 }, { "epoch": 4.285714285714286, - "grad_norm": 0.443359375, - "learning_rate": 0.0001796759711343253, - "loss": 0.9209, + "grad_norm": 0.349609375, + "learning_rate": 0.00014104431092146338, + "loss": 0.8258, "step": 3465 }, { "epoch": 4.291898577612863, - "grad_norm": 0.44140625, - "learning_rate": 0.0001795888674354802, - "loss": 0.9091, + "grad_norm": 0.41796875, + "learning_rate": 0.000140847242341998, + "loss": 0.818, "step": 3470 }, { "epoch": 4.298082869511441, - "grad_norm": 0.55078125, - "learning_rate": 0.00017950159869187333, - "loss": 0.9103, + "grad_norm": 0.357421875, + "learning_rate": 0.00014064998317495647, + "loss": 0.8159, "step": 3475 }, { "epoch": 4.304267161410019, - "grad_norm": 0.5234375, - "learning_rate": 0.00017941416508447536, - "loss": 0.9115, + "grad_norm": 0.34765625, + "learning_rate": 0.0001404525343407228, + "loss": 0.8164, "step": 3480 }, { "epoch": 4.3104514533085965, - "grad_norm": 0.5078125, - "learning_rate": 0.0001793265667945988, - "loss": 0.9025, + "grad_norm": 0.326171875, + "learning_rate": 0.00014025489676056587, + "loss": 0.8083, "step": 3485 }, { "epoch": 4.316635745207174, - "grad_norm": 0.48046875, - "learning_rate": 0.0001792388040038977, - "loss": 0.9113, + "grad_norm": 0.349609375, + "learning_rate": 0.00014005707135663527, + "loss": 0.8148, "step": 3490 }, { "epoch": 4.3228200371057515, - "grad_norm": 0.44140625, - "learning_rate": 0.0001791508768943672, - "loss": 0.9182, + "grad_norm": 0.34375, + "learning_rate": 0.00013985905905195697, + "loss": 0.8236, "step": 3495 }, { "epoch": 4.329004329004329, - "grad_norm": 0.515625, - "learning_rate": 0.00017906278564834324, - "loss": 0.9105, + "grad_norm": 0.392578125, + "learning_rate": 0.0001396608607704289, + "loss": 0.8153, "step": 3500 }, { "epoch": 4.3351886209029065, - "grad_norm": 0.478515625, - "learning_rate": 0.00017897453044850208, - "loss": 0.9133, + "grad_norm": 0.34375, + "learning_rate": 0.00013946247743681686, + "loss": 0.8156, "step": 3505 }, { "epoch": 4.341372912801484, - "grad_norm": 0.478515625, - "learning_rate": 0.00017888611147786002, - "loss": 0.9112, + "grad_norm": 0.349609375, + "learning_rate": 0.00013926390997674997, + "loss": 0.8181, "step": 3510 }, { "epoch": 4.3475572047000615, - "grad_norm": 0.44140625, - "learning_rate": 0.00017879752891977296, - "loss": 0.9023, + "grad_norm": 0.318359375, + "learning_rate": 0.00013906515931671651, + "loss": 0.809, "step": 3515 }, { "epoch": 4.35374149659864, - "grad_norm": 0.451171875, - "learning_rate": 0.00017870878295793598, - "loss": 0.8993, + "grad_norm": 0.361328125, + "learning_rate": 0.00013886622638405952, + "loss": 0.8045, "step": 3520 }, { "epoch": 4.359925788497217, - "grad_norm": 0.46875, - "learning_rate": 0.00017861987377638312, - "loss": 0.9052, + "grad_norm": 0.3515625, + "learning_rate": 0.00013866711210697256, + "loss": 0.8105, "step": 3525 }, { "epoch": 4.366110080395795, - "grad_norm": 0.4765625, - "learning_rate": 0.0001785308015594868, - "loss": 0.8988, + "grad_norm": 0.328125, + "learning_rate": 0.00013846781741449525, + "loss": 0.8076, "step": 3530 }, { "epoch": 4.372294372294372, - "grad_norm": 0.466796875, - "learning_rate": 0.00017844156649195759, - "loss": 0.9054, + "grad_norm": 0.337890625, + "learning_rate": 0.000138268343236509, + "loss": 0.8137, "step": 3535 }, { "epoch": 4.37847866419295, - "grad_norm": 0.458984375, - "learning_rate": 0.00017835216875884368, - "loss": 0.9039, + "grad_norm": 0.349609375, + "learning_rate": 0.0001380686905037327, + "loss": 0.8085, "step": 3540 }, { "epoch": 4.384662956091527, - "grad_norm": 0.515625, - "learning_rate": 0.00017826260854553072, - "loss": 0.9082, + "grad_norm": 0.330078125, + "learning_rate": 0.00013786886014771843, + "loss": 0.8151, "step": 3545 }, { "epoch": 4.390847247990105, - "grad_norm": 0.65234375, - "learning_rate": 0.00017817288603774116, - "loss": 0.9025, + "grad_norm": 0.330078125, + "learning_rate": 0.00013766885310084688, + "loss": 0.8075, "step": 3550 }, { "epoch": 4.397031539888682, - "grad_norm": 0.43359375, - "learning_rate": 0.00017808300142153406, - "loss": 0.9022, + "grad_norm": 0.31640625, + "learning_rate": 0.00013746867029632324, + "loss": 0.8081, "step": 3555 }, { "epoch": 4.403215831787261, - "grad_norm": 0.40625, - "learning_rate": 0.00017799295488330467, - "loss": 0.8981, + "grad_norm": 0.322265625, + "learning_rate": 0.00013726831266817278, + "loss": 0.8067, "step": 3560 }, { "epoch": 4.409400123685838, - "grad_norm": 0.453125, - "learning_rate": 0.000177902746609784, - "loss": 0.9073, + "grad_norm": 0.34765625, + "learning_rate": 0.00013706778115123646, + "loss": 0.8127, "step": 3565 }, { "epoch": 4.415584415584416, - "grad_norm": 0.53125, - "learning_rate": 0.00017781237678803847, - "loss": 0.908, + "grad_norm": 0.35546875, + "learning_rate": 0.0001368670766811665, + "loss": 0.8151, "step": 3570 }, { "epoch": 4.421768707482993, - "grad_norm": 0.42578125, - "learning_rate": 0.00017772184560546942, - "loss": 0.8984, + "grad_norm": 0.326171875, + "learning_rate": 0.00013666620019442223, + "loss": 0.8074, "step": 3575 }, { "epoch": 4.427952999381571, - "grad_norm": 0.515625, - "learning_rate": 0.00017763115324981294, - "loss": 0.9122, + "grad_norm": 0.32421875, + "learning_rate": 0.00013646515262826552, + "loss": 0.8165, "step": 3580 }, { "epoch": 4.434137291280148, - "grad_norm": 0.458984375, - "learning_rate": 0.00017754029990913926, - "loss": 0.9038, + "grad_norm": 0.3515625, + "learning_rate": 0.00013626393492075645, + "loss": 0.8115, "step": 3585 }, { "epoch": 4.440321583178726, - "grad_norm": 0.51953125, - "learning_rate": 0.00017744928577185243, - "loss": 0.9047, + "grad_norm": 0.375, + "learning_rate": 0.00013606254801074895, + "loss": 0.8113, "step": 3590 }, { "epoch": 4.446505875077303, - "grad_norm": 0.55078125, - "learning_rate": 0.00017735811102669003, - "loss": 0.8955, + "grad_norm": 0.3515625, + "learning_rate": 0.0001358609928378865, + "loss": 0.8035, "step": 3595 }, { "epoch": 4.452690166975882, - "grad_norm": 0.490234375, - "learning_rate": 0.00017726677586272263, - "loss": 0.8993, + "grad_norm": 0.3359375, + "learning_rate": 0.0001356592703425976, + "loss": 0.8097, "step": 3600 }, { "epoch": 4.458874458874459, - "grad_norm": 0.42578125, - "learning_rate": 0.0001771752804693535, - "loss": 0.9106, + "grad_norm": 0.33203125, + "learning_rate": 0.00013545738146609145, + "loss": 0.8187, "step": 3605 }, { "epoch": 4.465058750773037, - "grad_norm": 0.451171875, - "learning_rate": 0.00017708362503631814, - "loss": 0.895, + "grad_norm": 0.345703125, + "learning_rate": 0.00013525532715035366, + "loss": 0.8031, "step": 3610 }, { "epoch": 4.471243042671614, - "grad_norm": 0.423828125, - "learning_rate": 0.00017699180975368396, - "loss": 0.9059, + "grad_norm": 0.328125, + "learning_rate": 0.00013505310833814168, + "loss": 0.8138, "step": 3615 }, { "epoch": 4.477427334570192, - "grad_norm": 0.421875, - "learning_rate": 0.00017689983481184989, - "loss": 0.9031, + "grad_norm": 0.326171875, + "learning_rate": 0.00013485072597298038, + "loss": 0.8119, "step": 3620 }, { "epoch": 4.483611626468769, - "grad_norm": 0.482421875, - "learning_rate": 0.0001768077004015458, - "loss": 0.9148, + "grad_norm": 0.34375, + "learning_rate": 0.00013464818099915798, + "loss": 0.8212, "step": 3625 }, { "epoch": 4.489795918367347, - "grad_norm": 0.47265625, - "learning_rate": 0.00017671540671383243, - "loss": 0.8983, + "grad_norm": 0.3515625, + "learning_rate": 0.00013444547436172117, + "loss": 0.807, "step": 3630 }, { "epoch": 4.495980210265925, - "grad_norm": 0.451171875, - "learning_rate": 0.00017662295394010072, - "loss": 0.9008, + "grad_norm": 0.34765625, + "learning_rate": 0.00013424260700647115, + "loss": 0.809, "step": 3635 }, { "epoch": 4.5021645021645025, - "grad_norm": 0.443359375, - "learning_rate": 0.00017653034227207152, - "loss": 0.9089, + "grad_norm": 0.33984375, + "learning_rate": 0.00013403957987995882, + "loss": 0.8157, "step": 3640 }, { "epoch": 4.50834879406308, - "grad_norm": 0.45703125, - "learning_rate": 0.00017643757190179523, - "loss": 0.9176, + "grad_norm": 0.341796875, + "learning_rate": 0.00013383639392948072, + "loss": 0.8213, "step": 3645 }, { "epoch": 4.5145330859616575, - "grad_norm": 0.46875, - "learning_rate": 0.00017634464302165124, - "loss": 0.9026, + "grad_norm": 0.361328125, + "learning_rate": 0.00013363305010307425, + "loss": 0.809, "step": 3650 }, { "epoch": 4.520717377860235, - "grad_norm": 0.40234375, - "learning_rate": 0.00017625155582434777, - "loss": 0.9066, + "grad_norm": 0.34375, + "learning_rate": 0.00013342954934951365, + "loss": 0.8138, "step": 3655 }, { "epoch": 4.5269016697588125, - "grad_norm": 0.4296875, - "learning_rate": 0.0001761583105029213, - "loss": 0.9125, + "grad_norm": 0.36328125, + "learning_rate": 0.00013322589261830517, + "loss": 0.8177, "step": 3660 }, { "epoch": 4.53308596165739, - "grad_norm": 0.482421875, - "learning_rate": 0.00017606490725073615, - "loss": 0.9058, + "grad_norm": 0.34765625, + "learning_rate": 0.00013302208085968296, + "loss": 0.8145, "step": 3665 }, { "epoch": 4.5392702535559675, - "grad_norm": 0.4296875, - "learning_rate": 0.00017597134626148427, - "loss": 0.8999, + "grad_norm": 0.326171875, + "learning_rate": 0.0001328181150246045, + "loss": 0.8063, "step": 3670 }, { "epoch": 4.545454545454545, - "grad_norm": 0.625, - "learning_rate": 0.00017587762772918467, - "loss": 0.9047, + "grad_norm": 0.34765625, + "learning_rate": 0.00013261399606474605, + "loss": 0.8136, "step": 3675 }, { "epoch": 4.551638837353123, - "grad_norm": 0.52734375, - "learning_rate": 0.0001757837518481829, - "loss": 0.902, + "grad_norm": 0.384765625, + "learning_rate": 0.00013240972493249847, + "loss": 0.8117, "step": 3680 }, { "epoch": 4.557823129251701, - "grad_norm": 0.48828125, - "learning_rate": 0.00017568971881315104, - "loss": 0.9083, + "grad_norm": 0.376953125, + "learning_rate": 0.00013220530258096252, + "loss": 0.8164, "step": 3685 }, { "epoch": 4.564007421150278, - "grad_norm": 0.42578125, - "learning_rate": 0.00017559552881908695, - "loss": 0.9084, + "grad_norm": 0.34765625, + "learning_rate": 0.0001320007299639446, + "loss": 0.8153, "step": 3690 }, { "epoch": 4.570191713048856, - "grad_norm": 0.498046875, - "learning_rate": 0.00017550118206131402, - "loss": 0.905, + "grad_norm": 0.3515625, + "learning_rate": 0.00013179600803595224, + "loss": 0.8148, "step": 3695 }, { "epoch": 4.576376004947433, - "grad_norm": 0.447265625, - "learning_rate": 0.00017540667873548063, - "loss": 0.8922, + "grad_norm": 0.314453125, + "learning_rate": 0.00013159113775218964, + "loss": 0.7987, "step": 3700 }, { "epoch": 4.582560296846011, - "grad_norm": 0.41796875, - "learning_rate": 0.00017531201903755994, - "loss": 0.9123, + "grad_norm": 0.3203125, + "learning_rate": 0.00013138612006855307, + "loss": 0.8191, "step": 3705 }, { "epoch": 4.588744588744589, - "grad_norm": 0.46484375, - "learning_rate": 0.00017521720316384935, - "loss": 0.895, + "grad_norm": 0.328125, + "learning_rate": 0.0001311809559416267, + "loss": 0.8048, "step": 3710 }, { "epoch": 4.594928880643167, - "grad_norm": 0.49609375, - "learning_rate": 0.00017512223131097007, - "loss": 0.9068, + "grad_norm": 0.32421875, + "learning_rate": 0.00013097564632867794, + "loss": 0.8151, "step": 3715 }, { "epoch": 4.601113172541744, - "grad_norm": 0.419921875, - "learning_rate": 0.00017502710367586687, - "loss": 0.8987, + "grad_norm": 0.32421875, + "learning_rate": 0.00013077019218765305, + "loss": 0.8083, "step": 3720 }, { "epoch": 4.607297464440322, - "grad_norm": 0.431640625, - "learning_rate": 0.0001749318204558075, - "loss": 0.9054, + "grad_norm": 0.310546875, + "learning_rate": 0.00013056459447717252, + "loss": 0.8142, "step": 3725 }, { "epoch": 4.613481756338899, - "grad_norm": 0.455078125, - "learning_rate": 0.00017483638184838239, - "loss": 0.9075, + "grad_norm": 0.341796875, + "learning_rate": 0.00013035885415652685, + "loss": 0.8166, "step": 3730 }, { "epoch": 4.619666048237477, - "grad_norm": 0.44140625, - "learning_rate": 0.0001747407880515041, - "loss": 0.9086, + "grad_norm": 0.349609375, + "learning_rate": 0.00013015297218567186, + "loss": 0.8147, "step": 3735 }, { "epoch": 4.625850340136054, - "grad_norm": 0.44140625, - "learning_rate": 0.0001746450392634071, - "loss": 0.9125, + "grad_norm": 0.333984375, + "learning_rate": 0.00012994694952522435, + "loss": 0.819, "step": 3740 }, { "epoch": 4.632034632034632, - "grad_norm": 0.58203125, - "learning_rate": 0.0001745491356826473, - "loss": 0.9123, + "grad_norm": 0.330078125, + "learning_rate": 0.0001297407871364575, + "loss": 0.8188, "step": 3745 }, { "epoch": 4.638218923933209, - "grad_norm": 0.54296875, - "learning_rate": 0.0001744530775081015, - "loss": 0.9056, + "grad_norm": 0.349609375, + "learning_rate": 0.00012953448598129643, + "loss": 0.8137, "step": 3750 }, { "epoch": 4.644403215831788, - "grad_norm": 0.59375, - "learning_rate": 0.0001743568649389672, - "loss": 0.9066, + "grad_norm": 0.322265625, + "learning_rate": 0.0001293280470223138, + "loss": 0.8157, "step": 3755 }, { "epoch": 4.650587507730365, - "grad_norm": 0.63671875, - "learning_rate": 0.00017426049817476197, - "loss": 0.9052, + "grad_norm": 0.3125, + "learning_rate": 0.00012912147122272523, + "loss": 0.8122, "step": 3760 }, { "epoch": 4.656771799628943, - "grad_norm": 0.546875, - "learning_rate": 0.00017416397741532315, - "loss": 0.9075, + "grad_norm": 0.384765625, + "learning_rate": 0.00012891475954638474, + "loss": 0.8164, "step": 3765 }, { "epoch": 4.66295609152752, - "grad_norm": 0.5390625, - "learning_rate": 0.00017406730286080753, - "loss": 0.8945, + "grad_norm": 0.416015625, + "learning_rate": 0.0001287079129577804, + "loss": 0.8045, "step": 3770 }, { "epoch": 4.669140383426098, - "grad_norm": 0.515625, - "learning_rate": 0.00017397047471169063, - "loss": 0.9086, + "grad_norm": 0.345703125, + "learning_rate": 0.00012850093242202978, + "loss": 0.814, "step": 3775 }, { "epoch": 4.675324675324675, - "grad_norm": 0.53515625, - "learning_rate": 0.00017387349316876666, - "loss": 0.9103, + "grad_norm": 0.35546875, + "learning_rate": 0.00012829381890487536, + "loss": 0.8187, "step": 3780 }, { "epoch": 4.681508967223253, - "grad_norm": 0.484375, - "learning_rate": 0.0001737763584331479, - "loss": 0.9139, + "grad_norm": 0.33984375, + "learning_rate": 0.00012808657337268014, + "loss": 0.8224, "step": 3785 }, { "epoch": 4.687693259121831, - "grad_norm": 0.439453125, - "learning_rate": 0.00017367907070626424, - "loss": 0.8962, + "grad_norm": 0.328125, + "learning_rate": 0.00012787919679242306, + "loss": 0.8082, "step": 3790 }, { "epoch": 4.6938775510204085, - "grad_norm": 0.41015625, - "learning_rate": 0.00017358163018986282, - "loss": 0.9195, + "grad_norm": 0.318359375, + "learning_rate": 0.00012767169013169457, + "loss": 0.8258, "step": 3795 }, { "epoch": 4.700061842918986, - "grad_norm": 0.490234375, - "learning_rate": 0.00017348403708600772, - "loss": 0.9006, + "grad_norm": 0.349609375, + "learning_rate": 0.00012746405435869198, + "loss": 0.8099, "step": 3800 }, { "epoch": 4.7062461348175635, - "grad_norm": 0.47265625, - "learning_rate": 0.00017338629159707936, - "loss": 0.9051, + "grad_norm": 0.341796875, + "learning_rate": 0.00012725629044221505, + "loss": 0.8123, "step": 3805 }, { "epoch": 4.712430426716141, - "grad_norm": 0.458984375, - "learning_rate": 0.0001732883939257742, - "loss": 0.9039, + "grad_norm": 0.318359375, + "learning_rate": 0.00012704839935166143, + "loss": 0.8137, "step": 3810 }, { "epoch": 4.7186147186147185, - "grad_norm": 0.439453125, - "learning_rate": 0.0001731903442751043, - "loss": 0.8953, + "grad_norm": 0.33984375, + "learning_rate": 0.00012684038205702222, + "loss": 0.805, "step": 3815 }, { "epoch": 4.724799010513296, - "grad_norm": 0.4140625, - "learning_rate": 0.00017309214284839678, - "loss": 0.9031, + "grad_norm": 0.3125, + "learning_rate": 0.00012663223952887723, + "loss": 0.8148, "step": 3820 }, { "epoch": 4.7309833024118735, - "grad_norm": 0.431640625, - "learning_rate": 0.00017299378984929366, - "loss": 0.8986, + "grad_norm": 0.3046875, + "learning_rate": 0.00012642397273839075, + "loss": 0.8067, "step": 3825 }, { "epoch": 4.737167594310451, - "grad_norm": 0.5078125, - "learning_rate": 0.00017289528548175114, - "loss": 0.8992, + "grad_norm": 0.330078125, + "learning_rate": 0.0001262155826573067, + "loss": 0.8045, "step": 3830 }, { "epoch": 4.743351886209029, - "grad_norm": 0.412109375, - "learning_rate": 0.0001727966299500394, - "loss": 0.9061, + "grad_norm": 0.3125, + "learning_rate": 0.00012600707025794443, + "loss": 0.813, "step": 3835 }, { "epoch": 4.749536178107607, - "grad_norm": 0.474609375, - "learning_rate": 0.00017269782345874203, - "loss": 0.8924, + "grad_norm": 0.3515625, + "learning_rate": 0.0001257984365131938, + "loss": 0.8037, "step": 3840 }, { "epoch": 4.755720470006184, - "grad_norm": 0.447265625, - "learning_rate": 0.00017259886621275573, - "loss": 0.8993, + "grad_norm": 0.34765625, + "learning_rate": 0.0001255896823965111, + "loss": 0.8107, "step": 3845 }, { "epoch": 4.761904761904762, - "grad_norm": 0.54296875, - "learning_rate": 0.0001724997584172898, - "loss": 0.8885, + "grad_norm": 0.33984375, + "learning_rate": 0.00012538080888191408, + "loss": 0.8004, "step": 3850 }, { "epoch": 4.768089053803339, - "grad_norm": 0.498046875, - "learning_rate": 0.0001724005002778657, - "loss": 0.906, + "grad_norm": 0.326171875, + "learning_rate": 0.00012517181694397762, + "loss": 0.8151, "step": 3855 }, { "epoch": 4.774273345701917, - "grad_norm": 0.5078125, - "learning_rate": 0.00017230109200031668, - "loss": 0.9108, + "grad_norm": 0.35546875, + "learning_rate": 0.00012496270755782914, + "loss": 0.819, "step": 3860 }, { "epoch": 4.780457637600494, - "grad_norm": 0.46875, - "learning_rate": 0.00017220153379078737, - "loss": 0.9037, + "grad_norm": 0.369140625, + "learning_rate": 0.0001247534816991441, + "loss": 0.8118, "step": 3865 }, { "epoch": 4.786641929499073, - "grad_norm": 0.48046875, - "learning_rate": 0.00017210182585573327, - "loss": 0.904, + "grad_norm": 0.384765625, + "learning_rate": 0.00012454414034414142, + "loss": 0.8119, "step": 3870 }, { "epoch": 4.79282622139765, - "grad_norm": 0.48828125, - "learning_rate": 0.00017200196840192042, - "loss": 0.9066, + "grad_norm": 0.3671875, + "learning_rate": 0.00012433468446957887, + "loss": 0.8172, "step": 3875 }, { "epoch": 4.799010513296228, - "grad_norm": 0.48828125, - "learning_rate": 0.00017190196163642483, - "loss": 0.8982, + "grad_norm": 0.37109375, + "learning_rate": 0.00012412511505274844, + "loss": 0.8091, "step": 3880 }, { "epoch": 4.805194805194805, - "grad_norm": 0.443359375, - "learning_rate": 0.00017180180576663228, - "loss": 0.9065, + "grad_norm": 0.34765625, + "learning_rate": 0.00012391543307147212, + "loss": 0.8139, "step": 3885 }, { "epoch": 4.811379097093383, - "grad_norm": 0.451171875, - "learning_rate": 0.0001717015010002376, - "loss": 0.8865, + "grad_norm": 0.3515625, + "learning_rate": 0.00012370563950409703, + "loss": 0.7987, "step": 3890 }, { "epoch": 4.81756338899196, - "grad_norm": 0.41796875, - "learning_rate": 0.00017160104754524445, - "loss": 0.9034, + "grad_norm": 0.32421875, + "learning_rate": 0.0001234957353294908, + "loss": 0.8144, "step": 3895 }, { "epoch": 4.823747680890538, - "grad_norm": 0.408203125, - "learning_rate": 0.00017150044560996488, - "loss": 0.8984, + "grad_norm": 0.3359375, + "learning_rate": 0.00012328572152703725, + "loss": 0.8085, "step": 3900 }, { "epoch": 4.829931972789115, - "grad_norm": 0.470703125, - "learning_rate": 0.00017139969540301878, - "loss": 0.9073, + "grad_norm": 0.361328125, + "learning_rate": 0.00012307559907663175, + "loss": 0.8178, "step": 3905 }, { "epoch": 4.836116264687694, - "grad_norm": 0.53515625, - "learning_rate": 0.00017129879713333356, - "loss": 0.9078, + "grad_norm": 0.333984375, + "learning_rate": 0.00012286536895867654, + "loss": 0.8146, "step": 3910 }, { "epoch": 4.842300556586271, - "grad_norm": 0.46484375, - "learning_rate": 0.00017119775101014358, - "loss": 0.9127, + "grad_norm": 0.33203125, + "learning_rate": 0.00012265503215407627, + "loss": 0.8224, "step": 3915 }, { "epoch": 4.848484848484849, - "grad_norm": 0.4765625, - "learning_rate": 0.00017109655724298995, - "loss": 0.9068, + "grad_norm": 0.36328125, + "learning_rate": 0.00012244458964423327, + "loss": 0.8146, "step": 3920 }, { "epoch": 4.854669140383426, - "grad_norm": 0.515625, - "learning_rate": 0.00017099521604171982, - "loss": 0.8936, + "grad_norm": 0.33203125, + "learning_rate": 0.00012223404241104317, + "loss": 0.8034, "step": 3925 }, { "epoch": 4.860853432282004, - "grad_norm": 0.45703125, - "learning_rate": 0.00017089372761648616, - "loss": 0.9042, + "grad_norm": 0.345703125, + "learning_rate": 0.00012202339143689023, + "loss": 0.8128, "step": 3930 }, { "epoch": 4.867037724180581, - "grad_norm": 0.462890625, - "learning_rate": 0.0001707920921777472, - "loss": 0.9065, + "grad_norm": 0.388671875, + "learning_rate": 0.00012181263770464273, + "loss": 0.8148, "step": 3935 }, { "epoch": 4.873222016079159, - "grad_norm": 0.48828125, - "learning_rate": 0.00017069030993626603, - "loss": 0.8962, + "grad_norm": 0.37109375, + "learning_rate": 0.00012160178219764837, + "loss": 0.806, "step": 3940 }, { "epoch": 4.879406307977737, - "grad_norm": 0.4921875, - "learning_rate": 0.00017058838110311017, - "loss": 0.9011, + "grad_norm": 0.373046875, + "learning_rate": 0.00012139082589972972, + "loss": 0.8123, "step": 3945 }, { "epoch": 4.8855905998763145, - "grad_norm": 0.44140625, - "learning_rate": 0.00017048630588965117, - "loss": 0.8989, + "grad_norm": 0.328125, + "learning_rate": 0.00012117976979517973, + "loss": 0.8089, "step": 3950 }, { "epoch": 4.891774891774892, - "grad_norm": 0.51953125, - "learning_rate": 0.0001703840845075641, - "loss": 0.9101, + "grad_norm": 0.341796875, + "learning_rate": 0.00012096861486875693, + "loss": 0.8188, "step": 3955 }, { "epoch": 4.8979591836734695, - "grad_norm": 0.5078125, - "learning_rate": 0.00017028171716882714, - "loss": 0.8964, + "grad_norm": 0.357421875, + "learning_rate": 0.0001207573621056809, + "loss": 0.8057, "step": 3960 }, { "epoch": 4.904143475572047, - "grad_norm": 0.490234375, - "learning_rate": 0.00017017920408572115, - "loss": 0.9077, + "grad_norm": 0.37890625, + "learning_rate": 0.00012054601249162783, + "loss": 0.8163, "step": 3965 }, { "epoch": 4.9103277674706245, - "grad_norm": 0.58203125, - "learning_rate": 0.00017007654547082922, - "loss": 0.9015, + "grad_norm": 0.330078125, + "learning_rate": 0.00012033456701272576, + "loss": 0.8103, "step": 3970 }, { "epoch": 4.916512059369202, - "grad_norm": 0.458984375, - "learning_rate": 0.00016997374153703625, - "loss": 0.8991, + "grad_norm": 0.34375, + "learning_rate": 0.00012012302665555002, + "loss": 0.8099, "step": 3975 }, { "epoch": 4.9226963512677795, - "grad_norm": 0.416015625, - "learning_rate": 0.00016987079249752843, - "loss": 0.9045, + "grad_norm": 0.376953125, + "learning_rate": 0.00011991139240711857, + "loss": 0.814, "step": 3980 }, { "epoch": 4.928880643166357, - "grad_norm": 0.423828125, - "learning_rate": 0.0001697676985657929, - "loss": 0.9004, + "grad_norm": 0.345703125, + "learning_rate": 0.00011969966525488753, + "loss": 0.8105, "step": 3985 }, { "epoch": 4.935064935064935, - "grad_norm": 0.47265625, - "learning_rate": 0.00016966445995561727, - "loss": 0.8999, + "grad_norm": 0.328125, + "learning_rate": 0.00011948784618674653, + "loss": 0.8127, "step": 3990 }, { "epoch": 4.941249226963513, - "grad_norm": 0.44921875, - "learning_rate": 0.00016956107688108923, - "loss": 0.9044, + "grad_norm": 0.34375, + "learning_rate": 0.00011927593619101391, + "loss": 0.8136, "step": 3995 }, { "epoch": 4.94743351886209, - "grad_norm": 0.443359375, - "learning_rate": 0.00016945754955659595, - "loss": 0.9037, + "grad_norm": 0.353515625, + "learning_rate": 0.00011906393625643244, + "loss": 0.811, "step": 4000 }, { "epoch": 4.953617810760668, - "grad_norm": 0.4921875, - "learning_rate": 0.00016935387819682376, - "loss": 0.9038, + "grad_norm": 0.34765625, + "learning_rate": 0.0001188518473721644, + "loss": 0.8143, "step": 4005 }, { "epoch": 4.959802102659245, - "grad_norm": 0.435546875, - "learning_rate": 0.00016925006301675763, - "loss": 0.8995, + "grad_norm": 0.310546875, + "learning_rate": 0.00011863967052778721, + "loss": 0.8092, "step": 4010 }, { "epoch": 4.965986394557823, - "grad_norm": 0.439453125, - "learning_rate": 0.00016914610423168094, - "loss": 0.9127, + "grad_norm": 0.32421875, + "learning_rate": 0.0001184274067132886, + "loss": 0.8225, "step": 4015 }, { "epoch": 4.9721706864564, - "grad_norm": 0.44140625, - "learning_rate": 0.0001690420020571747, - "loss": 0.8946, + "grad_norm": 0.3359375, + "learning_rate": 0.00011821505691906216, + "loss": 0.8057, "step": 4020 }, { "epoch": 4.978354978354979, - "grad_norm": 0.5390625, - "learning_rate": 0.00016893775670911732, - "loss": 0.9108, + "grad_norm": 0.322265625, + "learning_rate": 0.00011800262213590261, + "loss": 0.8183, "step": 4025 }, { "epoch": 4.984539270253556, - "grad_norm": 0.412109375, - "learning_rate": 0.00016883336840368412, - "loss": 0.9083, + "grad_norm": 0.35546875, + "learning_rate": 0.0001177901033550012, + "loss": 0.8163, "step": 4030 }, { "epoch": 4.990723562152134, - "grad_norm": 0.45703125, - "learning_rate": 0.0001687288373573469, - "loss": 0.9043, + "grad_norm": 0.353515625, + "learning_rate": 0.00011757750156794118, + "loss": 0.8127, "step": 4035 }, { "epoch": 4.996907854050711, - "grad_norm": 0.427734375, - "learning_rate": 0.0001686241637868734, - "loss": 0.9099, + "grad_norm": 0.349609375, + "learning_rate": 0.00011736481776669306, + "loss": 0.8173, "step": 4040 }, { "epoch": 4.999381570810142, - "eval_loss": 2.483851194381714, - "eval_runtime": 0.6403, - "eval_samples_per_second": 15.619, - "eval_steps_per_second": 1.562, + "eval_loss": 2.6021482944488525, + "eval_runtime": 0.8279, + "eval_samples_per_second": 12.079, + "eval_steps_per_second": 1.208, "step": 4042 }, { "epoch": 5.003092145949289, - "grad_norm": 0.54296875, - "learning_rate": 0.00016851934790932692, - "loss": 0.903, + "grad_norm": 0.318359375, + "learning_rate": 0.00011715205294360994, + "loss": 0.8044, "step": 4045 }, { "epoch": 5.009276437847866, - "grad_norm": 0.4296875, - "learning_rate": 0.00016841438994206595, - "loss": 0.8894, + "grad_norm": 0.328125, + "learning_rate": 0.00011693920809142305, + "loss": 0.7861, "step": 4050 }, { "epoch": 5.015460729746444, - "grad_norm": 0.515625, - "learning_rate": 0.0001683092901027436, - "loss": 0.9058, + "grad_norm": 0.330078125, + "learning_rate": 0.00011672628420323699, + "loss": 0.7971, "step": 4055 }, { "epoch": 5.021645021645021, - "grad_norm": 0.46875, - "learning_rate": 0.0001682040486093071, - "loss": 0.8982, + "grad_norm": 0.34375, + "learning_rate": 0.00011651328227252517, + "loss": 0.7921, "step": 4060 }, { "epoch": 5.0278293135436, - "grad_norm": 0.486328125, - "learning_rate": 0.0001680986656799975, - "loss": 0.8948, + "grad_norm": 0.369140625, + "learning_rate": 0.00011630020329312507, + "loss": 0.7904, "step": 4065 }, { "epoch": 5.034013605442177, - "grad_norm": 0.5234375, - "learning_rate": 0.00016799314153334916, - "loss": 0.8902, + "grad_norm": 0.3515625, + "learning_rate": 0.00011608704825923369, + "loss": 0.7837, "step": 4070 }, { "epoch": 5.040197897340755, - "grad_norm": 0.41796875, - "learning_rate": 0.00016788747638818926, - "loss": 0.8845, + "grad_norm": 0.326171875, + "learning_rate": 0.00011587381816540292, + "loss": 0.7822, "step": 4075 }, { "epoch": 5.046382189239332, - "grad_norm": 0.45703125, - "learning_rate": 0.00016778167046363734, - "loss": 0.8875, + "grad_norm": 0.32421875, + "learning_rate": 0.00011566051400653486, + "loss": 0.7833, "step": 4080 }, { "epoch": 5.05256648113791, - "grad_norm": 0.44140625, - "learning_rate": 0.0001676757239791049, - "loss": 0.8976, + "grad_norm": 0.33203125, + "learning_rate": 0.00011544713677787715, + "loss": 0.7926, "step": 4085 }, { "epoch": 5.058750773036487, - "grad_norm": 0.50390625, - "learning_rate": 0.00016756963715429502, - "loss": 0.887, + "grad_norm": 0.33203125, + "learning_rate": 0.00011523368747501839, + "loss": 0.784, "step": 4090 }, { "epoch": 5.064935064935065, - "grad_norm": 0.421875, - "learning_rate": 0.00016746341020920167, - "loss": 0.8852, + "grad_norm": 0.3671875, + "learning_rate": 0.00011502016709388348, + "loss": 0.781, "step": 4095 }, { "epoch": 5.071119356833642, - "grad_norm": 0.3984375, - "learning_rate": 0.00016735704336410943, - "loss": 0.8966, + "grad_norm": 0.357421875, + "learning_rate": 0.00011480657663072896, + "loss": 0.7901, "step": 4100 }, { "epoch": 5.0773036487322205, - "grad_norm": 0.466796875, - "learning_rate": 0.000167250536839593, - "loss": 0.9055, + "grad_norm": 0.392578125, + "learning_rate": 0.00011459291708213836, + "loss": 0.8018, "step": 4105 }, { "epoch": 5.083487940630798, - "grad_norm": 0.5, - "learning_rate": 0.0001671438908565167, - "loss": 0.8904, + "grad_norm": 0.330078125, + "learning_rate": 0.00011437918944501749, + "loss": 0.7841, "step": 4110 }, { "epoch": 5.0896722325293755, - "grad_norm": 0.482421875, - "learning_rate": 0.00016703710563603416, - "loss": 0.8829, + "grad_norm": 0.34765625, + "learning_rate": 0.00011416539471658994, + "loss": 0.7773, "step": 4115 }, { "epoch": 5.095856524427953, - "grad_norm": 0.451171875, - "learning_rate": 0.00016693018139958763, - "loss": 0.8835, + "grad_norm": 0.33984375, + "learning_rate": 0.00011395153389439233, + "loss": 0.7806, "step": 4120 }, { "epoch": 5.1020408163265305, - "grad_norm": 0.466796875, - "learning_rate": 0.00016682311836890766, - "loss": 0.8927, + "grad_norm": 0.361328125, + "learning_rate": 0.0001137376079762696, + "loss": 0.7873, "step": 4125 }, { "epoch": 5.108225108225108, - "grad_norm": 0.458984375, - "learning_rate": 0.00016671591676601272, - "loss": 0.8803, + "grad_norm": 0.32421875, + "learning_rate": 0.00011352361796037047, + "loss": 0.7771, "step": 4130 }, { "epoch": 5.1144094001236855, - "grad_norm": 0.51953125, - "learning_rate": 0.0001666085768132085, - "loss": 0.8923, + "grad_norm": 0.333984375, + "learning_rate": 0.00011330956484514274, + "loss": 0.7894, "step": 4135 }, { "epoch": 5.120593692022264, - "grad_norm": 0.55078125, - "learning_rate": 0.00016650109873308765, - "loss": 0.8963, + "grad_norm": 0.33984375, + "learning_rate": 0.00011309544962932862, + "loss": 0.7922, "step": 4140 }, { "epoch": 5.126777983920841, - "grad_norm": 0.458984375, - "learning_rate": 0.00016639348274852925, - "loss": 0.8986, + "grad_norm": 0.33203125, + "learning_rate": 0.00011288127331195998, + "loss": 0.7946, "step": 4145 }, { "epoch": 5.132962275819419, - "grad_norm": 0.53515625, - "learning_rate": 0.00016628572908269841, - "loss": 0.8841, + "grad_norm": 0.32421875, + "learning_rate": 0.00011266703689235394, + "loss": 0.7818, "step": 4150 }, { "epoch": 5.139146567717996, - "grad_norm": 0.5390625, - "learning_rate": 0.00016617783795904565, - "loss": 0.8892, + "grad_norm": 0.326171875, + "learning_rate": 0.00011245274137010791, + "loss": 0.7851, "step": 4155 }, { "epoch": 5.145330859616574, - "grad_norm": 0.578125, - "learning_rate": 0.00016606980960130665, - "loss": 0.892, + "grad_norm": 0.353515625, + "learning_rate": 0.00011223838774509514, + "loss": 0.7875, "step": 4160 }, { "epoch": 5.151515151515151, - "grad_norm": 0.55078125, - "learning_rate": 0.00016596164423350157, - "loss": 0.8884, + "grad_norm": 0.33984375, + "learning_rate": 0.00011202397701745994, + "loss": 0.7865, "step": 4165 }, { "epoch": 5.157699443413729, - "grad_norm": 0.5078125, - "learning_rate": 0.00016585334207993476, - "loss": 0.9061, + "grad_norm": 0.326171875, + "learning_rate": 0.00011180951018761314, + "loss": 0.8001, "step": 4170 }, { "epoch": 5.163883735312306, - "grad_norm": 0.51171875, - "learning_rate": 0.00016574490336519418, - "loss": 0.8863, + "grad_norm": 0.365234375, + "learning_rate": 0.00011159498825622718, + "loss": 0.7827, "step": 4175 }, { "epoch": 5.170068027210885, - "grad_norm": 0.51953125, - "learning_rate": 0.00016563632831415102, - "loss": 0.8963, + "grad_norm": 0.3671875, + "learning_rate": 0.00011138041222423177, + "loss": 0.7928, "step": 4180 }, { "epoch": 5.176252319109462, - "grad_norm": 0.48828125, - "learning_rate": 0.00016552761715195918, - "loss": 0.8934, + "grad_norm": 0.33984375, + "learning_rate": 0.00011116578309280887, + "loss": 0.7891, "step": 4185 }, { "epoch": 5.18243661100804, - "grad_norm": 0.66015625, - "learning_rate": 0.00016541877010405477, - "loss": 0.9016, + "grad_norm": 0.3359375, + "learning_rate": 0.00011095110186338835, + "loss": 0.795, "step": 4190 }, { "epoch": 5.188620902906617, - "grad_norm": 0.53515625, - "learning_rate": 0.00016530978739615578, - "loss": 0.8868, + "grad_norm": 0.359375, + "learning_rate": 0.00011073636953764306, + "loss": 0.7843, "step": 4195 }, { "epoch": 5.194805194805195, - "grad_norm": 0.498046875, - "learning_rate": 0.00016520066925426144, - "loss": 0.8907, + "grad_norm": 0.3359375, + "learning_rate": 0.00011052158711748434, + "loss": 0.7896, "step": 4200 }, { "epoch": 5.200989486703772, - "grad_norm": 0.5, - "learning_rate": 0.00016509141590465189, - "loss": 0.8899, + "grad_norm": 0.376953125, + "learning_rate": 0.00011030675560505717, + "loss": 0.785, "step": 4205 }, { "epoch": 5.20717377860235, - "grad_norm": 0.4375, - "learning_rate": 0.00016498202757388758, - "loss": 0.89, + "grad_norm": 0.337890625, + "learning_rate": 0.00011009187600273566, + "loss": 0.7892, "step": 4210 }, { "epoch": 5.213358070500927, - "grad_norm": 0.470703125, - "learning_rate": 0.00016487250448880893, - "loss": 0.8961, + "grad_norm": 0.3359375, + "learning_rate": 0.00010987694931311827, + "loss": 0.7908, "step": 4215 }, { "epoch": 5.219542362399506, - "grad_norm": 0.466796875, - "learning_rate": 0.0001647628468765358, - "loss": 0.8962, + "grad_norm": 0.345703125, + "learning_rate": 0.0001096619765390232, + "loss": 0.7925, "step": 4220 }, { "epoch": 5.225726654298083, - "grad_norm": 0.4453125, - "learning_rate": 0.00016465305496446702, - "loss": 0.8974, + "grad_norm": 0.369140625, + "learning_rate": 0.00010944695868348359, + "loss": 0.7926, "step": 4225 }, { "epoch": 5.231910946196661, - "grad_norm": 0.42578125, - "learning_rate": 0.0001645431289802799, - "loss": 0.8987, + "grad_norm": 0.3828125, + "learning_rate": 0.00010923189674974301, + "loss": 0.7941, "step": 4230 }, { "epoch": 5.238095238095238, - "grad_norm": 0.458984375, - "learning_rate": 0.00016443306915192975, - "loss": 0.8891, + "grad_norm": 0.357421875, + "learning_rate": 0.00010901679174125066, + "loss": 0.7873, "step": 4235 }, { "epoch": 5.244279529993816, - "grad_norm": 0.470703125, - "learning_rate": 0.00016432287570764952, - "loss": 0.8979, + "grad_norm": 0.353515625, + "learning_rate": 0.00010880164466165674, + "loss": 0.7951, "step": 4240 }, { "epoch": 5.250463821892393, - "grad_norm": 0.490234375, - "learning_rate": 0.00016421254887594917, - "loss": 0.8971, + "grad_norm": 0.3515625, + "learning_rate": 0.00010858645651480768, + "loss": 0.7936, "step": 4245 }, { "epoch": 5.256648113790971, - "grad_norm": 0.48046875, - "learning_rate": 0.0001641020888856153, - "loss": 0.8943, + "grad_norm": 0.345703125, + "learning_rate": 0.00010837122830474158, + "loss": 0.7933, "step": 4250 }, { "epoch": 5.262832405689548, - "grad_norm": 0.439453125, - "learning_rate": 0.00016399149596571064, - "loss": 0.901, + "grad_norm": 0.6015625, + "learning_rate": 0.00010815596103568353, + "loss": 0.7982, "step": 4255 }, { "epoch": 5.2690166975881265, - "grad_norm": 0.458984375, - "learning_rate": 0.00016388077034557355, - "loss": 0.874, + "grad_norm": 0.34375, + "learning_rate": 0.00010794065571204072, + "loss": 0.7757, "step": 4260 }, { "epoch": 5.275200989486704, - "grad_norm": 0.51953125, - "learning_rate": 0.0001637699122548176, - "loss": 0.894, + "grad_norm": 0.34375, + "learning_rate": 0.00010772531333839801, + "loss": 0.7929, "step": 4265 }, { "epoch": 5.2813852813852815, - "grad_norm": 0.41796875, - "learning_rate": 0.0001636589219233311, - "loss": 0.8834, + "grad_norm": 0.375, + "learning_rate": 0.0001075099349195131, + "loss": 0.7827, "step": 4270 }, { "epoch": 5.287569573283859, - "grad_norm": 0.435546875, - "learning_rate": 0.0001635477995812765, - "loss": 0.9, + "grad_norm": 0.408203125, + "learning_rate": 0.00010729452146031183, + "loss": 0.794, "step": 4275 }, { "epoch": 5.2937538651824365, - "grad_norm": 0.44140625, - "learning_rate": 0.00016343654545909007, - "loss": 0.897, + "grad_norm": 0.369140625, + "learning_rate": 0.00010707907396588361, + "loss": 0.7956, "step": 4280 }, { "epoch": 5.299938157081014, - "grad_norm": 0.546875, - "learning_rate": 0.00016332515978748134, - "loss": 0.8738, + "grad_norm": 0.330078125, + "learning_rate": 0.00010686359344147658, + "loss": 0.7739, "step": 4285 }, { "epoch": 5.3061224489795915, - "grad_norm": 0.48828125, - "learning_rate": 0.00016321364279743266, - "loss": 0.8839, + "grad_norm": 0.33203125, + "learning_rate": 0.00010664808089249305, + "loss": 0.7824, "step": 4290 }, { "epoch": 5.312306740878169, - "grad_norm": 0.546875, - "learning_rate": 0.00016310199472019865, - "loss": 0.8876, + "grad_norm": 0.369140625, + "learning_rate": 0.00010643253732448475, + "loss": 0.7853, "step": 4295 }, { "epoch": 5.318491032776747, - "grad_norm": 0.427734375, - "learning_rate": 0.00016299021578730579, - "loss": 0.8807, + "grad_norm": 0.3359375, + "learning_rate": 0.00010621696374314807, + "loss": 0.7836, "step": 4300 }, { "epoch": 5.324675324675325, - "grad_norm": 0.484375, - "learning_rate": 0.00016287830623055188, - "loss": 0.8866, + "grad_norm": 0.373046875, + "learning_rate": 0.0001060013611543195, + "loss": 0.7848, "step": 4305 }, { "epoch": 5.330859616573902, - "grad_norm": 0.47265625, - "learning_rate": 0.00016276626628200568, - "loss": 0.8949, + "grad_norm": 0.3828125, + "learning_rate": 0.00010578573056397085, + "loss": 0.7916, "step": 4310 }, { "epoch": 5.33704390847248, - "grad_norm": 0.486328125, - "learning_rate": 0.00016265409617400632, - "loss": 0.8963, + "grad_norm": 0.341796875, + "learning_rate": 0.00010557007297820468, + "loss": 0.7931, "step": 4315 }, { "epoch": 5.343228200371057, - "grad_norm": 0.41796875, - "learning_rate": 0.00016254179613916278, - "loss": 0.8974, + "grad_norm": 0.333984375, + "learning_rate": 0.0001053543894032493, + "loss": 0.7895, "step": 4320 }, { "epoch": 5.349412492269635, - "grad_norm": 0.46875, - "learning_rate": 0.00016242936641035357, - "loss": 0.8935, + "grad_norm": 0.34375, + "learning_rate": 0.00010513868084545446, + "loss": 0.7927, "step": 4325 }, { "epoch": 5.355596784168212, - "grad_norm": 0.431640625, - "learning_rate": 0.000162316807220726, - "loss": 0.8888, + "grad_norm": 0.318359375, + "learning_rate": 0.00010492294831128641, + "loss": 0.7862, "step": 4330 }, { "epoch": 5.361781076066791, - "grad_norm": 0.46875, - "learning_rate": 0.00016220411880369601, - "loss": 0.8833, + "grad_norm": 0.3359375, + "learning_rate": 0.00010470719280732333, + "loss": 0.7832, "step": 4335 }, { "epoch": 5.367965367965368, - "grad_norm": 0.49609375, - "learning_rate": 0.00016209130139294744, - "loss": 0.8855, + "grad_norm": 0.3671875, + "learning_rate": 0.00010449141534025045, + "loss": 0.786, "step": 4340 }, { "epoch": 5.374149659863946, - "grad_norm": 0.443359375, - "learning_rate": 0.00016197835522243162, - "loss": 0.8867, + "grad_norm": 0.328125, + "learning_rate": 0.00010427561691685557, + "loss": 0.7869, "step": 4345 }, { "epoch": 5.380333951762523, - "grad_norm": 0.5859375, - "learning_rate": 0.00016186528052636692, - "loss": 0.8878, + "grad_norm": 0.337890625, + "learning_rate": 0.00010405979854402425, + "loss": 0.7865, "step": 4350 }, { "epoch": 5.386518243661101, - "grad_norm": 0.455078125, - "learning_rate": 0.00016175207753923822, - "loss": 0.8938, + "grad_norm": 0.349609375, + "learning_rate": 0.00010384396122873515, + "loss": 0.7944, "step": 4355 }, { "epoch": 5.392702535559678, - "grad_norm": 0.38671875, - "learning_rate": 0.00016163874649579647, - "loss": 0.8857, + "grad_norm": 0.326171875, + "learning_rate": 0.00010362810597805526, + "loss": 0.7856, "step": 4360 }, { "epoch": 5.398886827458256, - "grad_norm": 0.482421875, - "learning_rate": 0.0001615252876310581, - "loss": 0.889, + "grad_norm": 0.3671875, + "learning_rate": 0.00010341223379913534, + "loss": 0.7869, "step": 4365 }, { "epoch": 5.405071119356833, - "grad_norm": 0.5625, - "learning_rate": 0.00016141170118030463, - "loss": 0.8858, + "grad_norm": 0.38671875, + "learning_rate": 0.00010319634569920504, + "loss": 0.7841, "step": 4370 }, { "epoch": 5.411255411255412, - "grad_norm": 0.470703125, - "learning_rate": 0.00016129798737908225, - "loss": 0.8901, + "grad_norm": 0.328125, + "learning_rate": 0.00010298044268556832, + "loss": 0.7895, "step": 4375 }, { "epoch": 5.417439703153989, - "grad_norm": 0.48046875, - "learning_rate": 0.0001611841464632011, - "loss": 0.8935, + "grad_norm": 0.359375, + "learning_rate": 0.00010276452576559879, + "loss": 0.7905, "step": 4380 }, { "epoch": 5.423623995052567, - "grad_norm": 0.462890625, - "learning_rate": 0.00016107017866873505, - "loss": 0.8871, + "grad_norm": 0.3359375, + "learning_rate": 0.0001025485959467349, + "loss": 0.7868, "step": 4385 }, { "epoch": 5.429808286951144, - "grad_norm": 0.478515625, - "learning_rate": 0.00016095608423202098, - "loss": 0.8979, + "grad_norm": 0.34375, + "learning_rate": 0.00010233265423647523, + "loss": 0.7945, "step": 4390 }, { "epoch": 5.435992578849722, - "grad_norm": 0.4921875, - "learning_rate": 0.00016084186338965843, - "loss": 0.8893, + "grad_norm": 0.357421875, + "learning_rate": 0.00010211670164237392, + "loss": 0.7897, "step": 4395 }, { "epoch": 5.442176870748299, - "grad_norm": 0.5234375, - "learning_rate": 0.00016072751637850904, - "loss": 0.8928, + "grad_norm": 0.34375, + "learning_rate": 0.00010190073917203589, + "loss": 0.7897, "step": 4400 }, { "epoch": 5.448361162646877, - "grad_norm": 0.474609375, - "learning_rate": 0.00016061304343569614, - "loss": 0.8916, + "grad_norm": 0.333984375, + "learning_rate": 0.0001016847678331121, + "loss": 0.7912, "step": 4405 }, { "epoch": 5.454545454545454, - "grad_norm": 0.45703125, - "learning_rate": 0.00016049844479860422, - "loss": 0.8866, + "grad_norm": 0.345703125, + "learning_rate": 0.00010146878863329492, + "loss": 0.7858, "step": 4410 }, { "epoch": 5.4607297464440325, - "grad_norm": 0.419921875, - "learning_rate": 0.00016038372070487832, - "loss": 0.9007, + "grad_norm": 0.32421875, + "learning_rate": 0.00010125280258031335, + "loss": 0.7962, "step": 4415 }, { "epoch": 5.46691403834261, - "grad_norm": 0.54296875, - "learning_rate": 0.00016026887139242372, - "loss": 0.8922, + "grad_norm": 0.35546875, + "learning_rate": 0.00010103681068192845, + "loss": 0.7914, "step": 4420 }, { "epoch": 5.4730983302411875, - "grad_norm": 0.46875, - "learning_rate": 0.00016015389709940538, - "loss": 0.8912, + "grad_norm": 0.384765625, + "learning_rate": 0.00010082081394592851, + "loss": 0.7891, "step": 4425 }, { "epoch": 5.479282622139765, - "grad_norm": 0.482421875, - "learning_rate": 0.0001600387980642474, - "loss": 0.8891, + "grad_norm": 0.3515625, + "learning_rate": 0.00010060481338012435, + "loss": 0.7881, "step": 4430 }, { "epoch": 5.4854669140383425, - "grad_norm": 0.51171875, - "learning_rate": 0.00015992357452563255, - "loss": 0.8973, + "grad_norm": 0.341796875, + "learning_rate": 0.0001003888099923447, + "loss": 0.796, "step": 4435 }, { "epoch": 5.49165120593692, - "grad_norm": 0.54296875, - "learning_rate": 0.0001598082267225018, - "loss": 0.9042, + "grad_norm": 0.349609375, + "learning_rate": 0.00010017280479043147, + "loss": 0.8005, "step": 4440 }, { "epoch": 5.4978354978354975, - "grad_norm": 0.58984375, - "learning_rate": 0.00015969275489405383, - "loss": 0.8975, + "grad_norm": 0.359375, + "learning_rate": 9.995679878223505e-05, + "loss": 0.7964, "step": 4445 }, { "epoch": 5.504019789734075, - "grad_norm": 0.4921875, - "learning_rate": 0.0001595771592797445, - "loss": 0.8788, + "grad_norm": 0.330078125, + "learning_rate": 9.97407929756095e-05, + "loss": 0.7767, "step": 4450 }, { "epoch": 5.510204081632653, - "grad_norm": 0.54296875, - "learning_rate": 0.00015946144011928638, - "loss": 0.8986, + "grad_norm": 0.35546875, + "learning_rate": 9.952478837840803e-05, + "loss": 0.7961, "step": 4455 }, { "epoch": 5.516388373531231, - "grad_norm": 0.53515625, - "learning_rate": 0.0001593455976526482, - "loss": 0.8966, + "grad_norm": 0.357421875, + "learning_rate": 9.930878599847821e-05, + "loss": 0.795, "step": 4460 }, { "epoch": 5.522572665429808, - "grad_norm": 0.462890625, - "learning_rate": 0.00015922963212005442, - "loss": 0.8858, + "grad_norm": 0.34765625, + "learning_rate": 9.909278684365718e-05, + "loss": 0.7861, "step": 4465 }, { "epoch": 5.528756957328386, - "grad_norm": 0.482421875, - "learning_rate": 0.0001591135437619847, - "loss": 0.8931, + "grad_norm": 0.34765625, + "learning_rate": 9.887679192176712e-05, + "loss": 0.7909, "step": 4470 }, { "epoch": 5.534941249226963, - "grad_norm": 0.52734375, - "learning_rate": 0.0001589973328191734, - "loss": 0.8913, + "grad_norm": 0.361328125, + "learning_rate": 9.866080224061038e-05, + "loss": 0.7894, "step": 4475 }, { "epoch": 5.541125541125541, - "grad_norm": 0.45703125, - "learning_rate": 0.00015888099953260905, - "loss": 0.8913, - "step": 4480 + "grad_norm": 0.337890625, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7904, + "step": 4480 }, { "epoch": 5.547309833024118, - "grad_norm": 0.4375, - "learning_rate": 0.000158764544143534, - "loss": 0.8886, + "grad_norm": 0.33203125, + "learning_rate": 9.822884263157957e-05, + "loss": 0.7901, "step": 4485 }, { "epoch": 5.553494124922697, - "grad_norm": 0.42578125, - "learning_rate": 0.0001586479668934437, - "loss": 0.9026, + "grad_norm": 0.33984375, + "learning_rate": 9.801287471916919e-05, + "loss": 0.8019, "step": 4490 }, { "epoch": 5.559678416821274, - "grad_norm": 0.466796875, - "learning_rate": 0.00015853126802408633, - "loss": 0.8857, + "grad_norm": 0.330078125, + "learning_rate": 9.779691607841019e-05, + "loss": 0.7863, "step": 4495 }, { "epoch": 5.565862708719852, - "grad_norm": 0.44921875, - "learning_rate": 0.0001584144477774623, - "loss": 0.8948, + "grad_norm": 0.35546875, + "learning_rate": 9.758096771693573e-05, + "loss": 0.7933, "step": 4500 }, { "epoch": 5.572047000618429, - "grad_norm": 0.5390625, - "learning_rate": 0.0001582975063958237, - "loss": 0.8877, + "grad_norm": 0.337890625, + "learning_rate": 9.736503064233086e-05, + "loss": 0.7874, "step": 4505 }, { "epoch": 5.578231292517007, - "grad_norm": 0.73828125, - "learning_rate": 0.0001581804441216738, - "loss": 0.8958, + "grad_norm": 0.341796875, + "learning_rate": 9.714910586212816e-05, + "loss": 0.7938, "step": 4510 }, { "epoch": 5.584415584415584, - "grad_norm": 0.482421875, - "learning_rate": 0.00015806326119776663, - "loss": 0.8914, + "grad_norm": 0.349609375, + "learning_rate": 9.693319438380266e-05, + "loss": 0.7906, "step": 4515 }, { "epoch": 5.590599876314162, - "grad_norm": 0.4140625, - "learning_rate": 0.00015794595786710632, - "loss": 0.8937, + "grad_norm": 0.357421875, + "learning_rate": 9.671729721476746e-05, + "loss": 0.7907, "step": 4520 }, { "epoch": 5.596784168212739, - "grad_norm": 0.419921875, - "learning_rate": 0.0001578285343729468, - "loss": 0.8957, + "grad_norm": 0.349609375, + "learning_rate": 9.650141536236889e-05, + "loss": 0.7948, "step": 4525 }, { "epoch": 5.602968460111317, - "grad_norm": 0.71875, - "learning_rate": 0.00015771099095879108, - "loss": 0.8916, + "grad_norm": 0.330078125, + "learning_rate": 9.628554983388173e-05, + "loss": 0.7899, "step": 4530 }, { "epoch": 5.609152752009895, - "grad_norm": 0.5078125, - "learning_rate": 0.00015759332786839092, - "loss": 0.891, + "grad_norm": 0.33984375, + "learning_rate": 9.606970163650465e-05, + "loss": 0.7881, "step": 4535 }, { "epoch": 5.615337043908473, - "grad_norm": 0.48828125, - "learning_rate": 0.00015747554534574626, - "loss": 0.8814, + "grad_norm": 0.341796875, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7821, "step": 4540 }, { "epoch": 5.62152133580705, - "grad_norm": 0.451171875, - "learning_rate": 0.0001573576436351046, - "loss": 0.8883, + "grad_norm": 0.3359375, + "learning_rate": 9.563806126346642e-05, + "loss": 0.7883, "step": 4545 }, { "epoch": 5.627705627705628, - "grad_norm": 0.44921875, - "learning_rate": 0.0001572396229809608, - "loss": 0.8992, + "grad_norm": 0.3359375, + "learning_rate": 9.542227110177945e-05, + "loss": 0.7947, "step": 4550 }, { "epoch": 5.633889919604205, - "grad_norm": 0.4609375, - "learning_rate": 0.00015712148362805617, - "loss": 0.883, + "grad_norm": 0.341796875, + "learning_rate": 9.520650229914157e-05, + "loss": 0.7849, "step": 4555 }, { "epoch": 5.640074211502783, - "grad_norm": 0.4296875, - "learning_rate": 0.00015700322582137827, - "loss": 0.8908, + "grad_norm": 0.326171875, + "learning_rate": 9.499075586230013e-05, + "loss": 0.7893, "step": 4560 }, { "epoch": 5.646258503401361, - "grad_norm": 0.4453125, - "learning_rate": 0.00015688484980616032, - "loss": 0.8953, + "grad_norm": 0.326171875, + "learning_rate": 9.477503279789817e-05, + "loss": 0.7945, "step": 4565 }, { "epoch": 5.6524427952999385, - "grad_norm": 0.4609375, - "learning_rate": 0.0001567663558278806, - "loss": 0.8968, + "grad_norm": 0.3359375, + "learning_rate": 9.455933411246958e-05, + "loss": 0.7949, "step": 4570 }, { "epoch": 5.658627087198516, - "grad_norm": 0.447265625, - "learning_rate": 0.0001566477441322621, - "loss": 0.8788, + "grad_norm": 0.36328125, + "learning_rate": 9.43436608124346e-05, + "loss": 0.7806, "step": 4575 }, { "epoch": 5.6648113790970935, - "grad_norm": 0.470703125, - "learning_rate": 0.0001565290149652718, - "loss": 0.8847, + "grad_norm": 0.359375, + "learning_rate": 9.412801390409497e-05, + "loss": 0.7862, "step": 4580 }, { "epoch": 5.670995670995671, - "grad_norm": 0.470703125, - "learning_rate": 0.00015641016857312044, - "loss": 0.8916, + "grad_norm": 0.349609375, + "learning_rate": 9.391239439362928e-05, + "loss": 0.7911, "step": 4585 }, { "epoch": 5.6771799628942485, - "grad_norm": 0.486328125, - "learning_rate": 0.00015629120520226165, - "loss": 0.8942, + "grad_norm": 0.3359375, + "learning_rate": 9.369680328708836e-05, + "loss": 0.7926, "step": 4590 }, { "epoch": 5.683364254792826, - "grad_norm": 0.44140625, - "learning_rate": 0.00015617212509939186, - "loss": 0.893, + "grad_norm": 0.373046875, + "learning_rate": 9.348124159039036e-05, + "loss": 0.7925, "step": 4595 }, { "epoch": 5.6895485466914035, - "grad_norm": 0.44921875, - "learning_rate": 0.00015605292851144942, - "loss": 0.8956, + "grad_norm": 0.357421875, + "learning_rate": 9.326571030931637e-05, + "loss": 0.7949, "step": 4600 }, { "epoch": 5.695732838589981, - "grad_norm": 0.546875, - "learning_rate": 0.00015593361568561428, - "loss": 0.8978, + "grad_norm": 0.330078125, + "learning_rate": 9.305021044950552e-05, + "loss": 0.7953, "step": 4605 }, { "epoch": 5.701917130488559, - "grad_norm": 0.421875, - "learning_rate": 0.00015581418686930743, - "loss": 0.8867, + "grad_norm": 0.330078125, + "learning_rate": 9.283474301645026e-05, + "loss": 0.7858, "step": 4610 }, { "epoch": 5.708101422387137, - "grad_norm": 0.490234375, - "learning_rate": 0.0001556946423101905, - "loss": 0.8914, + "grad_norm": 0.33203125, + "learning_rate": 9.261930901549181e-05, + "loss": 0.7911, "step": 4615 }, { "epoch": 5.714285714285714, - "grad_norm": 0.4609375, - "learning_rate": 0.00015557498225616487, - "loss": 0.892, + "grad_norm": 0.337890625, + "learning_rate": 9.240390945181543e-05, + "loss": 0.7924, "step": 4620 }, { "epoch": 5.720470006184292, - "grad_norm": 0.453125, - "learning_rate": 0.00015545520695537169, - "loss": 0.8906, + "grad_norm": 0.341796875, + "learning_rate": 9.21885453304456e-05, + "loss": 0.7896, "step": 4625 }, { "epoch": 5.726654298082869, - "grad_norm": 0.474609375, - "learning_rate": 0.00015533531665619098, - "loss": 0.8893, + "grad_norm": 0.330078125, + "learning_rate": 9.197321765624152e-05, + "loss": 0.7911, "step": 4630 }, { "epoch": 5.732838589981447, - "grad_norm": 0.46484375, - "learning_rate": 0.00015521531160724126, - "loss": 0.8866, + "grad_norm": 0.353515625, + "learning_rate": 9.175792743389227e-05, + "loss": 0.788, "step": 4635 }, { "epoch": 5.739022881880024, - "grad_norm": 0.462890625, - "learning_rate": 0.00015509519205737896, - "loss": 0.907, + "grad_norm": 0.341796875, + "learning_rate": 9.154267566791223e-05, + "loss": 0.804, "step": 4640 }, { "epoch": 5.745207173778603, - "grad_norm": 0.4921875, - "learning_rate": 0.00015497495825569807, - "loss": 0.8901, + "grad_norm": 0.341796875, + "learning_rate": 9.132746336263632e-05, + "loss": 0.7906, "step": 4645 }, { "epoch": 5.75139146567718, - "grad_norm": 0.486328125, - "learning_rate": 0.0001548546104515294, - "loss": 0.8843, + "grad_norm": 0.359375, + "learning_rate": 9.111229152221535e-05, + "loss": 0.7834, "step": 4650 }, { "epoch": 5.757575757575758, - "grad_norm": 0.51953125, - "learning_rate": 0.00015473414889444014, - "loss": 0.8911, + "grad_norm": 0.328125, + "learning_rate": 9.089716115061135e-05, + "loss": 0.7897, "step": 4655 }, { "epoch": 5.763760049474335, - "grad_norm": 0.625, - "learning_rate": 0.0001546135738342335, - "loss": 0.8918, + "grad_norm": 0.33203125, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7916, "step": 4660 }, { "epoch": 5.769944341372913, - "grad_norm": 0.55859375, - "learning_rate": 0.00015449288552094796, - "loss": 0.8893, + "grad_norm": 0.33203125, + "learning_rate": 9.046702882873016e-05, + "loss": 0.787, "step": 4665 }, { "epoch": 5.77612863327149, - "grad_norm": 0.51171875, - "learning_rate": 0.0001543720842048569, - "loss": 0.8921, + "grad_norm": 0.337890625, + "learning_rate": 9.02520288853908e-05, + "loss": 0.7902, "step": 4670 }, { "epoch": 5.782312925170068, - "grad_norm": 0.482421875, - "learning_rate": 0.000154251170136468, - "loss": 0.8927, + "grad_norm": 0.337890625, + "learning_rate": 9.003707442473479e-05, + "loss": 0.793, "step": 4675 }, { "epoch": 5.788497217068645, - "grad_norm": 0.5, - "learning_rate": 0.00015413014356652286, - "loss": 0.8886, + "grad_norm": 0.357421875, + "learning_rate": 8.982216644970979e-05, + "loss": 0.7871, "step": 4680 }, { "epoch": 5.794681508967223, - "grad_norm": 0.482421875, - "learning_rate": 0.0001540090047459962, - "loss": 0.8811, + "grad_norm": 0.37890625, + "learning_rate": 8.960730596304664e-05, + "loss": 0.7816, "step": 4685 }, { "epoch": 5.800865800865801, - "grad_norm": 0.58984375, - "learning_rate": 0.00015388775392609564, - "loss": 0.8938, + "grad_norm": 0.376953125, + "learning_rate": 8.939249396725467e-05, + "loss": 0.7936, "step": 4690 }, { "epoch": 5.807050092764379, - "grad_norm": 0.427734375, - "learning_rate": 0.00015376639135826107, - "loss": 0.8838, + "grad_norm": 0.345703125, + "learning_rate": 8.917773146461692e-05, + "loss": 0.783, "step": 4695 }, { "epoch": 5.813234384662956, - "grad_norm": 0.419921875, - "learning_rate": 0.000153644917294164, - "loss": 0.8913, + "grad_norm": 0.361328125, + "learning_rate": 8.896301945718541e-05, + "loss": 0.7901, "step": 4700 }, { "epoch": 5.819418676561534, - "grad_norm": 0.458984375, - "learning_rate": 0.0001535233319857073, - "loss": 0.8946, + "grad_norm": 0.330078125, + "learning_rate": 8.87483589467767e-05, + "loss": 0.7946, "step": 4705 }, { "epoch": 5.825602968460111, - "grad_norm": 0.453125, - "learning_rate": 0.0001534016356850244, - "loss": 0.8948, + "grad_norm": 0.333984375, + "learning_rate": 8.853375093496699e-05, + "loss": 0.7934, "step": 4710 }, { "epoch": 5.831787260358689, - "grad_norm": 0.4609375, - "learning_rate": 0.00015327982864447894, - "loss": 0.8854, + "grad_norm": 0.369140625, + "learning_rate": 8.831919642308756e-05, + "loss": 0.7853, "step": 4715 }, { "epoch": 5.837971552257266, - "grad_norm": 0.47265625, - "learning_rate": 0.00015315791111666425, - "loss": 0.8961, + "grad_norm": 0.388671875, + "learning_rate": 8.810469641222001e-05, + "loss": 0.795, "step": 4720 }, { "epoch": 5.8441558441558445, - "grad_norm": 0.435546875, - "learning_rate": 0.00015303588335440274, - "loss": 0.8981, + "grad_norm": 0.35546875, + "learning_rate": 8.789025190319169e-05, + "loss": 0.7956, "step": 4725 }, { "epoch": 5.850340136054422, - "grad_norm": 0.46875, - "learning_rate": 0.00015291374561074536, - "loss": 0.8957, + "grad_norm": 0.36328125, + "learning_rate": 8.767586389657098e-05, + "loss": 0.795, "step": 4730 }, { "epoch": 5.8565244279529995, - "grad_norm": 0.4609375, - "learning_rate": 0.00015279149813897126, - "loss": 0.8876, + "grad_norm": 0.365234375, + "learning_rate": 8.746153339266269e-05, + "loss": 0.7876, "step": 4735 }, { "epoch": 5.862708719851577, - "grad_norm": 0.43359375, - "learning_rate": 0.000152669141192587, - "loss": 0.8829, + "grad_norm": 0.34765625, + "learning_rate": 8.724726139150318e-05, + "loss": 0.7828, "step": 4740 }, { "epoch": 5.8688930117501545, - "grad_norm": 0.462890625, - "learning_rate": 0.00015254667502532626, - "loss": 0.8997, + "grad_norm": 0.369140625, + "learning_rate": 8.7033048892856e-05, + "loss": 0.7989, "step": 4745 }, { "epoch": 5.875077303648732, - "grad_norm": 0.431640625, - "learning_rate": 0.00015242409989114916, - "loss": 0.8926, + "grad_norm": 0.369140625, + "learning_rate": 8.6818896896207e-05, + "loss": 0.7921, "step": 4750 }, { "epoch": 5.8812615955473095, - "grad_norm": 0.42578125, - "learning_rate": 0.00015230141604424181, - "loss": 0.8902, + "grad_norm": 0.3515625, + "learning_rate": 8.66048064007597e-05, + "loss": 0.7923, "step": 4755 }, { "epoch": 5.887445887445887, - "grad_norm": 0.453125, - "learning_rate": 0.00015217862373901575, - "loss": 0.8962, + "grad_norm": 0.357421875, + "learning_rate": 8.639077840543077e-05, + "loss": 0.796, "step": 4760 }, { "epoch": 5.893630179344465, - "grad_norm": 0.546875, - "learning_rate": 0.0001520557232301074, - "loss": 0.8931, + "grad_norm": 0.3359375, + "learning_rate": 8.617681390884512e-05, + "loss": 0.7927, "step": 4765 }, { "epoch": 5.899814471243043, - "grad_norm": 0.50390625, - "learning_rate": 0.0001519327147723776, - "loss": 0.8883, + "grad_norm": 0.330078125, + "learning_rate": 8.596291390933147e-05, + "loss": 0.7888, "step": 4770 }, { "epoch": 5.90599876314162, - "grad_norm": 0.447265625, - "learning_rate": 0.0001518095986209111, - "loss": 0.8953, + "grad_norm": 0.330078125, + "learning_rate": 8.574907940491767e-05, + "loss": 0.7926, "step": 4775 }, { "epoch": 5.912183055040198, - "grad_norm": 0.484375, - "learning_rate": 0.00015168637503101584, - "loss": 0.9026, + "grad_norm": 0.341796875, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7997, "step": 4780 }, { "epoch": 5.918367346938775, - "grad_norm": 0.484375, - "learning_rate": 0.00015156304425822267, - "loss": 0.8927, + "grad_norm": 0.337890625, + "learning_rate": 8.532161087196791e-05, + "loss": 0.7912, "step": 4785 }, { "epoch": 5.924551638837353, - "grad_norm": 0.439453125, - "learning_rate": 0.00015143960655828468, - "loss": 0.8968, + "grad_norm": 0.341796875, + "learning_rate": 8.510797883794097e-05, + "loss": 0.794, "step": 4790 }, { "epoch": 5.93073593073593, - "grad_norm": 0.44921875, - "learning_rate": 0.00015131606218717666, - "loss": 0.9022, + "grad_norm": 0.341796875, + "learning_rate": 8.489441628802246e-05, + "loss": 0.7995, "step": 4795 }, { "epoch": 5.936920222634509, - "grad_norm": 0.4609375, - "learning_rate": 0.00015119241140109467, - "loss": 0.8945, + "grad_norm": 0.33984375, + "learning_rate": 8.468092421866573e-05, + "loss": 0.7934, "step": 4800 }, { "epoch": 5.943104514533086, - "grad_norm": 0.427734375, - "learning_rate": 0.00015106865445645536, - "loss": 0.8783, + "grad_norm": 0.326171875, + "learning_rate": 8.446750362599513e-05, + "loss": 0.7781, "step": 4805 }, { "epoch": 5.949288806431664, - "grad_norm": 0.44921875, - "learning_rate": 0.0001509447916098956, - "loss": 0.8833, + "grad_norm": 0.328125, + "learning_rate": 8.425415550580162e-05, + "loss": 0.784, "step": 4810 }, { "epoch": 5.955473098330241, - "grad_norm": 0.486328125, - "learning_rate": 0.00015082082311827183, - "loss": 0.8861, + "grad_norm": 0.3359375, + "learning_rate": 8.4040880853538e-05, + "loss": 0.7875, "step": 4815 }, { "epoch": 5.961657390228819, - "grad_norm": 0.458984375, - "learning_rate": 0.0001506967492386596, - "loss": 0.8853, + "grad_norm": 0.33984375, + "learning_rate": 8.382768066431425e-05, + "loss": 0.7874, "step": 4820 }, { "epoch": 5.967841682127396, - "grad_norm": 0.453125, - "learning_rate": 0.00015057257022835295, - "loss": 0.8897, + "grad_norm": 0.349609375, + "learning_rate": 8.361455593289292e-05, + "loss": 0.7898, "step": 4825 }, { "epoch": 5.974025974025974, - "grad_norm": 0.42578125, - "learning_rate": 0.000150448286344864, - "loss": 0.8951, + "grad_norm": 0.345703125, + "learning_rate": 8.340150765368452e-05, + "loss": 0.7973, "step": 4830 }, { "epoch": 5.980210265924551, - "grad_norm": 0.4296875, - "learning_rate": 0.00015032389784592226, - "loss": 0.8823, + "grad_norm": 0.349609375, + "learning_rate": 8.318853682074278e-05, + "loss": 0.7854, "step": 4835 }, { "epoch": 5.986394557823129, - "grad_norm": 0.43359375, - "learning_rate": 0.00015019940498947428, - "loss": 0.8795, + "grad_norm": 0.345703125, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7794, "step": 4840 }, { "epoch": 5.992578849721707, - "grad_norm": 0.48828125, - "learning_rate": 0.000150074808033683, - "loss": 0.9034, + "grad_norm": 0.333984375, + "learning_rate": 8.276283146806304e-05, + "loss": 0.8008, "step": 4845 }, { "epoch": 5.998763141620285, - "grad_norm": 0.4453125, - "learning_rate": 0.00014995010723692714, - "loss": 0.8939, + "grad_norm": 0.333984375, + "learning_rate": 8.255009893460724e-05, + "loss": 0.7949, "step": 4850 }, { "epoch": 6.0, - "eval_loss": 2.4935665130615234, - "eval_runtime": 0.5407, - "eval_samples_per_second": 18.496, - "eval_steps_per_second": 1.85, + "eval_loss": 2.6445889472961426, + "eval_runtime": 0.538, + "eval_samples_per_second": 18.587, + "eval_steps_per_second": 1.859, "step": 4851 }, { "epoch": 6.004947433518862, - "grad_norm": 0.48046875, - "learning_rate": 0.00014982530285780082, - "loss": 0.8858, + "grad_norm": 0.337890625, + "learning_rate": 8.233744781997329e-05, + "loss": 0.7779, "step": 4855 }, { "epoch": 6.01113172541744, - "grad_norm": 0.47265625, - "learning_rate": 0.00014970039515511304, - "loss": 0.8882, + "grad_norm": 0.33984375, + "learning_rate": 8.212487911636184e-05, + "loss": 0.7803, "step": 4860 }, { "epoch": 6.017316017316017, - "grad_norm": 0.435546875, - "learning_rate": 0.0001495753843878869, - "loss": 0.8914, + "grad_norm": 0.34375, + "learning_rate": 8.191239381558904e-05, + "loss": 0.779, "step": 4865 }, { "epoch": 6.023500309214595, - "grad_norm": 0.443359375, - "learning_rate": 0.00014945027081535937, - "loss": 0.8648, + "grad_norm": 0.3515625, + "learning_rate": 8.169999290908188e-05, + "loss": 0.7566, "step": 4870 }, { "epoch": 6.029684601113172, - "grad_norm": 0.5234375, - "learning_rate": 0.00014932505469698052, - "loss": 0.87, + "grad_norm": 0.3359375, + "learning_rate": 8.148767738787355e-05, + "loss": 0.7617, "step": 4875 }, { "epoch": 6.035868893011751, - "grad_norm": 0.43359375, - "learning_rate": 0.00014919973629241314, - "loss": 0.8876, + "grad_norm": 0.341796875, + "learning_rate": 8.127544824259889e-05, + "loss": 0.7754, "step": 4880 }, { "epoch": 6.042053184910328, - "grad_norm": 0.48828125, - "learning_rate": 0.00014907431586153201, - "loss": 0.8852, + "grad_norm": 0.32421875, + "learning_rate": 8.106330646348972e-05, + "loss": 0.7757, "step": 4885 }, { "epoch": 6.0482374768089056, - "grad_norm": 0.447265625, - "learning_rate": 0.0001489487936644237, - "loss": 0.8752, + "grad_norm": 0.341796875, + "learning_rate": 8.085125304037018e-05, + "loss": 0.7626, "step": 4890 }, { "epoch": 6.054421768707483, - "grad_norm": 0.458984375, - "learning_rate": 0.00014882316996138556, - "loss": 0.8855, + "grad_norm": 0.361328125, + "learning_rate": 8.063928896265217e-05, + "loss": 0.7734, "step": 4895 }, { "epoch": 6.0606060606060606, - "grad_norm": 0.54296875, - "learning_rate": 0.00014869744501292561, - "loss": 0.8878, + "grad_norm": 0.376953125, + "learning_rate": 8.042741521933071e-05, + "loss": 0.7789, "step": 4900 }, { "epoch": 6.066790352504638, - "grad_norm": 0.56640625, - "learning_rate": 0.00014857161907976183, - "loss": 0.8818, + "grad_norm": 0.361328125, + "learning_rate": 8.021563279897938e-05, + "loss": 0.7702, "step": 4905 }, { "epoch": 6.0729746444032155, - "grad_norm": 0.435546875, - "learning_rate": 0.00014844569242282148, - "loss": 0.8859, + "grad_norm": 0.34765625, + "learning_rate": 8.000394268974563e-05, + "loss": 0.7761, "step": 4910 }, { "epoch": 6.079158936301793, - "grad_norm": 0.5625, - "learning_rate": 0.00014831966530324078, - "loss": 0.876, + "grad_norm": 0.3671875, + "learning_rate": 7.979234587934616e-05, + "loss": 0.7661, "step": 4915 }, { "epoch": 6.085343228200371, - "grad_norm": 0.49609375, - "learning_rate": 0.00014819353798236427, - "loss": 0.8756, + "grad_norm": 0.361328125, + "learning_rate": 7.958084335506239e-05, + "loss": 0.7652, "step": 4920 }, { "epoch": 6.091527520098949, - "grad_norm": 0.50390625, - "learning_rate": 0.00014806731072174428, - "loss": 0.8867, + "grad_norm": 0.3359375, + "learning_rate": 7.936943610373584e-05, + "loss": 0.7751, "step": 4925 }, { "epoch": 6.097711811997526, - "grad_norm": 0.453125, - "learning_rate": 0.0001479409837831404, - "loss": 0.8736, + "grad_norm": 0.337890625, + "learning_rate": 7.915812511176347e-05, + "loss": 0.7661, "step": 4930 }, { "epoch": 6.103896103896104, - "grad_norm": 0.5859375, - "learning_rate": 0.00014781455742851892, - "loss": 0.878, + "grad_norm": 0.33984375, + "learning_rate": 7.894691136509305e-05, + "loss": 0.7664, "step": 4935 }, { "epoch": 6.110080395794681, - "grad_norm": 0.42578125, - "learning_rate": 0.00014768803192005223, - "loss": 0.8853, + "grad_norm": 0.37109375, + "learning_rate": 7.873579584921869e-05, + "loss": 0.7732, "step": 4940 }, { "epoch": 6.116264687693259, - "grad_norm": 0.6015625, - "learning_rate": 0.00014756140752011842, - "loss": 0.8778, + "grad_norm": 0.3515625, + "learning_rate": 7.852477954917618e-05, + "loss": 0.7684, "step": 4945 }, { "epoch": 6.122448979591836, - "grad_norm": 0.5625, - "learning_rate": 0.00014743468449130063, - "loss": 0.891, + "grad_norm": 0.34765625, + "learning_rate": 7.831386344953836e-05, + "loss": 0.7782, "step": 4950 }, { "epoch": 6.128633271490414, - "grad_norm": 0.48828125, - "learning_rate": 0.00014730786309638652, - "loss": 0.8794, + "grad_norm": 0.34765625, + "learning_rate": 7.810304853441051e-05, + "loss": 0.7699, "step": 4955 }, { "epoch": 6.134817563388992, - "grad_norm": 0.462890625, - "learning_rate": 0.00014718094359836772, - "loss": 0.8913, + "grad_norm": 0.34765625, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7796, "step": 4960 }, { "epoch": 6.14100185528757, - "grad_norm": 0.4375, - "learning_rate": 0.0001470539262604393, - "loss": 0.8825, + "grad_norm": 0.357421875, + "learning_rate": 7.768172619174086e-05, + "loss": 0.771, "step": 4965 }, { "epoch": 6.147186147186147, - "grad_norm": 0.5703125, - "learning_rate": 0.00014692681134599925, - "loss": 0.8865, + "grad_norm": 0.349609375, + "learning_rate": 7.747122073003075e-05, + "loss": 0.7766, "step": 4970 }, { "epoch": 6.153370439084725, - "grad_norm": 0.48046875, - "learning_rate": 0.00014679959911864784, - "loss": 0.8829, + "grad_norm": 0.345703125, + "learning_rate": 7.72608203844849e-05, + "loss": 0.7733, "step": 4975 }, { "epoch": 6.159554730983302, - "grad_norm": 0.4375, - "learning_rate": 0.0001466722898421873, - "loss": 0.8781, + "grad_norm": 0.341796875, + "learning_rate": 7.705052613680211e-05, + "loss": 0.7719, "step": 4980 }, { "epoch": 6.16573902288188, - "grad_norm": 0.419921875, - "learning_rate": 0.00014654488378062087, - "loss": 0.8809, + "grad_norm": 0.35546875, + "learning_rate": 7.684033896818627e-05, + "loss": 0.7711, "step": 4985 }, { "epoch": 6.171923314780457, - "grad_norm": 0.43359375, - "learning_rate": 0.00014641738119815266, - "loss": 0.885, + "grad_norm": 0.345703125, + "learning_rate": 7.663025985934158e-05, + "loss": 0.775, "step": 4990 }, { "epoch": 6.178107606679035, - "grad_norm": 0.53125, - "learning_rate": 0.00014628978235918695, - "loss": 0.8823, + "grad_norm": 0.330078125, + "learning_rate": 7.642028979046807e-05, + "loss": 0.7742, "step": 4995 }, { "epoch": 6.184291898577613, - "grad_norm": 0.4609375, - "learning_rate": 0.00014616208752832758, - "loss": 0.8882, + "grad_norm": 0.361328125, + "learning_rate": 7.6210429741257e-05, + "loss": 0.7785, "step": 5000 }, { "epoch": 6.190476190476191, - "grad_norm": 0.42578125, - "learning_rate": 0.0001460342969703774, - "loss": 0.885, + "grad_norm": 0.33203125, + "learning_rate": 7.600068069088634e-05, + "loss": 0.7774, "step": 5005 }, { "epoch": 6.196660482374768, - "grad_norm": 0.490234375, - "learning_rate": 0.00014590641095033787, - "loss": 0.8901, + "grad_norm": 0.330078125, + "learning_rate": 7.579104361801605e-05, + "loss": 0.7792, "step": 5010 }, { "epoch": 6.202844774273346, - "grad_norm": 0.44140625, - "learning_rate": 0.0001457784297334083, - "loss": 0.8722, + "grad_norm": 0.3515625, + "learning_rate": 7.558151950078376e-05, + "loss": 0.7627, "step": 5015 }, { "epoch": 6.209029066171923, - "grad_norm": 0.419921875, - "learning_rate": 0.0001456503535849855, - "loss": 0.8808, + "grad_norm": 0.353515625, + "learning_rate": 7.537210931679987e-05, + "loss": 0.7706, "step": 5020 }, { "epoch": 6.215213358070501, - "grad_norm": 0.474609375, - "learning_rate": 0.00014552218277066314, - "loss": 0.8734, + "grad_norm": 0.3359375, + "learning_rate": 7.516281404314341e-05, + "loss": 0.7667, "step": 5025 }, { "epoch": 6.221397649969078, - "grad_norm": 0.4296875, - "learning_rate": 0.0001453939175562312, - "loss": 0.8814, + "grad_norm": 0.3359375, + "learning_rate": 7.495363465635708e-05, + "loss": 0.7715, "step": 5030 }, { "epoch": 6.227581941867657, - "grad_norm": 0.494140625, - "learning_rate": 0.00014526555820767534, - "loss": 0.8775, + "grad_norm": 0.3359375, + "learning_rate": 7.474457213244293e-05, + "loss": 0.7713, "step": 5035 }, { "epoch": 6.233766233766234, - "grad_norm": 0.515625, - "learning_rate": 0.00014513710499117647, - "loss": 0.8756, + "grad_norm": 0.361328125, + "learning_rate": 7.453562744685778e-05, + "loss": 0.7678, "step": 5040 }, { "epoch": 6.239950525664812, - "grad_norm": 0.484375, - "learning_rate": 0.0001450085581731102, - "loss": 0.8814, + "grad_norm": 0.353515625, + "learning_rate": 7.432680157450857e-05, + "loss": 0.771, "step": 5045 }, { "epoch": 6.246134817563389, - "grad_norm": 0.4375, - "learning_rate": 0.00014487991802004623, - "loss": 0.8778, + "grad_norm": 0.353515625, + "learning_rate": 7.411809548974792e-05, + "loss": 0.7679, "step": 5050 }, { "epoch": 6.252319109461967, - "grad_norm": 0.56640625, - "learning_rate": 0.00014475118479874774, - "loss": 0.8744, + "grad_norm": 0.359375, + "learning_rate": 7.390951016636952e-05, + "loss": 0.7661, "step": 5055 }, { "epoch": 6.258503401360544, - "grad_norm": 0.4140625, - "learning_rate": 0.00014462235877617098, - "loss": 0.876, + "grad_norm": 0.359375, + "learning_rate": 7.370104657760361e-05, + "loss": 0.7689, "step": 5060 }, { "epoch": 6.264687693259122, - "grad_norm": 0.447265625, - "learning_rate": 0.00014449344021946468, - "loss": 0.888, + "grad_norm": 0.34375, + "learning_rate": 7.34927056961124e-05, + "loss": 0.7744, "step": 5065 }, { "epoch": 6.270871985157699, - "grad_norm": 0.4609375, - "learning_rate": 0.0001443644293959693, - "loss": 0.8792, + "grad_norm": 0.3359375, + "learning_rate": 7.328448849398558e-05, + "loss": 0.7728, "step": 5070 }, { "epoch": 6.2770562770562774, - "grad_norm": 0.443359375, - "learning_rate": 0.0001442353265732168, - "loss": 0.8672, + "grad_norm": 0.33203125, + "learning_rate": 7.307639594273581e-05, + "loss": 0.7649, "step": 5075 }, { "epoch": 6.283240568954855, - "grad_norm": 0.466796875, - "learning_rate": 0.00014410613201892985, - "loss": 0.8773, + "grad_norm": 0.34375, + "learning_rate": 7.286842901329412e-05, + "loss": 0.7695, "step": 5080 }, { "epoch": 6.289424860853432, - "grad_norm": 0.46875, - "learning_rate": 0.0001439768460010213, - "loss": 0.8835, + "grad_norm": 0.341796875, + "learning_rate": 7.266058867600537e-05, + "loss": 0.7775, "step": 5085 }, { "epoch": 6.29560915275201, - "grad_norm": 0.44921875, - "learning_rate": 0.0001438474687875938, - "loss": 0.8845, + "grad_norm": 0.330078125, + "learning_rate": 7.245287590062384e-05, + "loss": 0.775, "step": 5090 }, { "epoch": 6.301793444650587, - "grad_norm": 0.4609375, - "learning_rate": 0.000143718000646939, - "loss": 0.8753, + "grad_norm": 0.345703125, + "learning_rate": 7.224529165630856e-05, + "loss": 0.7714, "step": 5095 }, { "epoch": 6.307977736549165, - "grad_norm": 0.4609375, - "learning_rate": 0.00014358844184753712, - "loss": 0.8719, + "grad_norm": 0.365234375, + "learning_rate": 7.203783691161883e-05, + "loss": 0.7614, "step": 5100 }, { "epoch": 6.314162028447742, - "grad_norm": 0.447265625, - "learning_rate": 0.00014345879265805644, - "loss": 0.8807, + "grad_norm": 0.361328125, + "learning_rate": 7.183051263450983e-05, + "loss": 0.7715, "step": 5105 }, { "epoch": 6.32034632034632, - "grad_norm": 0.443359375, - "learning_rate": 0.00014332905334735261, - "loss": 0.8832, + "grad_norm": 0.345703125, + "learning_rate": 7.162331979232783e-05, + "loss": 0.7729, "step": 5110 }, { "epoch": 6.326530612244898, - "grad_norm": 0.423828125, - "learning_rate": 0.00014319922418446824, - "loss": 0.8805, + "grad_norm": 0.34375, + "learning_rate": 7.1416259351806e-05, + "loss": 0.7706, "step": 5115 }, { "epoch": 6.332714904143476, - "grad_norm": 0.474609375, - "learning_rate": 0.00014306930543863219, - "loss": 0.8839, + "grad_norm": 0.34375, + "learning_rate": 7.12093322790597e-05, + "loss": 0.7749, "step": 5120 }, { "epoch": 6.338899196042053, - "grad_norm": 0.427734375, - "learning_rate": 0.0001429392973792592, - "loss": 0.8863, + "grad_norm": 0.35546875, + "learning_rate": 7.100253953958195e-05, + "loss": 0.7782, "step": 5125 }, { "epoch": 6.345083487940631, - "grad_norm": 0.439453125, - "learning_rate": 0.00014280920027594907, - "loss": 0.8783, + "grad_norm": 0.36328125, + "learning_rate": 7.079588209823906e-05, + "loss": 0.7707, "step": 5130 }, { "epoch": 6.351267779839208, - "grad_norm": 0.443359375, - "learning_rate": 0.0001426790143984864, - "loss": 0.8823, + "grad_norm": 0.357421875, + "learning_rate": 7.058936091926603e-05, + "loss": 0.7723, "step": 5135 }, { "epoch": 6.357452071737786, - "grad_norm": 0.423828125, - "learning_rate": 0.00014254874001683976, - "loss": 0.8857, + "grad_norm": 0.349609375, + "learning_rate": 7.038297696626206e-05, + "loss": 0.7736, "step": 5140 }, { "epoch": 6.363636363636363, - "grad_norm": 0.419921875, - "learning_rate": 0.00014241837740116132, - "loss": 0.8865, + "grad_norm": 0.33984375, + "learning_rate": 7.017673120218615e-05, + "loss": 0.7765, "step": 5145 }, { "epoch": 6.369820655534941, - "grad_norm": 0.5078125, - "learning_rate": 0.00014228792682178623, - "loss": 0.8779, + "grad_norm": 0.34375, + "learning_rate": 6.99706245893524e-05, + "loss": 0.7703, "step": 5150 }, { "epoch": 6.376004947433519, - "grad_norm": 0.5, - "learning_rate": 0.00014215738854923203, - "loss": 0.8813, + "grad_norm": 0.341796875, + "learning_rate": 6.976465808942576e-05, + "loss": 0.7717, "step": 5155 }, { "epoch": 6.382189239332097, - "grad_norm": 0.45703125, - "learning_rate": 0.00014202676285419812, - "loss": 0.8824, + "grad_norm": 0.341796875, + "learning_rate": 6.955883266341741e-05, + "loss": 0.773, "step": 5160 }, { "epoch": 6.388373531230674, - "grad_norm": 0.439453125, - "learning_rate": 0.00014189605000756514, - "loss": 0.8897, + "grad_norm": 0.345703125, + "learning_rate": 6.935314927168026e-05, + "loss": 0.781, "step": 5165 }, { "epoch": 6.394557823129252, - "grad_norm": 0.423828125, - "learning_rate": 0.00014176525028039452, - "loss": 0.8711, + "grad_norm": 0.3515625, + "learning_rate": 6.914760887390452e-05, + "loss": 0.766, "step": 5170 }, { "epoch": 6.400742115027829, - "grad_norm": 0.455078125, - "learning_rate": 0.00014163436394392786, - "loss": 0.8929, + "grad_norm": 0.392578125, + "learning_rate": 6.894221242911329e-05, + "loss": 0.7808, "step": 5175 }, { "epoch": 6.406926406926407, - "grad_norm": 0.4921875, - "learning_rate": 0.00014150339126958633, - "loss": 0.8739, + "grad_norm": 0.33203125, + "learning_rate": 6.873696089565786e-05, + "loss": 0.769, "step": 5180 }, { "epoch": 6.413110698824984, - "grad_norm": 0.462890625, - "learning_rate": 0.0001413723325289701, - "loss": 0.876, + "grad_norm": 0.333984375, + "learning_rate": 6.853185523121348e-05, + "loss": 0.7708, "step": 5185 }, { "epoch": 6.419294990723563, - "grad_norm": 0.482421875, - "learning_rate": 0.00014124118799385796, - "loss": 0.8756, + "grad_norm": 0.33984375, + "learning_rate": 6.832689639277484e-05, + "loss": 0.7689, "step": 5190 }, { "epoch": 6.42547928262214, - "grad_norm": 0.431640625, - "learning_rate": 0.00014110995793620648, - "loss": 0.8882, + "grad_norm": 0.357421875, + "learning_rate": 6.812208533665141e-05, + "loss": 0.7797, "step": 5195 }, { "epoch": 6.431663574520718, - "grad_norm": 0.484375, - "learning_rate": 0.00014097864262814955, - "loss": 0.8771, + "grad_norm": 0.337890625, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7686, "step": 5200 }, { "epoch": 6.437847866419295, - "grad_norm": 0.484375, - "learning_rate": 0.000140847242341998, - "loss": 0.8794, + "grad_norm": 0.3359375, + "learning_rate": 6.77129103931364e-05, + "loss": 0.7735, "step": 5205 }, { "epoch": 6.444032158317873, - "grad_norm": 0.50390625, - "learning_rate": 0.00014071575735023875, - "loss": 0.8804, + "grad_norm": 0.369140625, + "learning_rate": 6.750854841489842e-05, + "loss": 0.7708, "step": 5210 }, { "epoch": 6.45021645021645, - "grad_norm": 0.5, - "learning_rate": 0.00014058418792553445, - "loss": 0.874, + "grad_norm": 0.3359375, + "learning_rate": 6.730433803727407e-05, + "loss": 0.765, "step": 5215 }, { "epoch": 6.456400742115028, - "grad_norm": 0.6171875, - "learning_rate": 0.0001404525343407228, - "loss": 0.8731, + "grad_norm": 0.361328125, + "learning_rate": 6.710028021308061e-05, + "loss": 0.7662, "step": 5220 }, { "epoch": 6.462585034013605, - "grad_norm": 0.462890625, - "learning_rate": 0.00014032079686881603, - "loss": 0.8762, + "grad_norm": 0.34765625, + "learning_rate": 6.689637589442361e-05, + "loss": 0.7692, "step": 5225 }, { "epoch": 6.4687693259121835, - "grad_norm": 0.48046875, - "learning_rate": 0.00014018897578300035, - "loss": 0.8906, + "grad_norm": 0.35546875, + "learning_rate": 6.669262603269246e-05, + "loss": 0.7819, "step": 5230 }, { "epoch": 6.474953617810761, - "grad_norm": 0.46875, - "learning_rate": 0.00014005707135663527, - "loss": 0.8737, + "grad_norm": 0.34375, + "learning_rate": 6.64890315785557e-05, + "loss": 0.7696, "step": 5235 }, { "epoch": 6.4811379097093385, - "grad_norm": 0.408203125, - "learning_rate": 0.0001399250838632533, - "loss": 0.8806, + "grad_norm": 0.3359375, + "learning_rate": 6.62855934819569e-05, + "loss": 0.7709, "step": 5240 }, { "epoch": 6.487322201607916, - "grad_norm": 0.4140625, - "learning_rate": 0.00013979301357655905, - "loss": 0.8737, + "grad_norm": 0.330078125, + "learning_rate": 6.608231269211002e-05, + "loss": 0.7673, "step": 5245 }, { "epoch": 6.4935064935064934, - "grad_norm": 0.458984375, - "learning_rate": 0.0001396608607704289, - "loss": 0.878, + "grad_norm": 0.34375, + "learning_rate": 6.587919015749511e-05, + "loss": 0.7745, "step": 5250 }, { "epoch": 6.499690785405071, - "grad_norm": 0.46484375, - "learning_rate": 0.00013952862571891034, - "loss": 0.8754, + "grad_norm": 0.333984375, + "learning_rate": 6.56762268258538e-05, + "loss": 0.7677, "step": 5255 }, { "epoch": 6.5058750773036484, - "grad_norm": 0.46875, - "learning_rate": 0.00013939630869622133, - "loss": 0.8818, + "grad_norm": 0.353515625, + "learning_rate": 6.547342364418481e-05, + "loss": 0.7729, "step": 5260 }, { "epoch": 6.512059369202227, - "grad_norm": 0.470703125, - "learning_rate": 0.00013926390997674997, - "loss": 0.8834, + "grad_norm": 0.337890625, + "learning_rate": 6.527078155873978e-05, + "loss": 0.7756, "step": 5265 }, { "epoch": 6.518243661100804, - "grad_norm": 0.5703125, - "learning_rate": 0.00013913142983505364, - "loss": 0.8787, + "grad_norm": 0.3359375, + "learning_rate": 6.506830151501861e-05, + "loss": 0.7707, "step": 5270 }, { "epoch": 6.524427952999382, - "grad_norm": 0.4375, - "learning_rate": 0.00013899886854585862, - "loss": 0.8813, + "grad_norm": 0.341796875, + "learning_rate": 6.486598445776513e-05, + "loss": 0.7715, "step": 5275 }, { "epoch": 6.530612244897959, - "grad_norm": 0.45703125, - "learning_rate": 0.00013886622638405952, - "loss": 0.8835, + "grad_norm": 0.33203125, + "learning_rate": 6.466383133096267e-05, + "loss": 0.7745, "step": 5280 }, { "epoch": 6.536796536796537, - "grad_norm": 0.416015625, - "learning_rate": 0.00013873350362471855, - "loss": 0.8811, + "grad_norm": 0.33984375, + "learning_rate": 6.446184307782978e-05, + "loss": 0.7744, "step": 5285 }, { "epoch": 6.542980828695114, - "grad_norm": 0.4765625, - "learning_rate": 0.00013860070054306516, - "loss": 0.891, + "grad_norm": 0.3515625, + "learning_rate": 6.426002064081565e-05, + "loss": 0.779, "step": 5290 }, { "epoch": 6.549165120593692, - "grad_norm": 0.5546875, - "learning_rate": 0.00013846781741449525, - "loss": 0.8807, + "grad_norm": 0.345703125, + "learning_rate": 6.405836496159585e-05, + "loss": 0.7714, "step": 5295 }, { "epoch": 6.555349412492269, - "grad_norm": 0.45703125, - "learning_rate": 0.0001383348545145708, - "loss": 0.883, + "grad_norm": 0.349609375, + "learning_rate": 6.385687698106781e-05, + "loss": 0.775, "step": 5300 }, { "epoch": 6.561533704390847, - "grad_norm": 0.431640625, - "learning_rate": 0.0001382018121190192, - "loss": 0.8807, + "grad_norm": 0.34375, + "learning_rate": 6.365555763934656e-05, + "loss": 0.7724, "step": 5305 }, { "epoch": 6.567717996289425, - "grad_norm": 0.66796875, - "learning_rate": 0.0001380686905037327, - "loss": 0.8844, + "grad_norm": 0.34765625, + "learning_rate": 6.345440787576031e-05, + "loss": 0.777, "step": 5310 }, { "epoch": 6.573902288188003, - "grad_norm": 0.431640625, - "learning_rate": 0.0001379354899447678, - "loss": 0.893, + "grad_norm": 0.33203125, + "learning_rate": 6.3253428628846e-05, + "loss": 0.7853, "step": 5315 }, { "epoch": 6.58008658008658, - "grad_norm": 0.48828125, - "learning_rate": 0.00013780221071834476, - "loss": 0.8825, + "grad_norm": 0.349609375, + "learning_rate": 6.305262083634488e-05, + "loss": 0.7741, "step": 5320 }, { "epoch": 6.586270871985158, - "grad_norm": 0.466796875, - "learning_rate": 0.00013766885310084688, - "loss": 0.8899, + "grad_norm": 0.345703125, + "learning_rate": 6.285198543519835e-05, + "loss": 0.7777, "step": 5325 }, { "epoch": 6.592455163883735, - "grad_norm": 0.43359375, - "learning_rate": 0.0001375354173688201, - "loss": 0.8774, + "grad_norm": 0.3359375, + "learning_rate": 6.265152336154345e-05, + "loss": 0.7692, "step": 5330 }, { "epoch": 6.598639455782313, - "grad_norm": 0.431640625, - "learning_rate": 0.00013740190379897226, - "loss": 0.8802, + "grad_norm": 0.33203125, + "learning_rate": 6.245123555070838e-05, + "loss": 0.7723, "step": 5335 }, { "epoch": 6.60482374768089, - "grad_norm": 0.416015625, - "learning_rate": 0.00013726831266817278, - "loss": 0.877, + "grad_norm": 0.353515625, + "learning_rate": 6.225112293720836e-05, + "loss": 0.7705, "step": 5340 }, { "epoch": 6.611008039579469, - "grad_norm": 0.4765625, - "learning_rate": 0.00013713464425345174, - "loss": 0.8791, + "grad_norm": 0.330078125, + "learning_rate": 6.205118645474115e-05, + "loss": 0.7708, "step": 5345 }, { "epoch": 6.617192331478046, - "grad_norm": 0.546875, - "learning_rate": 0.00013700089883199966, - "loss": 0.8822, + "grad_norm": 0.345703125, + "learning_rate": 6.18514270361827e-05, + "loss": 0.7716, "step": 5350 }, { "epoch": 6.623376623376624, - "grad_norm": 0.421875, - "learning_rate": 0.0001368670766811665, - "loss": 0.8786, + "grad_norm": 0.34765625, + "learning_rate": 6.165184561358275e-05, + "loss": 0.7692, "step": 5355 }, { "epoch": 6.629560915275201, - "grad_norm": 0.44140625, - "learning_rate": 0.0001367331780784616, - "loss": 0.876, + "grad_norm": 0.35546875, + "learning_rate": 6.145244311816063e-05, + "loss": 0.768, "step": 5360 }, { "epoch": 6.635745207173779, - "grad_norm": 0.458984375, - "learning_rate": 0.0001365992033015527, - "loss": 0.875, + "grad_norm": 0.337890625, + "learning_rate": 6.125322048030072e-05, + "loss": 0.7651, "step": 5365 }, { "epoch": 6.641929499072356, - "grad_norm": 0.423828125, - "learning_rate": 0.00013646515262826552, - "loss": 0.8762, + "grad_norm": 0.357421875, + "learning_rate": 6.105417862954828e-05, + "loss": 0.7685, "step": 5370 }, { "epoch": 6.648113790970934, - "grad_norm": 0.494140625, - "learning_rate": 0.00013633102633658318, - "loss": 0.8862, + "grad_norm": 0.3515625, + "learning_rate": 6.0855318494605004e-05, + "loss": 0.7773, "step": 5375 }, { "epoch": 6.654298082869511, - "grad_norm": 0.4609375, - "learning_rate": 0.00013619682470464558, - "loss": 0.889, + "grad_norm": 0.33984375, + "learning_rate": 6.065664100332478e-05, + "loss": 0.7813, "step": 5380 }, { "epoch": 6.660482374768089, - "grad_norm": 0.59375, - "learning_rate": 0.00013606254801074895, - "loss": 0.883, + "grad_norm": 0.341796875, + "learning_rate": 6.045814708270925e-05, + "loss": 0.7759, "step": 5385 }, { "epoch": 6.666666666666667, - "grad_norm": 0.53515625, - "learning_rate": 0.00013592819653334505, - "loss": 0.8794, + "grad_norm": 0.3359375, + "learning_rate": 6.025983765890353e-05, + "loss": 0.7689, "step": 5390 }, { "epoch": 6.6728509585652445, - "grad_norm": 0.55859375, - "learning_rate": 0.0001357937705510408, - "loss": 0.8812, + "grad_norm": 0.34765625, + "learning_rate": 6.0061713657191976e-05, + "loss": 0.7735, "step": 5395 }, { "epoch": 6.679035250463822, - "grad_norm": 0.50390625, - "learning_rate": 0.0001356592703425976, - "loss": 0.8742, + "grad_norm": 0.345703125, + "learning_rate": 5.986377600199371e-05, + "loss": 0.7647, "step": 5400 }, { "epoch": 6.6852195423623995, - "grad_norm": 0.51953125, - "learning_rate": 0.00013552469618693076, - "loss": 0.8759, + "grad_norm": 0.34375, + "learning_rate": 5.9666025616858475e-05, + "loss": 0.7687, "step": 5405 }, { "epoch": 6.691403834260977, - "grad_norm": 0.408203125, - "learning_rate": 0.00013539004836310894, - "loss": 0.8747, + "grad_norm": 0.330078125, + "learning_rate": 5.946846342446214e-05, + "loss": 0.7685, "step": 5410 }, { "epoch": 6.6975881261595545, - "grad_norm": 0.400390625, - "learning_rate": 0.00013525532715035366, - "loss": 0.8803, + "grad_norm": 0.3359375, + "learning_rate": 5.927109034660251e-05, + "loss": 0.7727, "step": 5415 }, { "epoch": 6.703772418058133, - "grad_norm": 0.4140625, - "learning_rate": 0.0001351205328280385, - "loss": 0.8769, + "grad_norm": 0.33203125, + "learning_rate": 5.907390730419507e-05, + "loss": 0.7711, "step": 5420 }, { "epoch": 6.70995670995671, - "grad_norm": 0.45703125, - "learning_rate": 0.00013498566567568865, - "loss": 0.8749, + "grad_norm": 0.349609375, + "learning_rate": 5.887691521726859e-05, + "loss": 0.769, "step": 5425 }, { "epoch": 6.716141001855288, - "grad_norm": 0.435546875, - "learning_rate": 0.00013485072597298038, - "loss": 0.8708, + "grad_norm": 0.33984375, + "learning_rate": 5.868011500496084e-05, + "loss": 0.7663, "step": 5430 }, { "epoch": 6.722325293753865, - "grad_norm": 0.4375, - "learning_rate": 0.00013471571399974045, - "loss": 0.8736, + "grad_norm": 0.33984375, + "learning_rate": 5.848350758551437e-05, + "loss": 0.7712, "step": 5435 }, { "epoch": 6.728509585652443, - "grad_norm": 0.44140625, - "learning_rate": 0.00013458063003594543, - "loss": 0.8774, + "grad_norm": 0.333984375, + "learning_rate": 5.828709387627218e-05, + "loss": 0.7715, "step": 5440 }, { "epoch": 6.73469387755102, - "grad_norm": 0.58203125, - "learning_rate": 0.00013444547436172117, - "loss": 0.877, + "grad_norm": 0.345703125, + "learning_rate": 5.80908747936735e-05, + "loss": 0.77, "step": 5445 }, { "epoch": 6.740878169449598, - "grad_norm": 0.515625, - "learning_rate": 0.0001343102472573423, - "loss": 0.8998, + "grad_norm": 0.359375, + "learning_rate": 5.789485125324926e-05, + "loss": 0.786, "step": 5450 }, { "epoch": 6.747062461348175, - "grad_norm": 0.470703125, - "learning_rate": 0.00013417494900323142, - "loss": 0.8903, + "grad_norm": 0.337890625, + "learning_rate": 5.7699024169618256e-05, + "loss": 0.778, "step": 5455 }, { "epoch": 6.753246753246753, - "grad_norm": 0.5078125, - "learning_rate": 0.00013403957987995882, - "loss": 0.8768, + "grad_norm": 0.3359375, + "learning_rate": 5.750339445648252e-05, + "loss": 0.7707, "step": 5460 }, { "epoch": 6.759431045145331, - "grad_norm": 0.470703125, - "learning_rate": 0.00013390414016824174, - "loss": 0.8859, + "grad_norm": 0.3515625, + "learning_rate": 5.7307963026623146e-05, + "loss": 0.7772, "step": 5465 }, { "epoch": 6.765615337043909, - "grad_norm": 0.43359375, - "learning_rate": 0.00013376863014894375, - "loss": 0.8784, + "grad_norm": 0.33984375, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.7712, "step": 5470 }, { "epoch": 6.771799628942486, - "grad_norm": 0.47265625, - "learning_rate": 0.00013363305010307425, - "loss": 0.8793, + "grad_norm": 0.359375, + "learning_rate": 5.691769866322825e-05, + "loss": 0.772, "step": 5475 }, { "epoch": 6.777983920841064, - "grad_norm": 0.46875, - "learning_rate": 0.00013349740031178784, - "loss": 0.8688, + "grad_norm": 0.34375, + "learning_rate": 5.6722867550612116e-05, + "loss": 0.7651, "step": 5480 }, { "epoch": 6.784168212739641, - "grad_norm": 0.47265625, - "learning_rate": 0.0001333616810563837, - "loss": 0.8727, + "grad_norm": 0.345703125, + "learning_rate": 5.652823836310288e-05, + "loss": 0.7676, "step": 5485 }, { "epoch": 6.790352504638219, - "grad_norm": 0.439453125, - "learning_rate": 0.00013322589261830517, - "loss": 0.8718, + "grad_norm": 0.330078125, + "learning_rate": 5.633381200881335e-05, + "loss": 0.7677, "step": 5490 }, { "epoch": 6.796536796536796, - "grad_norm": 0.455078125, - "learning_rate": 0.000133090035279139, - "loss": 0.8866, + "grad_norm": 0.359375, + "learning_rate": 5.613958939490995e-05, + "loss": 0.7768, "step": 5495 }, { "epoch": 6.802721088435375, - "grad_norm": 0.52734375, - "learning_rate": 0.00013295410932061478, - "loss": 0.8834, + "grad_norm": 0.34375, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.7766, "step": 5500 }, { "epoch": 6.808905380333952, - "grad_norm": 0.443359375, - "learning_rate": 0.0001328181150246045, - "loss": 0.8827, + "grad_norm": 0.337890625, + "learning_rate": 5.575175901216999e-05, + "loss": 0.7739, "step": 5505 }, { "epoch": 6.81508967223253, - "grad_norm": 0.5234375, - "learning_rate": 0.00013268205267312174, - "loss": 0.8842, + "grad_norm": 0.345703125, + "learning_rate": 5.555815305289631e-05, + "loss": 0.7766, "step": 5510 }, { "epoch": 6.821273964131107, - "grad_norm": 0.462890625, - "learning_rate": 0.0001325459225483213, - "loss": 0.8813, + "grad_norm": 0.353515625, + "learning_rate": 5.536475445312606e-05, + "loss": 0.7739, "step": 5515 }, { "epoch": 6.827458256029685, - "grad_norm": 0.484375, - "learning_rate": 0.00013240972493249847, - "loss": 0.8872, + "grad_norm": 0.34765625, + "learning_rate": 5.5171564115230254e-05, + "loss": 0.7771, "step": 5520 }, { "epoch": 6.833642547928262, - "grad_norm": 0.443359375, - "learning_rate": 0.0001322734601080885, - "loss": 0.8884, + "grad_norm": 0.34375, + "learning_rate": 5.4978582940608356e-05, + "loss": 0.7792, "step": 5525 }, { "epoch": 6.83982683982684, - "grad_norm": 0.458984375, - "learning_rate": 0.00013213712835766607, - "loss": 0.8877, + "grad_norm": 0.37109375, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.7781, "step": 5530 }, { "epoch": 6.846011131725417, - "grad_norm": 0.48828125, - "learning_rate": 0.0001320007299639446, - "loss": 0.8777, + "grad_norm": 0.365234375, + "learning_rate": 5.459325168189977e-05, + "loss": 0.772, "step": 5535 }, { "epoch": 6.852195423623995, - "grad_norm": 0.427734375, - "learning_rate": 0.0001318642652097757, - "loss": 0.8732, + "grad_norm": 0.33203125, + "learning_rate": 5.4400903395715366e-05, + "loss": 0.7666, "step": 5540 }, { "epoch": 6.858379715522573, - "grad_norm": 0.41015625, - "learning_rate": 0.00013172773437814865, - "loss": 0.8859, + "grad_norm": 0.345703125, + "learning_rate": 5.4208767868600986e-05, + "loss": 0.7778, "step": 5545 }, { "epoch": 6.8645640074211505, - "grad_norm": 0.46875, - "learning_rate": 0.00013159113775218964, - "loss": 0.8738, + "grad_norm": 0.337890625, + "learning_rate": 5.401684599703445e-05, + "loss": 0.7673, "step": 5550 }, { "epoch": 6.870748299319728, - "grad_norm": 0.447265625, - "learning_rate": 0.00013145447561516138, - "loss": 0.8849, + "grad_norm": 0.34765625, + "learning_rate": 5.382513867649663e-05, + "loss": 0.7766, "step": 5555 }, { "epoch": 6.8769325912183055, - "grad_norm": 0.462890625, - "learning_rate": 0.00013131774825046245, - "loss": 0.8766, + "grad_norm": 0.349609375, + "learning_rate": 5.363364680146725e-05, + "loss": 0.771, "step": 5560 }, { "epoch": 6.883116883116883, - "grad_norm": 0.447265625, - "learning_rate": 0.0001311809559416267, - "loss": 0.881, + "grad_norm": 0.349609375, + "learning_rate": 5.3442371265420995e-05, + "loss": 0.7721, "step": 5565 }, { "epoch": 6.8893011750154605, - "grad_norm": 0.51953125, - "learning_rate": 0.00013104409897232258, - "loss": 0.8833, + "grad_norm": 0.34375, + "learning_rate": 5.325131296082298e-05, + "loss": 0.7745, "step": 5570 }, { "epoch": 6.895485466914038, - "grad_norm": 0.44921875, - "learning_rate": 0.00013090717762635266, - "loss": 0.8787, + "grad_norm": 0.33984375, + "learning_rate": 5.306047277912479e-05, + "loss": 0.7693, "step": 5575 }, { "epoch": 6.901669758812616, - "grad_norm": 0.443359375, - "learning_rate": 0.00013077019218765305, - "loss": 0.8843, + "grad_norm": 0.35546875, + "learning_rate": 5.286985161076029e-05, + "loss": 0.7751, "step": 5580 }, { "epoch": 6.907854050711194, - "grad_norm": 0.4765625, - "learning_rate": 0.0001306331429402927, - "loss": 0.8901, + "grad_norm": 0.345703125, + "learning_rate": 5.2679450345141416e-05, + "loss": 0.7807, "step": 5585 }, { "epoch": 6.914038342609771, - "grad_norm": 0.55078125, - "learning_rate": 0.00013049603016847296, - "loss": 0.8918, + "grad_norm": 0.33984375, + "learning_rate": 5.248926987065417e-05, + "loss": 0.7848, "step": 5590 }, { "epoch": 6.920222634508349, - "grad_norm": 0.50390625, - "learning_rate": 0.00013035885415652685, - "loss": 0.8775, + "grad_norm": 0.34375, + "learning_rate": 5.2299311074654265e-05, + "loss": 0.7677, "step": 5595 }, { "epoch": 6.926406926406926, - "grad_norm": 0.57421875, - "learning_rate": 0.00013022161518891855, - "loss": 0.8947, + "grad_norm": 0.3515625, + "learning_rate": 5.210957484346314e-05, + "loss": 0.7877, "step": 5600 }, { "epoch": 6.932591218305504, - "grad_norm": 0.470703125, - "learning_rate": 0.00013008431355024283, - "loss": 0.8815, + "grad_norm": 0.373046875, + "learning_rate": 5.192006206236382e-05, + "loss": 0.7775, "step": 5605 }, { "epoch": 6.938775510204081, - "grad_norm": 0.49609375, - "learning_rate": 0.00012994694952522435, - "loss": 0.8678, + "grad_norm": 0.341796875, + "learning_rate": 5.173077361559665e-05, + "loss": 0.7645, "step": 5610 }, { "epoch": 6.944959802102659, - "grad_norm": 0.478515625, - "learning_rate": 0.00012980952339871718, - "loss": 0.8793, + "grad_norm": 0.34765625, + "learning_rate": 5.154171038635534e-05, + "loss": 0.7685, "step": 5615 }, { "epoch": 6.951144094001237, - "grad_norm": 0.484375, - "learning_rate": 0.00012967203545570418, - "loss": 0.8782, + "grad_norm": 0.349609375, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7718, "step": 5620 }, { "epoch": 6.957328385899815, - "grad_norm": 0.419921875, - "learning_rate": 0.00012953448598129643, - "loss": 0.8754, + "grad_norm": 0.34765625, + "learning_rate": 5.116426310796663e-05, + "loss": 0.7705, "step": 5625 }, { "epoch": 6.963512677798392, - "grad_norm": 0.5078125, - "learning_rate": 0.0001293968752607325, - "loss": 0.8751, + "grad_norm": 0.33984375, + "learning_rate": 5.0975880819936004e-05, + "loss": 0.7672, "step": 5630 }, { "epoch": 6.96969696969697, - "grad_norm": 0.47265625, - "learning_rate": 0.00012925920357937808, - "loss": 0.8816, + "grad_norm": 0.341796875, + "learning_rate": 5.078772727165646e-05, + "loss": 0.7737, "step": 5635 }, { "epoch": 6.975881261595547, - "grad_norm": 0.46875, - "learning_rate": 0.00012912147122272523, - "loss": 0.8865, + "grad_norm": 0.34765625, + "learning_rate": 5.059980334102637e-05, + "loss": 0.7793, "step": 5640 }, { "epoch": 6.982065553494125, - "grad_norm": 0.45703125, - "learning_rate": 0.0001289836784763918, - "loss": 0.8764, + "grad_norm": 0.337890625, + "learning_rate": 5.041210990487286e-05, + "loss": 0.7691, "step": 5645 }, { "epoch": 6.988249845392702, - "grad_norm": 0.54296875, - "learning_rate": 0.00012884582562612095, - "loss": 0.8862, + "grad_norm": 0.36328125, + "learning_rate": 5.022464783894744e-05, + "loss": 0.7773, "step": 5650 }, { "epoch": 6.994434137291281, - "grad_norm": 0.609375, - "learning_rate": 0.0001287079129577804, - "loss": 0.8795, + "grad_norm": 0.34375, + "learning_rate": 5.0037418017922125e-05, + "loss": 0.7732, "step": 5655 }, { "epoch": 6.999381570810142, - "eval_loss": 2.5032334327697754, - "eval_runtime": 0.6588, - "eval_samples_per_second": 15.178, - "eval_steps_per_second": 1.518, + "eval_loss": 2.678618907928467, + "eval_runtime": 0.8072, + "eval_samples_per_second": 12.388, + "eval_steps_per_second": 1.239, "step": 5659 }, { "epoch": 7.000618429189858, - "grad_norm": 0.51953125, - "learning_rate": 0.00012856994075736197, - "loss": 0.8769, + "grad_norm": 0.328125, + "learning_rate": 4.985042131538545e-05, + "loss": 0.7716, "step": 5660 }, { "epoch": 7.006802721088436, - "grad_norm": 0.482421875, - "learning_rate": 0.00012843190931098093, - "loss": 0.8531, + "grad_norm": 0.341796875, + "learning_rate": 4.966365860383798e-05, + "loss": 0.7436, "step": 5665 }, { "epoch": 7.012987012987013, - "grad_norm": 0.439453125, - "learning_rate": 0.00012829381890487536, - "loss": 0.8766, + "grad_norm": 0.349609375, + "learning_rate": 4.9477130754688775e-05, + "loss": 0.7644, "step": 5670 }, { "epoch": 7.019171304885591, - "grad_norm": 0.5078125, - "learning_rate": 0.00012815566982540567, - "loss": 0.8794, + "grad_norm": 0.34375, + "learning_rate": 4.92908386382509e-05, + "loss": 0.7701, "step": 5675 }, { "epoch": 7.025355596784168, - "grad_norm": 0.455078125, - "learning_rate": 0.00012801746235905384, - "loss": 0.8736, + "grad_norm": 0.341796875, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.7624, "step": 5680 }, { "epoch": 7.031539888682746, - "grad_norm": 0.48828125, - "learning_rate": 0.00012787919679242306, - "loss": 0.8789, + "grad_norm": 0.33984375, + "learning_rate": 4.891896507925808e-05, + "loss": 0.7688, "step": 5685 }, { "epoch": 7.037724180581323, - "grad_norm": 0.51953125, - "learning_rate": 0.00012774087341223695, - "loss": 0.8716, + "grad_norm": 0.353515625, + "learning_rate": 4.873338537181368e-05, + "loss": 0.7593, "step": 5690 }, { "epoch": 7.0439084724799015, - "grad_norm": 0.482421875, - "learning_rate": 0.000127602492505339, - "loss": 0.8688, + "grad_norm": 0.359375, + "learning_rate": 4.854804486729355e-05, + "loss": 0.7599, "step": 5695 }, { "epoch": 7.050092764378479, - "grad_norm": 0.51953125, - "learning_rate": 0.00012746405435869198, - "loss": 0.8736, + "grad_norm": 0.34765625, + "learning_rate": 4.836294443047088e-05, + "loss": 0.7649, "step": 5700 }, { "epoch": 7.0562770562770565, - "grad_norm": 0.455078125, - "learning_rate": 0.0001273255592593774, - "loss": 0.8623, + "grad_norm": 0.345703125, + "learning_rate": 4.817808492499866e-05, + "loss": 0.7533, "step": 5705 }, { "epoch": 7.062461348175634, - "grad_norm": 0.5078125, - "learning_rate": 0.00012718700749459486, - "loss": 0.8647, + "grad_norm": 0.34375, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.7545, "step": 5710 }, { "epoch": 7.0686456400742115, - "grad_norm": 0.59375, - "learning_rate": 0.00012704839935166143, - "loss": 0.8648, + "grad_norm": 0.345703125, + "learning_rate": 4.780909215709273e-05, + "loss": 0.7542, "step": 5715 }, { "epoch": 7.074829931972789, - "grad_norm": 0.45703125, - "learning_rate": 0.0001269097351180112, - "loss": 0.8635, + "grad_norm": 0.345703125, + "learning_rate": 4.762496061632814e-05, + "loss": 0.7556, "step": 5720 }, { "epoch": 7.0810142238713665, - "grad_norm": 0.474609375, - "learning_rate": 0.00012677101508119445, - "loss": 0.8724, + "grad_norm": 0.333984375, + "learning_rate": 4.744107345024432e-05, + "loss": 0.7599, "step": 5725 }, { "epoch": 7.087198515769944, - "grad_norm": 0.54296875, - "learning_rate": 0.00012663223952887723, - "loss": 0.8703, + "grad_norm": 0.345703125, + "learning_rate": 4.725743151683325e-05, + "loss": 0.7591, "step": 5730 }, { "epoch": 7.093382807668522, - "grad_norm": 0.43359375, - "learning_rate": 0.00012649340874884075, - "loss": 0.8688, + "grad_norm": 0.337890625, + "learning_rate": 4.707403567294275e-05, + "loss": 0.7558, "step": 5735 }, { "epoch": 7.0995670995671, - "grad_norm": 0.5234375, - "learning_rate": 0.0001263545230289807, - "loss": 0.8691, + "grad_norm": 0.345703125, + "learning_rate": 4.689088677427249e-05, + "loss": 0.7574, "step": 5740 }, { "epoch": 7.105751391465677, - "grad_norm": 0.515625, - "learning_rate": 0.0001262155826573067, - "loss": 0.8708, + "grad_norm": 0.341796875, + "learning_rate": 4.670798567536986e-05, + "loss": 0.7602, "step": 5745 }, { "epoch": 7.111935683364255, - "grad_norm": 0.4453125, - "learning_rate": 0.00012607658792194174, - "loss": 0.872, + "grad_norm": 0.33984375, + "learning_rate": 4.652533322962597e-05, + "loss": 0.7612, "step": 5750 }, { "epoch": 7.118119975262832, - "grad_norm": 0.427734375, - "learning_rate": 0.0001259375391111215, - "loss": 0.8774, + "grad_norm": 0.357421875, + "learning_rate": 4.6342930289271925e-05, + "loss": 0.7663, "step": 5755 }, { "epoch": 7.12430426716141, - "grad_norm": 0.4609375, - "learning_rate": 0.0001257984365131938, - "loss": 0.8759, + "grad_norm": 0.361328125, + "learning_rate": 4.6160777705374524e-05, + "loss": 0.7658, "step": 5760 }, { "epoch": 7.130488559059987, - "grad_norm": 0.466796875, - "learning_rate": 0.0001256592804166181, - "loss": 0.8642, + "grad_norm": 0.349609375, + "learning_rate": 4.597887632783258e-05, + "loss": 0.7526, "step": 5765 }, { "epoch": 7.136672850958565, - "grad_norm": 0.44140625, - "learning_rate": 0.00012552007110996463, - "loss": 0.8667, + "grad_norm": 0.34765625, + "learning_rate": 4.579722700537268e-05, + "loss": 0.7562, "step": 5770 }, { "epoch": 7.142857142857143, - "grad_norm": 0.48828125, - "learning_rate": 0.00012538080888191408, - "loss": 0.8672, + "grad_norm": 0.33984375, + "learning_rate": 4.561583058554537e-05, + "loss": 0.7587, "step": 5775 }, { "epoch": 7.149041434755721, - "grad_norm": 0.4296875, - "learning_rate": 0.00012524149402125685, - "loss": 0.8582, + "grad_norm": 0.357421875, + "learning_rate": 4.543468791472131e-05, + "loss": 0.7509, "step": 5780 }, { "epoch": 7.155225726654298, - "grad_norm": 0.515625, - "learning_rate": 0.0001251021268168925, - "loss": 0.8789, + "grad_norm": 0.341796875, + "learning_rate": 4.525379983808706e-05, + "loss": 0.7658, "step": 5785 }, { "epoch": 7.161410018552876, - "grad_norm": 0.51953125, - "learning_rate": 0.00012496270755782914, - "loss": 0.8787, + "grad_norm": 0.341796875, + "learning_rate": 4.5073167199641367e-05, + "loss": 0.766, "step": 5790 }, { "epoch": 7.167594310451453, - "grad_norm": 0.53125, - "learning_rate": 0.00012482323653318278, - "loss": 0.8723, + "grad_norm": 0.345703125, + "learning_rate": 4.489279084219108e-05, + "loss": 0.7609, "step": 5795 }, { "epoch": 7.173778602350031, - "grad_norm": 0.46484375, - "learning_rate": 0.00012468371403217684, - "loss": 0.8649, + "grad_norm": 0.349609375, + "learning_rate": 4.471267160734731e-05, + "loss": 0.7531, "step": 5800 }, { "epoch": 7.179962894248608, - "grad_norm": 0.64453125, - "learning_rate": 0.00012454414034414142, - "loss": 0.877, + "grad_norm": 0.33984375, + "learning_rate": 4.453281033552142e-05, + "loss": 0.7633, "step": 5805 }, { "epoch": 7.186147186147186, - "grad_norm": 0.466796875, - "learning_rate": 0.00012440451575851285, - "loss": 0.8851, + "grad_norm": 0.34375, + "learning_rate": 4.43532078659213e-05, + "loss": 0.7722, "step": 5810 }, { "epoch": 7.192331478045764, - "grad_norm": 0.4375, - "learning_rate": 0.00012426484056483292, - "loss": 0.8703, + "grad_norm": 0.33984375, + "learning_rate": 4.4173865036547105e-05, + "loss": 0.7576, "step": 5815 }, { "epoch": 7.198515769944342, - "grad_norm": 0.486328125, - "learning_rate": 0.00012412511505274844, - "loss": 0.8664, + "grad_norm": 0.3515625, + "learning_rate": 4.399478268418771e-05, + "loss": 0.7573, "step": 5820 }, { "epoch": 7.204700061842919, - "grad_norm": 0.458984375, - "learning_rate": 0.00012398533951201053, - "loss": 0.8617, + "grad_norm": 0.34375, + "learning_rate": 4.3815961644416536e-05, + "loss": 0.7523, "step": 5825 }, { "epoch": 7.210884353741497, - "grad_norm": 0.427734375, - "learning_rate": 0.00012384551423247407, - "loss": 0.8691, + "grad_norm": 0.353515625, + "learning_rate": 4.36374027515878e-05, + "loss": 0.7605, "step": 5830 }, { "epoch": 7.217068645640074, - "grad_norm": 0.48828125, - "learning_rate": 0.00012370563950409703, - "loss": 0.8808, + "grad_norm": 0.33984375, + "learning_rate": 4.3459106838832566e-05, + "loss": 0.7668, "step": 5835 }, { "epoch": 7.223252937538652, - "grad_norm": 0.494140625, - "learning_rate": 0.00012356571561693996, - "loss": 0.8674, + "grad_norm": 0.34375, + "learning_rate": 4.328107473805487e-05, + "loss": 0.7547, "step": 5840 }, { "epoch": 7.229437229437229, - "grad_norm": 0.435546875, - "learning_rate": 0.00012342574286116544, - "loss": 0.8644, + "grad_norm": 0.34765625, + "learning_rate": 4.3103307279927804e-05, + "loss": 0.7549, "step": 5845 }, { "epoch": 7.235621521335807, - "grad_norm": 0.49609375, - "learning_rate": 0.00012328572152703725, - "loss": 0.8745, + "grad_norm": 0.33203125, + "learning_rate": 4.2925805293889786e-05, + "loss": 0.7616, "step": 5850 }, { "epoch": 7.241805813234385, - "grad_norm": 0.4765625, - "learning_rate": 0.00012314565190491998, - "loss": 0.8642, + "grad_norm": 0.33984375, + "learning_rate": 4.274856960814045e-05, + "loss": 0.7548, "step": 5855 }, { "epoch": 7.2479901051329625, - "grad_norm": 0.51171875, - "learning_rate": 0.00012300553428527832, - "loss": 0.8785, + "grad_norm": 0.341796875, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7668, "step": 5860 }, { "epoch": 7.25417439703154, - "grad_norm": 0.44140625, - "learning_rate": 0.00012286536895867654, - "loss": 0.8719, + "grad_norm": 0.361328125, + "learning_rate": 4.2394900444090134e-05, + "loss": 0.7602, "step": 5865 }, { "epoch": 7.2603586889301175, - "grad_norm": 0.427734375, - "learning_rate": 0.00012272515621577782, - "loss": 0.8651, + "grad_norm": 0.341796875, + "learning_rate": 4.2218468615960484e-05, + "loss": 0.7551, "step": 5870 }, { "epoch": 7.266542980828695, - "grad_norm": 0.48046875, - "learning_rate": 0.00012258489634734367, - "loss": 0.87, + "grad_norm": 0.3515625, + "learning_rate": 4.204230638845458e-05, + "loss": 0.7587, "step": 5875 }, { "epoch": 7.2727272727272725, - "grad_norm": 0.462890625, - "learning_rate": 0.00012244458964423327, - "loss": 0.8693, + "grad_norm": 0.345703125, + "learning_rate": 4.1866414583520877e-05, + "loss": 0.7574, "step": 5880 }, { "epoch": 7.27891156462585, - "grad_norm": 0.46484375, - "learning_rate": 0.000122304236397403, - "loss": 0.8658, + "grad_norm": 0.3359375, + "learning_rate": 4.169079402184618e-05, + "loss": 0.7554, "step": 5885 }, { "epoch": 7.285095856524428, - "grad_norm": 0.458984375, - "learning_rate": 0.00012216383689790574, - "loss": 0.8749, + "grad_norm": 0.359375, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.7646, "step": 5890 }, { "epoch": 7.291280148423006, - "grad_norm": 0.408203125, - "learning_rate": 0.00012202339143689023, - "loss": 0.8737, + "grad_norm": 0.35546875, + "learning_rate": 4.134036990468946e-05, + "loss": 0.7644, "step": 5895 }, { "epoch": 7.297464440321583, - "grad_norm": 0.484375, - "learning_rate": 0.00012188290030560063, - "loss": 0.871, + "grad_norm": 0.3359375, + "learning_rate": 4.1165567984237764e-05, + "loss": 0.7581, "step": 5900 }, { "epoch": 7.303648732220161, - "grad_norm": 0.44921875, - "learning_rate": 0.00012174236379537572, - "loss": 0.8725, + "grad_norm": 0.337890625, + "learning_rate": 4.0991040577098316e-05, + "loss": 0.7615, "step": 5905 }, { "epoch": 7.309833024118738, - "grad_norm": 0.451171875, - "learning_rate": 0.00012160178219764837, - "loss": 0.8718, + "grad_norm": 0.359375, + "learning_rate": 4.081678849759181e-05, + "loss": 0.759, "step": 5910 }, { "epoch": 7.316017316017316, - "grad_norm": 0.44921875, - "learning_rate": 0.00012146115580394499, - "loss": 0.8763, + "grad_norm": 0.359375, + "learning_rate": 4.064281255875429e-05, + "loss": 0.7666, "step": 5915 }, { "epoch": 7.322201607915893, - "grad_norm": 0.455078125, - "learning_rate": 0.00012132048490588492, - "loss": 0.8658, + "grad_norm": 0.34765625, + "learning_rate": 4.046911357233343e-05, + "loss": 0.7573, "step": 5920 }, { "epoch": 7.328385899814471, - "grad_norm": 0.466796875, - "learning_rate": 0.00012117976979517973, - "loss": 0.8633, + "grad_norm": 0.34765625, + "learning_rate": 4.0295692348784586e-05, + "loss": 0.7538, "step": 5925 }, { "epoch": 7.334570191713049, - "grad_norm": 0.453125, - "learning_rate": 0.00012103901076363269, - "loss": 0.8827, + "grad_norm": 0.3515625, + "learning_rate": 4.0122549697267244e-05, + "loss": 0.7705, "step": 5930 }, { "epoch": 7.340754483611627, - "grad_norm": 0.4765625, - "learning_rate": 0.00012089820810313812, - "loss": 0.8769, + "grad_norm": 0.34375, + "learning_rate": 3.994968642564101e-05, + "loss": 0.7636, "step": 5935 }, { "epoch": 7.346938775510204, - "grad_norm": 0.52734375, - "learning_rate": 0.0001207573621056809, - "loss": 0.8767, + "grad_norm": 0.34375, + "learning_rate": 3.977710334046193e-05, + "loss": 0.7644, "step": 5940 }, { "epoch": 7.353123067408782, - "grad_norm": 0.451171875, - "learning_rate": 0.00012061647306333568, - "loss": 0.8909, + "grad_norm": 0.35546875, + "learning_rate": 3.960480124697885e-05, + "loss": 0.7771, "step": 5945 }, { "epoch": 7.359307359307359, - "grad_norm": 0.4921875, - "learning_rate": 0.00012047554126826643, - "loss": 0.8697, + "grad_norm": 0.34375, + "learning_rate": 3.943278094912946e-05, + "loss": 0.7582, "step": 5950 }, { "epoch": 7.365491651205937, - "grad_norm": 0.4296875, - "learning_rate": 0.00012033456701272576, - "loss": 0.8715, + "grad_norm": 0.3515625, + "learning_rate": 3.926104324953658e-05, + "loss": 0.7585, "step": 5955 }, { "epoch": 7.371675943104514, - "grad_norm": 0.4609375, - "learning_rate": 0.00012019355058905435, - "loss": 0.8731, + "grad_norm": 0.34375, + "learning_rate": 3.9089588949504655e-05, + "loss": 0.7614, "step": 5960 }, { "epoch": 7.377860235003092, - "grad_norm": 0.447265625, - "learning_rate": 0.00012005249228968032, - "loss": 0.8818, + "grad_norm": 0.345703125, + "learning_rate": 3.891841884901557e-05, + "loss": 0.7694, "step": 5965 }, { "epoch": 7.38404452690167, - "grad_norm": 0.466796875, - "learning_rate": 0.00011991139240711857, - "loss": 0.8656, + "grad_norm": 0.33984375, + "learning_rate": 3.874753374672542e-05, + "loss": 0.7549, "step": 5970 }, { "epoch": 7.390228818800248, - "grad_norm": 0.51171875, - "learning_rate": 0.00011977025123397033, - "loss": 0.8862, + "grad_norm": 0.345703125, + "learning_rate": 3.857693443996038e-05, + "loss": 0.7721, "step": 5975 }, { "epoch": 7.396413110698825, - "grad_norm": 0.49609375, - "learning_rate": 0.00011962906906292238, - "loss": 0.8726, + "grad_norm": 0.3359375, + "learning_rate": 3.840662172471315e-05, + "loss": 0.762, "step": 5980 }, { "epoch": 7.402597402597403, - "grad_norm": 0.4765625, - "learning_rate": 0.00011948784618674653, - "loss": 0.8633, + "grad_norm": 0.3359375, + "learning_rate": 3.8236596395639354e-05, + "loss": 0.7569, "step": 5985 }, { "epoch": 7.40878169449598, - "grad_norm": 0.45703125, - "learning_rate": 0.00011934658289829902, - "loss": 0.8705, + "grad_norm": 0.349609375, + "learning_rate": 3.806685924605361e-05, + "loss": 0.7617, "step": 5990 }, { "epoch": 7.414965986394558, - "grad_norm": 0.451171875, - "learning_rate": 0.00011920527949051991, - "loss": 0.8736, + "grad_norm": 0.33984375, + "learning_rate": 3.7897411067925894e-05, + "loss": 0.7598, "step": 5995 }, { "epoch": 7.421150278293135, - "grad_norm": 0.462890625, - "learning_rate": 0.00011906393625643244, - "loss": 0.8721, + "grad_norm": 0.34765625, + "learning_rate": 3.772825265187802e-05, + "loss": 0.7624, "step": 6000 }, { "epoch": 7.427334570191713, - "grad_norm": 0.44921875, - "learning_rate": 0.00011892255348914239, - "loss": 0.8733, + "grad_norm": 0.353515625, + "learning_rate": 3.755938478717968e-05, + "loss": 0.7614, "step": 6005 }, { "epoch": 7.433518862090291, - "grad_norm": 0.435546875, - "learning_rate": 0.00011878113148183758, - "loss": 0.8753, + "grad_norm": 0.3515625, + "learning_rate": 3.739080826174498e-05, + "loss": 0.7665, "step": 6010 }, { "epoch": 7.4397031539888685, - "grad_norm": 0.4296875, - "learning_rate": 0.00011863967052778721, - "loss": 0.8805, + "grad_norm": 0.345703125, + "learning_rate": 3.722252386212862e-05, + "loss": 0.7694, "step": 6015 }, { "epoch": 7.445887445887446, - "grad_norm": 0.41796875, - "learning_rate": 0.00011849817092034118, - "loss": 0.881, + "grad_norm": 0.33984375, + "learning_rate": 3.705453237352227e-05, + "loss": 0.7687, "step": 6020 }, { "epoch": 7.4520717377860235, - "grad_norm": 0.44140625, - "learning_rate": 0.00011835663295292963, - "loss": 0.8815, + "grad_norm": 0.341796875, + "learning_rate": 3.688683457975103e-05, + "loss": 0.7684, "step": 6025 }, { "epoch": 7.458256029684601, - "grad_norm": 0.47265625, - "learning_rate": 0.00011821505691906216, - "loss": 0.8684, + "grad_norm": 0.3359375, + "learning_rate": 3.6719431263269533e-05, + "loss": 0.7564, "step": 6030 }, { "epoch": 7.4644403215831785, - "grad_norm": 0.4453125, - "learning_rate": 0.00011807344311232738, - "loss": 0.88, + "grad_norm": 0.3515625, + "learning_rate": 3.655232320515844e-05, + "loss": 0.7689, "step": 6035 }, { "epoch": 7.470624613481756, - "grad_norm": 0.439453125, - "learning_rate": 0.00011793179182639218, - "loss": 0.8808, + "grad_norm": 0.341796875, + "learning_rate": 3.638551118512089e-05, + "loss": 0.7683, "step": 6040 }, { "epoch": 7.476808905380334, - "grad_norm": 0.484375, - "learning_rate": 0.0001177901033550012, - "loss": 0.8732, + "grad_norm": 0.359375, + "learning_rate": 3.621899598147863e-05, + "loss": 0.7605, "step": 6045 }, { "epoch": 7.482993197278912, - "grad_norm": 0.408203125, - "learning_rate": 0.00011764837799197622, - "loss": 0.8767, + "grad_norm": 0.33984375, + "learning_rate": 3.605277837116854e-05, + "loss": 0.7638, "step": 6050 }, { "epoch": 7.489177489177489, - "grad_norm": 0.482421875, - "learning_rate": 0.00011750661603121544, - "loss": 0.8694, + "grad_norm": 0.34375, + "learning_rate": 3.588685912973896e-05, + "loss": 0.7569, "step": 6055 }, { "epoch": 7.495361781076067, - "grad_norm": 0.4453125, - "learning_rate": 0.00011736481776669306, - "loss": 0.8702, + "grad_norm": 0.34765625, + "learning_rate": 3.5721239031346066e-05, + "loss": 0.7588, "step": 6060 }, { "epoch": 7.501546072974644, - "grad_norm": 0.451171875, - "learning_rate": 0.00011722298349245844, - "loss": 0.8751, + "grad_norm": 0.357421875, + "learning_rate": 3.555591884875038e-05, + "loss": 0.7602, "step": 6065 }, { "epoch": 7.507730364873222, - "grad_norm": 0.43359375, - "learning_rate": 0.0001170811135026357, - "loss": 0.8758, + "grad_norm": 0.34765625, + "learning_rate": 3.539089935331294e-05, + "loss": 0.7616, "step": 6070 }, { "epoch": 7.513914656771799, - "grad_norm": 0.427734375, - "learning_rate": 0.00011693920809142305, - "loss": 0.8732, + "grad_norm": 0.345703125, + "learning_rate": 3.52261813149918e-05, + "loss": 0.764, "step": 6075 }, { "epoch": 7.520098948670377, - "grad_norm": 0.4453125, - "learning_rate": 0.00011679726755309205, - "loss": 0.8649, + "grad_norm": 0.333984375, + "learning_rate": 3.506176550233863e-05, + "loss": 0.7588, "step": 6080 }, { "epoch": 7.526283240568954, - "grad_norm": 0.4453125, - "learning_rate": 0.00011665529218198721, - "loss": 0.8688, + "grad_norm": 0.3515625, + "learning_rate": 3.4897652682494776e-05, + "loss": 0.7579, "step": 6085 }, { "epoch": 7.532467532467533, - "grad_norm": 0.455078125, - "learning_rate": 0.00011651328227252517, - "loss": 0.8727, + "grad_norm": 0.3515625, + "learning_rate": 3.473384362118794e-05, + "loss": 0.7637, "step": 6090 }, { "epoch": 7.53865182436611, - "grad_norm": 0.48828125, - "learning_rate": 0.0001163712381191943, - "loss": 0.8796, + "grad_norm": 0.345703125, + "learning_rate": 3.457033908272852e-05, + "loss": 0.7657, "step": 6095 }, { "epoch": 7.544836116264688, - "grad_norm": 0.44140625, - "learning_rate": 0.00011622916001655388, - "loss": 0.8723, + "grad_norm": 0.34765625, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7603, "step": 6100 }, { "epoch": 7.551020408163265, - "grad_norm": 0.421875, - "learning_rate": 0.00011608704825923369, - "loss": 0.874, + "grad_norm": 0.341796875, + "learning_rate": 3.424424662448559e-05, + "loss": 0.7621, "step": 6105 }, { "epoch": 7.557204700061843, - "grad_norm": 0.44140625, - "learning_rate": 0.00011594490314193323, - "loss": 0.8792, + "grad_norm": 0.349609375, + "learning_rate": 3.4081660226204357e-05, + "loss": 0.7672, "step": 6110 }, { "epoch": 7.56338899196042, - "grad_norm": 0.443359375, - "learning_rate": 0.00011580272495942119, - "loss": 0.8685, + "grad_norm": 0.3359375, + "learning_rate": 3.3919381393767925e-05, + "loss": 0.7577, "step": 6115 }, { "epoch": 7.569573283858999, - "grad_norm": 0.43359375, - "learning_rate": 0.00011566051400653486, - "loss": 0.8656, + "grad_norm": 0.337890625, + "learning_rate": 3.3757410884346894e-05, + "loss": 0.7561, "step": 6120 }, { "epoch": 7.575757575757576, - "grad_norm": 0.42578125, - "learning_rate": 0.00011551827057817945, - "loss": 0.8799, + "grad_norm": 0.34765625, + "learning_rate": 3.3595749453673206e-05, + "loss": 0.765, "step": 6125 }, { "epoch": 7.581941867656154, - "grad_norm": 0.439453125, - "learning_rate": 0.00011537599496932752, - "loss": 0.8796, + "grad_norm": 0.359375, + "learning_rate": 3.34343978560367e-05, + "loss": 0.7675, "step": 6130 }, { "epoch": 7.588126159554731, - "grad_norm": 0.515625, - "learning_rate": 0.00011523368747501839, - "loss": 0.8738, + "grad_norm": 0.353515625, + "learning_rate": 3.3273356844281613e-05, + "loss": 0.7637, "step": 6135 }, { "epoch": 7.594310451453309, - "grad_norm": 0.53125, - "learning_rate": 0.00011509134839035748, - "loss": 0.8787, + "grad_norm": 0.357421875, + "learning_rate": 3.3112627169802946e-05, + "loss": 0.767, "step": 6140 }, { "epoch": 7.600494743351886, - "grad_norm": 0.453125, - "learning_rate": 0.00011494897801051574, - "loss": 0.8755, + "grad_norm": 0.3671875, + "learning_rate": 3.295220958254317e-05, + "loss": 0.7645, "step": 6145 }, { "epoch": 7.606679035250464, - "grad_norm": 0.48828125, - "learning_rate": 0.00011480657663072896, - "loss": 0.878, + "grad_norm": 0.341796875, + "learning_rate": 3.2792104830988515e-05, + "loss": 0.7654, "step": 6150 }, { "epoch": 7.612863327149041, - "grad_norm": 0.58984375, - "learning_rate": 0.00011466414454629731, - "loss": 0.8772, + "grad_norm": 0.3515625, + "learning_rate": 3.2632313662165525e-05, + "loss": 0.7671, "step": 6155 }, { "epoch": 7.619047619047619, - "grad_norm": 0.45703125, - "learning_rate": 0.0001145216820525845, - "loss": 0.8697, + "grad_norm": 0.341796875, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.7625, "step": 6160 }, { "epoch": 7.625231910946197, - "grad_norm": 0.6171875, - "learning_rate": 0.00011437918944501749, - "loss": 0.8816, + "grad_norm": 0.34375, + "learning_rate": 3.231367505350199e-05, + "loss": 0.7662, "step": 6165 }, { "epoch": 7.6314162028447745, - "grad_norm": 0.470703125, - "learning_rate": 0.00011423666701908547, - "loss": 0.8607, + "grad_norm": 0.3515625, + "learning_rate": 3.2154829100385e-05, + "loss": 0.7527, "step": 6170 }, { "epoch": 7.637600494743352, - "grad_norm": 0.4375, - "learning_rate": 0.00011409411507033962, - "loss": 0.8783, + "grad_norm": 0.337890625, + "learning_rate": 3.1996299703440095e-05, + "loss": 0.7665, "step": 6175 }, { "epoch": 7.6437847866419295, - "grad_norm": 0.5234375, - "learning_rate": 0.00011395153389439233, - "loss": 0.8765, + "grad_norm": 0.33984375, + "learning_rate": 3.1838087602343344e-05, + "loss": 0.7641, "step": 6180 }, { "epoch": 7.649969078540507, - "grad_norm": 0.41796875, - "learning_rate": 0.00011380892378691646, - "loss": 0.8723, + "grad_norm": 0.333984375, + "learning_rate": 3.1680193535290626e-05, + "loss": 0.7638, "step": 6185 }, { "epoch": 7.6561533704390845, - "grad_norm": 0.4921875, - "learning_rate": 0.00011366628504364509, - "loss": 0.8783, + "grad_norm": 0.345703125, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.7636, "step": 6190 }, { "epoch": 7.662337662337662, - "grad_norm": 0.49609375, - "learning_rate": 0.00011352361796037047, - "loss": 0.8764, + "grad_norm": 0.337890625, + "learning_rate": 3.1365362448677146e-05, + "loss": 0.7629, "step": 6195 }, { "epoch": 7.66852195423624, - "grad_norm": 0.4609375, - "learning_rate": 0.00011338092283294377, - "loss": 0.8831, + "grad_norm": 0.345703125, + "learning_rate": 3.120842689807468e-05, + "loss": 0.7709, "step": 6200 }, { "epoch": 7.674706246134818, - "grad_norm": 0.447265625, - "learning_rate": 0.00011323819995727421, - "loss": 0.8728, + "grad_norm": 0.353515625, + "learning_rate": 3.105181231942584e-05, + "loss": 0.7639, "step": 6205 }, { "epoch": 7.680890538033395, - "grad_norm": 0.4609375, - "learning_rate": 0.00011309544962932862, - "loss": 0.8765, + "grad_norm": 0.3359375, + "learning_rate": 3.089551944347255e-05, + "loss": 0.7675, "step": 6210 }, { "epoch": 7.687074829931973, - "grad_norm": 0.4609375, - "learning_rate": 0.0001129526721451307, - "loss": 0.8725, + "grad_norm": 0.341796875, + "learning_rate": 3.0739548999455805e-05, + "loss": 0.7611, "step": 6215 }, { "epoch": 7.69325912183055, - "grad_norm": 0.48046875, - "learning_rate": 0.00011280986780076057, - "loss": 0.8677, + "grad_norm": 0.349609375, + "learning_rate": 3.058390171511196e-05, + "loss": 0.7589, "step": 6220 }, { "epoch": 7.699443413729128, - "grad_norm": 0.50390625, - "learning_rate": 0.00011266703689235394, - "loss": 0.8769, + "grad_norm": 0.353515625, + "learning_rate": 3.0428578316669798e-05, + "loss": 0.7629, "step": 6225 }, { "epoch": 7.705627705627705, - "grad_norm": 0.625, - "learning_rate": 0.00011252417971610163, - "loss": 0.8719, + "grad_norm": 0.34765625, + "learning_rate": 3.0273579528846762e-05, + "loss": 0.7581, "step": 6230 }, { "epoch": 7.711811997526283, - "grad_norm": 0.474609375, - "learning_rate": 0.00011238129656824898, - "loss": 0.8735, + "grad_norm": 0.34375, + "learning_rate": 3.0118906074845678e-05, + "loss": 0.761, "step": 6235 }, { "epoch": 7.71799628942486, - "grad_norm": 0.490234375, - "learning_rate": 0.00011223838774509514, - "loss": 0.8828, + "grad_norm": 0.34765625, + "learning_rate": 2.996455867635155e-05, + "loss": 0.7692, "step": 6240 }, { "epoch": 7.724180581323439, - "grad_norm": 0.546875, - "learning_rate": 0.00011209545354299251, - "loss": 0.8807, + "grad_norm": 0.345703125, + "learning_rate": 2.9810538053527914e-05, + "loss": 0.7696, "step": 6245 }, { "epoch": 7.730364873222016, - "grad_norm": 0.52734375, - "learning_rate": 0.00011195249425834615, - "loss": 0.8844, + "grad_norm": 0.3515625, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.7705, "step": 6250 }, { "epoch": 7.736549165120594, - "grad_norm": 0.47265625, - "learning_rate": 0.00011180951018761314, - "loss": 0.8783, + "grad_norm": 0.353515625, + "learning_rate": 2.9503480007919648e-05, + "loss": 0.7645, "step": 6255 }, { "epoch": 7.742733457019171, - "grad_norm": 0.45703125, - "learning_rate": 0.00011166650162730188, - "loss": 0.8608, + "grad_norm": 0.345703125, + "learning_rate": 2.9350444017825385e-05, + "loss": 0.7537, "step": 6260 }, { "epoch": 7.748917748917749, - "grad_norm": 0.451171875, - "learning_rate": 0.0001115234688739716, - "loss": 0.8747, + "grad_norm": 0.3515625, + "learning_rate": 2.919773766877556e-05, + "loss": 0.7634, "step": 6265 }, { "epoch": 7.755102040816326, - "grad_norm": 0.484375, - "learning_rate": 0.00011138041222423177, - "loss": 0.8773, + "grad_norm": 0.349609375, + "learning_rate": 2.9045361673276872e-05, + "loss": 0.7648, "step": 6270 }, { "epoch": 7.761286332714904, - "grad_norm": 0.47265625, - "learning_rate": 0.00011123733197474128, - "loss": 0.8672, + "grad_norm": 0.349609375, + "learning_rate": 2.8893316742294562e-05, + "loss": 0.7568, "step": 6275 }, { "epoch": 7.767470624613482, - "grad_norm": 0.474609375, - "learning_rate": 0.00011109422842220805, - "loss": 0.8731, + "grad_norm": 0.341796875, + "learning_rate": 2.874160358524931e-05, + "loss": 0.7604, "step": 6280 }, { "epoch": 7.77365491651206, - "grad_norm": 0.5078125, - "learning_rate": 0.00011095110186338835, - "loss": 0.8684, + "grad_norm": 0.345703125, + "learning_rate": 2.8590222910013655e-05, + "loss": 0.7556, "step": 6285 }, { "epoch": 7.779839208410637, - "grad_norm": 0.427734375, - "learning_rate": 0.00011080795259508608, - "loss": 0.8699, + "grad_norm": 0.35546875, + "learning_rate": 2.8439175422908824e-05, + "loss": 0.7591, "step": 6290 }, { "epoch": 7.786023500309215, - "grad_norm": 0.427734375, - "learning_rate": 0.00011066478091415223, - "loss": 0.8673, + "grad_norm": 0.33984375, + "learning_rate": 2.828846182870155e-05, + "loss": 0.7548, "step": 6295 }, { "epoch": 7.792207792207792, - "grad_norm": 0.462890625, - "learning_rate": 0.00011052158711748434, - "loss": 0.8707, + "grad_norm": 0.34375, + "learning_rate": 2.8138082830600554e-05, + "loss": 0.7589, "step": 6300 }, { "epoch": 7.79839208410637, - "grad_norm": 0.462890625, - "learning_rate": 0.00011037837150202576, - "loss": 0.8729, + "grad_norm": 0.337890625, + "learning_rate": 2.798803913025343e-05, + "loss": 0.7604, "step": 6305 }, { "epoch": 7.804576376004947, - "grad_norm": 0.4375, - "learning_rate": 0.00011023513436476511, - "loss": 0.8627, + "grad_norm": 0.337890625, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.7524, "step": 6310 }, { "epoch": 7.810760667903525, - "grad_norm": 0.4140625, - "learning_rate": 0.00011009187600273566, - "loss": 0.8787, + "grad_norm": 0.3359375, + "learning_rate": 2.76889604215855e-05, + "loss": 0.7664, "step": 6315 }, { "epoch": 7.816944959802103, - "grad_norm": 0.462890625, - "learning_rate": 0.00010994859671301462, - "loss": 0.8708, + "grad_norm": 0.349609375, + "learning_rate": 2.753992680872457e-05, + "loss": 0.7627, "step": 6320 }, { "epoch": 7.8231292517006805, - "grad_norm": 0.439453125, - "learning_rate": 0.00010980529679272266, - "loss": 0.8691, + "grad_norm": 0.3359375, + "learning_rate": 2.739123128453066e-05, + "loss": 0.7592, "step": 6325 }, { "epoch": 7.829313543599258, - "grad_norm": 0.515625, - "learning_rate": 0.0001096619765390232, - "loss": 0.8678, + "grad_norm": 0.357421875, + "learning_rate": 2.7242874542796482e-05, + "loss": 0.7581, "step": 6330 }, { "epoch": 7.8354978354978355, - "grad_norm": 0.48046875, - "learning_rate": 0.00010951863624912185, - "loss": 0.8747, + "grad_norm": 0.3359375, + "learning_rate": 2.7094857275734076e-05, + "loss": 0.7632, "step": 6335 }, { "epoch": 7.841682127396413, - "grad_norm": 0.455078125, - "learning_rate": 0.00010937527622026575, - "loss": 0.8727, + "grad_norm": 0.345703125, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.7633, "step": 6340 }, { "epoch": 7.8478664192949905, - "grad_norm": 0.470703125, - "learning_rate": 0.00010923189674974301, - "loss": 0.8742, + "grad_norm": 0.361328125, + "learning_rate": 2.6799843926549685e-05, + "loss": 0.7628, "step": 6345 }, { "epoch": 7.854050711193568, - "grad_norm": 0.421875, - "learning_rate": 0.00010908849813488203, - "loss": 0.8675, + "grad_norm": 0.3359375, + "learning_rate": 2.665284922091912e-05, + "loss": 0.7538, "step": 6350 }, { "epoch": 7.860235003092146, - "grad_norm": 0.466796875, - "learning_rate": 0.00010894508067305088, - "loss": 0.8758, + "grad_norm": 0.34375, + "learning_rate": 2.6506196742936717e-05, + "loss": 0.7672, "step": 6355 }, { "epoch": 7.866419294990724, - "grad_norm": 0.474609375, - "learning_rate": 0.00010880164466165674, - "loss": 0.8716, + "grad_norm": 0.33984375, + "learning_rate": 2.6359887176862718e-05, + "loss": 0.7593, "step": 6360 }, { "epoch": 7.872603586889301, - "grad_norm": 0.45703125, - "learning_rate": 0.00010865819039814526, - "loss": 0.8738, + "grad_norm": 0.349609375, + "learning_rate": 2.621392120535724e-05, + "loss": 0.7619, "step": 6365 }, { "epoch": 7.878787878787879, - "grad_norm": 0.47265625, - "learning_rate": 0.00010851471817999997, - "loss": 0.8819, + "grad_norm": 0.35546875, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.7687, "step": 6370 }, { "epoch": 7.884972170686456, - "grad_norm": 0.46484375, - "learning_rate": 0.00010837122830474158, - "loss": 0.8703, + "grad_norm": 0.34375, + "learning_rate": 2.5923022768673532e-05, + "loss": 0.7602, "step": 6375 }, { "epoch": 7.891156462585034, - "grad_norm": 0.439453125, - "learning_rate": 0.00010822772106992747, - "loss": 0.8778, + "grad_norm": 0.341796875, + "learning_rate": 2.577809166078716e-05, + "loss": 0.7664, "step": 6380 }, { "epoch": 7.897340754483611, - "grad_norm": 0.4609375, - "learning_rate": 0.00010808419677315093, - "loss": 0.8737, + "grad_norm": 0.34375, + "learning_rate": 2.5633506862046607e-05, + "loss": 0.7607, "step": 6385 }, { "epoch": 7.903525046382189, - "grad_norm": 0.4921875, - "learning_rate": 0.00010794065571204072, - "loss": 0.8775, + "grad_norm": 0.3515625, + "learning_rate": 2.548926904706459e-05, + "loss": 0.7651, "step": 6390 }, { "epoch": 7.909709338280766, - "grad_norm": 0.42578125, - "learning_rate": 0.00010779709818426033, - "loss": 0.8834, + "grad_norm": 0.349609375, + "learning_rate": 2.5345378888834714e-05, + "loss": 0.7706, "step": 6395 }, { "epoch": 7.915893630179345, - "grad_norm": 0.41796875, - "learning_rate": 0.0001076535244875074, - "loss": 0.8678, + "grad_norm": 0.333984375, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.757, "step": 6400 }, { "epoch": 7.922077922077922, - "grad_norm": 0.4375, - "learning_rate": 0.0001075099349195131, - "loss": 0.8648, + "grad_norm": 0.3515625, + "learning_rate": 2.5058644226492346e-05, + "loss": 0.7556, "step": 6405 }, { "epoch": 7.9282622139765, - "grad_norm": 0.51953125, - "learning_rate": 0.00010736632977804149, - "loss": 0.8707, + "grad_norm": 0.337890625, + "learning_rate": 2.4915801060244092e-05, + "loss": 0.7614, "step": 6410 }, { "epoch": 7.934446505875077, - "grad_norm": 0.41796875, - "learning_rate": 0.00010722270936088898, - "loss": 0.8705, + "grad_norm": 0.337890625, + "learning_rate": 2.4773308226470238e-05, + "loss": 0.7577, "step": 6415 }, { "epoch": 7.940630797773655, - "grad_norm": 0.53515625, - "learning_rate": 0.00010707907396588361, - "loss": 0.8754, + "grad_norm": 0.345703125, + "learning_rate": 2.4631166390022574e-05, + "loss": 0.7622, "step": 6420 }, { "epoch": 7.946815089672232, - "grad_norm": 0.51953125, - "learning_rate": 0.00010693542389088452, - "loss": 0.8723, + "grad_norm": 0.34765625, + "learning_rate": 2.4489376214115212e-05, + "loss": 0.7594, "step": 6425 }, { "epoch": 7.95299938157081, - "grad_norm": 0.4453125, - "learning_rate": 0.00010679175943378119, - "loss": 0.8746, + "grad_norm": 0.33984375, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.7637, "step": 6430 }, { "epoch": 7.959183673469388, - "grad_norm": 0.515625, - "learning_rate": 0.00010664808089249305, - "loss": 0.8709, + "grad_norm": 0.3515625, + "learning_rate": 2.4206853488570957e-05, + "loss": 0.7606, "step": 6435 }, { "epoch": 7.965367965367966, - "grad_norm": 0.66796875, - "learning_rate": 0.00010650438856496872, - "loss": 0.871, + "grad_norm": 0.34765625, + "learning_rate": 2.4066122257145894e-05, + "loss": 0.7591, "step": 6440 }, { "epoch": 7.971552257266543, - "grad_norm": 0.455078125, - "learning_rate": 0.00010636068274918536, - "loss": 0.8661, + "grad_norm": 0.330078125, + "learning_rate": 2.392574532267886e-05, + "loss": 0.7565, "step": 6445 }, { "epoch": 7.977736549165121, - "grad_norm": 0.462890625, - "learning_rate": 0.00010621696374314807, - "loss": 0.8711, + "grad_norm": 0.341796875, + "learning_rate": 2.3785723340149134e-05, + "loss": 0.7602, "step": 6450 }, { "epoch": 7.983920841063698, - "grad_norm": 0.51953125, - "learning_rate": 0.0001060732318448894, - "loss": 0.8759, + "grad_norm": 0.3515625, + "learning_rate": 2.3646056962879946e-05, + "loss": 0.765, "step": 6455 }, { "epoch": 7.990105132962276, - "grad_norm": 0.51171875, - "learning_rate": 0.00010592948735246854, - "loss": 0.8632, + "grad_norm": 0.349609375, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.7504, "step": 6460 }, { "epoch": 7.996289424860853, - "grad_norm": 0.45703125, - "learning_rate": 0.00010578573056397085, - "loss": 0.8694, + "grad_norm": 0.337890625, + "learning_rate": 2.336779362911674e-05, + "loss": 0.7605, "step": 6465 }, { "epoch": 8.0, - "eval_loss": 2.506842851638794, - "eval_runtime": 0.5393, - "eval_samples_per_second": 18.543, - "eval_steps_per_second": 1.854, + "eval_loss": 2.6913065910339355, + "eval_runtime": 0.5419, + "eval_samples_per_second": 18.454, + "eval_steps_per_second": 1.845, "step": 6468 }, { "epoch": 8.00247371675943, - "grad_norm": 0.447265625, - "learning_rate": 0.00010564196177750725, - "loss": 0.8705, + "grad_norm": 0.345703125, + "learning_rate": 2.3229197970960924e-05, + "loss": 0.7614, "step": 6470 }, { "epoch": 8.008658008658008, - "grad_norm": 0.478515625, - "learning_rate": 0.00010549818129121338, - "loss": 0.8568, + "grad_norm": 0.3359375, + "learning_rate": 2.309096051473597e-05, + "loss": 0.7476, "step": 6475 }, { "epoch": 8.014842300556586, - "grad_norm": 0.49609375, - "learning_rate": 0.0001053543894032493, - "loss": 0.8752, + "grad_norm": 0.3515625, + "learning_rate": 2.295308190543859e-05, + "loss": 0.7644, "step": 6480 }, { "epoch": 8.021026592455163, - "grad_norm": 0.474609375, - "learning_rate": 0.00010521058641179861, - "loss": 0.8689, + "grad_norm": 0.337890625, + "learning_rate": 2.2815562786391387e-05, + "loss": 0.7602, "step": 6485 }, { "epoch": 8.02721088435374, - "grad_norm": 0.41015625, - "learning_rate": 0.00010506677261506797, - "loss": 0.8673, + "grad_norm": 0.3359375, + "learning_rate": 2.26784037992395e-05, + "loss": 0.7602, "step": 6490 }, { "epoch": 8.03339517625232, - "grad_norm": 0.490234375, - "learning_rate": 0.00010492294831128641, - "loss": 0.8757, + "grad_norm": 0.361328125, + "learning_rate": 2.2541605583947724e-05, + "loss": 0.7652, "step": 6495 }, { "epoch": 8.039579468150897, - "grad_norm": 0.462890625, - "learning_rate": 0.00010477911379870488, - "loss": 0.8522, + "grad_norm": 0.33984375, + "learning_rate": 2.2405168778797646e-05, + "loss": 0.7472, "step": 6500 }, { "epoch": 8.045763760049475, - "grad_norm": 0.47265625, - "learning_rate": 0.00010463526937559536, - "loss": 0.8703, + "grad_norm": 0.359375, + "learning_rate": 2.226909402038446e-05, + "loss": 0.7592, "step": 6505 }, { "epoch": 8.051948051948052, - "grad_norm": 0.455078125, - "learning_rate": 0.00010449141534025045, - "loss": 0.8631, + "grad_norm": 0.353515625, + "learning_rate": 2.2133381943614207e-05, + "loss": 0.7577, "step": 6510 }, { "epoch": 8.05813234384663, - "grad_norm": 0.46484375, - "learning_rate": 0.00010434755199098261, - "loss": 0.8672, + "grad_norm": 0.34765625, + "learning_rate": 2.1998033181700617e-05, + "loss": 0.7583, "step": 6515 }, { "epoch": 8.064316635745207, - "grad_norm": 0.5078125, - "learning_rate": 0.00010420367962612372, - "loss": 0.8732, + "grad_norm": 0.33984375, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.7609, "step": 6520 }, { "epoch": 8.070500927643785, - "grad_norm": 0.4609375, - "learning_rate": 0.00010405979854402425, - "loss": 0.8687, + "grad_norm": 0.337890625, + "learning_rate": 2.17284281268195e-05, + "loss": 0.7582, "step": 6525 }, { "epoch": 8.076685219542362, - "grad_norm": 0.4453125, - "learning_rate": 0.00010391590904305284, - "loss": 0.8756, + "grad_norm": 0.35546875, + "learning_rate": 2.159417309179189e-05, + "loss": 0.7651, "step": 6530 }, { "epoch": 8.08286951144094, - "grad_norm": 0.451171875, - "learning_rate": 0.00010377201142159554, - "loss": 0.8664, + "grad_norm": 0.353515625, + "learning_rate": 2.1460283887494724e-05, + "loss": 0.76, "step": 6535 }, { "epoch": 8.089053803339517, - "grad_norm": 0.48046875, - "learning_rate": 0.00010362810597805526, - "loss": 0.867, + "grad_norm": 0.34765625, + "learning_rate": 2.1326761138636553e-05, + "loss": 0.7578, "step": 6540 }, { "epoch": 8.095238095238095, - "grad_norm": 0.46484375, - "learning_rate": 0.00010348419301085113, - "loss": 0.8612, + "grad_norm": 0.341796875, + "learning_rate": 2.1193605468216005e-05, + "loss": 0.7515, "step": 6545 }, { "epoch": 8.101422387136672, - "grad_norm": 0.5078125, - "learning_rate": 0.00010334027281841781, - "loss": 0.8646, + "grad_norm": 0.345703125, + "learning_rate": 2.106081749751897e-05, + "loss": 0.7533, "step": 6550 }, { "epoch": 8.10760667903525, - "grad_norm": 0.458984375, - "learning_rate": 0.00010319634569920504, - "loss": 0.8753, + "grad_norm": 0.33984375, + "learning_rate": 2.092839784611579e-05, + "loss": 0.7621, "step": 6555 }, { "epoch": 8.113790970933827, - "grad_norm": 0.4765625, - "learning_rate": 0.00010305241195167687, - "loss": 0.8595, + "grad_norm": 0.34375, + "learning_rate": 2.0796347131858186e-05, + "loss": 0.7519, "step": 6560 }, { "epoch": 8.119975262832405, - "grad_norm": 0.48046875, - "learning_rate": 0.00010290847187431113, - "loss": 0.864, + "grad_norm": 0.337890625, + "learning_rate": 2.0664665970876496e-05, + "loss": 0.7582, "step": 6565 }, { "epoch": 8.126159554730984, - "grad_norm": 0.46875, - "learning_rate": 0.00010276452576559879, - "loss": 0.8822, + "grad_norm": 0.337890625, + "learning_rate": 2.05333549775768e-05, + "loss": 0.7682, "step": 6570 }, { "epoch": 8.132343846629562, - "grad_norm": 0.5, - "learning_rate": 0.00010262057392404328, - "loss": 0.8721, + "grad_norm": 0.341796875, + "learning_rate": 2.0402414764637978e-05, + "loss": 0.7644, "step": 6575 }, { "epoch": 8.13852813852814, - "grad_norm": 0.43359375, - "learning_rate": 0.00010247661664815986, - "loss": 0.8611, + "grad_norm": 0.35546875, + "learning_rate": 2.027184594300898e-05, + "loss": 0.7534, "step": 6580 }, { "epoch": 8.144712430426717, - "grad_norm": 0.4765625, - "learning_rate": 0.00010233265423647523, - "loss": 0.8668, + "grad_norm": 0.33984375, + "learning_rate": 2.0141649121905827e-05, + "loss": 0.7579, "step": 6585 }, { "epoch": 8.150896722325294, - "grad_norm": 0.455078125, - "learning_rate": 0.00010218868698752658, - "loss": 0.8696, + "grad_norm": 0.345703125, + "learning_rate": 2.0011824908808808e-05, + "loss": 0.7619, "step": 6590 }, { "epoch": 8.157081014223872, - "grad_norm": 0.45703125, - "learning_rate": 0.00010204471519986124, - "loss": 0.8711, + "grad_norm": 0.34765625, + "learning_rate": 1.9882373909459795e-05, + "loss": 0.7595, "step": 6595 }, { "epoch": 8.16326530612245, - "grad_norm": 0.416015625, - "learning_rate": 0.00010190073917203589, - "loss": 0.87, + "grad_norm": 0.33984375, + "learning_rate": 1.9753296727859195e-05, + "loss": 0.7598, "step": 6600 }, { "epoch": 8.169449598021027, - "grad_norm": 0.52734375, - "learning_rate": 0.00010175675920261602, - "loss": 0.8702, + "grad_norm": 0.337890625, + "learning_rate": 1.962459396626326e-05, + "loss": 0.759, "step": 6605 }, { "epoch": 8.175633889919604, - "grad_norm": 0.52734375, - "learning_rate": 0.00010161277559017528, - "loss": 0.864, + "grad_norm": 0.341796875, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.7556, "step": 6610 }, { "epoch": 8.181818181818182, - "grad_norm": 0.54296875, - "learning_rate": 0.00010146878863329492, - "loss": 0.8583, + "grad_norm": 0.341796875, + "learning_rate": 1.936831410337261e-05, + "loss": 0.7477, "step": 6615 }, { "epoch": 8.18800247371676, - "grad_norm": 0.474609375, - "learning_rate": 0.00010132479863056303, - "loss": 0.8689, + "grad_norm": 0.34375, + "learning_rate": 1.9240738197844278e-05, + "loss": 0.7617, "step": 6620 }, { "epoch": 8.194186765615337, - "grad_norm": 0.486328125, - "learning_rate": 0.00010118080588057416, - "loss": 0.8629, + "grad_norm": 0.345703125, + "learning_rate": 1.9113539103847723e-05, + "loss": 0.7535, "step": 6625 }, { "epoch": 8.200371057513914, - "grad_norm": 0.466796875, - "learning_rate": 0.00010103681068192845, - "loss": 0.8656, + "grad_norm": 0.341796875, + "learning_rate": 1.89867174148763e-05, + "loss": 0.7577, "step": 6630 }, { "epoch": 8.206555349412492, - "grad_norm": 0.482421875, - "learning_rate": 0.00010089281333323112, - "loss": 0.8723, + "grad_norm": 0.33984375, + "learning_rate": 1.886027372266247e-05, + "loss": 0.7597, "step": 6635 }, { "epoch": 8.21273964131107, - "grad_norm": 0.443359375, - "learning_rate": 0.00010074881413309193, - "loss": 0.8689, + "grad_norm": 0.337890625, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.7588, "step": 6640 }, { "epoch": 8.218923933209647, - "grad_norm": 0.423828125, - "learning_rate": 0.00010060481338012435, - "loss": 0.8706, + "grad_norm": 0.333984375, + "learning_rate": 1.860852268661616e-05, + "loss": 0.7583, "step": 6645 }, { "epoch": 8.225108225108226, - "grad_norm": 0.44140625, - "learning_rate": 0.00010046081137294516, - "loss": 0.8672, + "grad_norm": 0.357421875, + "learning_rate": 1.8483216517419142e-05, + "loss": 0.7556, "step": 6650 }, { "epoch": 8.231292517006803, - "grad_norm": 0.58203125, - "learning_rate": 0.00010031680841017377, - "loss": 0.8601, + "grad_norm": 0.34765625, + "learning_rate": 1.835829069424515e-05, + "loss": 0.7524, "step": 6655 }, { "epoch": 8.237476808905381, - "grad_norm": 0.490234375, - "learning_rate": 0.00010017280479043147, - "loss": 0.866, + "grad_norm": 0.34375, + "learning_rate": 1.8233745799980817e-05, + "loss": 0.7563, "step": 6660 }, { "epoch": 8.243661100803958, - "grad_norm": 0.5546875, - "learning_rate": 0.00010002880081234103, - "loss": 0.8567, + "grad_norm": 0.345703125, + "learning_rate": 1.810958241573535e-05, + "loss": 0.747, "step": 6665 }, { "epoch": 8.249845392702536, - "grad_norm": 0.546875, - "learning_rate": 9.988479677452584e-05, - "loss": 0.8731, + "grad_norm": 0.34765625, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.7622, "step": 6670 }, { "epoch": 8.256029684601113, - "grad_norm": 0.451171875, - "learning_rate": 9.97407929756095e-05, - "loss": 0.8686, + "grad_norm": 0.33984375, + "learning_rate": 1.7862402492834806e-05, + "loss": 0.76, "step": 6675 }, { "epoch": 8.262213976499691, - "grad_norm": 0.4765625, - "learning_rate": 9.959678971421508e-05, - "loss": 0.8743, + "grad_norm": 0.349609375, + "learning_rate": 1.773938710748706e-05, + "loss": 0.7635, "step": 6680 }, { "epoch": 8.268398268398268, - "grad_norm": 0.45703125, - "learning_rate": 9.94527872889646e-05, - "loss": 0.8558, + "grad_norm": 0.345703125, + "learning_rate": 1.761675553876736e-05, + "loss": 0.7445, "step": 6685 }, { "epoch": 8.274582560296846, - "grad_norm": 0.466796875, - "learning_rate": 9.930878599847821e-05, - "loss": 0.8717, + "grad_norm": 0.34375, + "learning_rate": 1.7494508358857677e-05, + "loss": 0.7611, "step": 6690 }, { "epoch": 8.280766852195423, - "grad_norm": 0.46484375, - "learning_rate": 9.916478614137383e-05, - "loss": 0.8627, + "grad_norm": 0.3359375, + "learning_rate": 1.737264613814633e-05, + "loss": 0.753, "step": 6695 }, { "epoch": 8.286951144094001, - "grad_norm": 0.470703125, - "learning_rate": 9.902078801626636e-05, - "loss": 0.8667, + "grad_norm": 0.34375, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.7568, "step": 6700 }, { "epoch": 8.293135435992578, - "grad_norm": 0.50390625, - "learning_rate": 9.887679192176712e-05, - "loss": 0.8584, + "grad_norm": 0.337890625, + "learning_rate": 1.713007884688904e-05, + "loss": 0.7483, "step": 6705 }, { "epoch": 8.299319727891156, - "grad_norm": 0.455078125, - "learning_rate": 9.873279815648318e-05, - "loss": 0.8681, + "grad_norm": 0.341796875, + "learning_rate": 1.700937490812844e-05, + "loss": 0.7579, "step": 6710 }, { "epoch": 8.305504019789733, - "grad_norm": 0.44140625, - "learning_rate": 9.858880701901682e-05, - "loss": 0.8734, + "grad_norm": 0.341796875, + "learning_rate": 1.6889058192131734e-05, + "loss": 0.7635, "step": 6715 }, { "epoch": 8.311688311688311, - "grad_norm": 0.44140625, - "learning_rate": 9.844481880796491e-05, - "loss": 0.8738, + "grad_norm": 0.34765625, + "learning_rate": 1.676912926028007e-05, + "loss": 0.7617, "step": 6720 }, { "epoch": 8.317872603586888, - "grad_norm": 0.45703125, - "learning_rate": 9.830083382191819e-05, - "loss": 0.8735, + "grad_norm": 0.349609375, + "learning_rate": 1.664958867214519e-05, + "loss": 0.7611, "step": 6725 }, { "epoch": 8.324056895485468, - "grad_norm": 0.44921875, - "learning_rate": 9.815685235946068e-05, - "loss": 0.8685, + "grad_norm": 0.345703125, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.7589, "step": 6730 }, { "epoch": 8.330241187384045, - "grad_norm": 0.447265625, - "learning_rate": 9.801287471916919e-05, - "loss": 0.8737, + "grad_norm": 0.341796875, + "learning_rate": 1.6411674756250663e-05, + "loss": 0.7638, "step": 6735 }, { "epoch": 8.336425479282623, - "grad_norm": 0.44140625, - "learning_rate": 9.786890119961253e-05, - "loss": 0.8647, + "grad_norm": 0.34765625, + "learning_rate": 1.6293302538564382e-05, + "loss": 0.7526, "step": 6740 }, { "epoch": 8.3426097711812, - "grad_norm": 0.486328125, - "learning_rate": 9.772493209935099e-05, - "loss": 0.8701, + "grad_norm": 0.365234375, + "learning_rate": 1.617532088473651e-05, + "loss": 0.7587, "step": 6745 }, { "epoch": 8.348794063079778, - "grad_norm": 0.455078125, - "learning_rate": 9.758096771693573e-05, - "loss": 0.8589, + "grad_norm": 0.35546875, + "learning_rate": 1.6057730345253065e-05, + "loss": 0.7523, "step": 6750 }, { "epoch": 8.354978354978355, - "grad_norm": 0.4453125, - "learning_rate": 9.743700835090804e-05, - "loss": 0.8674, + "grad_norm": 0.349609375, + "learning_rate": 1.594053146877529e-05, + "loss": 0.7557, "step": 6755 }, { "epoch": 8.361162646876933, - "grad_norm": 0.474609375, - "learning_rate": 9.729305429979887e-05, - "loss": 0.8682, + "grad_norm": 0.341796875, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.7588, "step": 6760 }, { "epoch": 8.36734693877551, - "grad_norm": 0.439453125, - "learning_rate": 9.714910586212816e-05, - "loss": 0.8732, + "grad_norm": 0.349609375, + "learning_rate": 1.570731089034151e-05, + "loss": 0.7627, "step": 6765 }, { "epoch": 8.373531230674088, - "grad_norm": 0.462890625, - "learning_rate": 9.700516333640415e-05, - "loss": 0.8845, + "grad_norm": 0.345703125, + "learning_rate": 1.5591290276560466e-05, + "loss": 0.7688, "step": 6770 }, { "epoch": 8.379715522572665, - "grad_norm": 0.439453125, - "learning_rate": 9.686122702112285e-05, - "loss": 0.8646, + "grad_norm": 0.345703125, + "learning_rate": 1.5475663502129822e-05, + "loss": 0.7577, "step": 6775 }, { "epoch": 8.385899814471243, - "grad_norm": 0.451171875, - "learning_rate": 9.671729721476746e-05, - "loss": 0.8635, + "grad_norm": 0.33984375, + "learning_rate": 1.536043110654809e-05, + "loss": 0.755, "step": 6780 }, { "epoch": 8.39208410636982, - "grad_norm": 0.470703125, - "learning_rate": 9.657337421580759e-05, - "loss": 0.8722, + "grad_norm": 0.34765625, + "learning_rate": 1.5245593627473675e-05, + "loss": 0.7617, "step": 6785 }, { "epoch": 8.398268398268398, - "grad_norm": 0.423828125, - "learning_rate": 9.642945832269874e-05, - "loss": 0.8505, + "grad_norm": 0.33984375, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.7438, "step": 6790 }, { "epoch": 8.404452690166975, - "grad_norm": 0.439453125, - "learning_rate": 9.628554983388173e-05, - "loss": 0.865, + "grad_norm": 0.33984375, + "learning_rate": 1.5017105560264755e-05, + "loss": 0.7559, "step": 6795 }, { "epoch": 8.410636982065553, - "grad_norm": 0.490234375, - "learning_rate": 9.614164904778196e-05, - "loss": 0.8605, + "grad_norm": 0.34765625, + "learning_rate": 1.4903456038223939e-05, + "loss": 0.7504, "step": 6800 }, { "epoch": 8.416821273964132, - "grad_norm": 0.431640625, - "learning_rate": 9.599775626280892e-05, - "loss": 0.867, + "grad_norm": 0.3359375, + "learning_rate": 1.4790203564872818e-05, + "loss": 0.7544, "step": 6805 }, { "epoch": 8.42300556586271, - "grad_norm": 0.443359375, - "learning_rate": 9.585387177735547e-05, - "loss": 0.8689, + "grad_norm": 0.341796875, + "learning_rate": 1.4677348668631763e-05, + "loss": 0.7612, "step": 6810 }, { "epoch": 8.429189857761287, - "grad_norm": 0.4765625, - "learning_rate": 9.570999588979728e-05, - "loss": 0.8721, + "grad_norm": 0.35546875, + "learning_rate": 1.45648918760661e-05, + "loss": 0.7613, "step": 6815 }, { "epoch": 8.435374149659864, - "grad_norm": 0.47265625, - "learning_rate": 9.556612889849214e-05, - "loss": 0.8601, + "grad_norm": 0.34375, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.7519, "step": 6820 }, { "epoch": 8.441558441558442, - "grad_norm": 0.466796875, - "learning_rate": 9.542227110177945e-05, - "loss": 0.8559, + "grad_norm": 0.3359375, + "learning_rate": 1.4341174698932224e-05, + "loss": 0.7503, "step": 6825 }, { "epoch": 8.44774273345702, - "grad_norm": 0.45703125, - "learning_rate": 9.527842279797953e-05, - "loss": 0.866, + "grad_norm": 0.34375, + "learning_rate": 1.4229915358197377e-05, + "loss": 0.7562, "step": 6830 }, { "epoch": 8.453927025355597, - "grad_norm": 0.447265625, - "learning_rate": 9.513458428539298e-05, - "loss": 0.8635, + "grad_norm": 0.33984375, + "learning_rate": 1.4119056208799786e-05, + "loss": 0.7516, "step": 6835 }, { "epoch": 8.460111317254174, - "grad_norm": 0.466796875, - "learning_rate": 9.499075586230013e-05, - "loss": 0.8694, + "grad_norm": 0.33984375, + "learning_rate": 1.4008597767992871e-05, + "loss": 0.7588, "step": 6840 }, { "epoch": 8.466295609152752, - "grad_norm": 0.44140625, - "learning_rate": 9.484693782696041e-05, - "loss": 0.8716, + "grad_norm": 0.3515625, + "learning_rate": 1.389854055116041e-05, + "loss": 0.7607, "step": 6845 }, { "epoch": 8.47247990105133, - "grad_norm": 0.455078125, - "learning_rate": 9.470313047761167e-05, - "loss": 0.8623, + "grad_norm": 0.34375, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.757, "step": 6850 }, { "epoch": 8.478664192949907, - "grad_norm": 0.423828125, - "learning_rate": 9.455933411246958e-05, - "loss": 0.8603, + "grad_norm": 0.34375, + "learning_rate": 1.3679631841591411e-05, + "loss": 0.7535, "step": 6855 }, { "epoch": 8.484848484848484, - "grad_norm": 0.42578125, - "learning_rate": 9.44155490297271e-05, - "loss": 0.863, + "grad_norm": 0.34375, + "learning_rate": 1.3570781370252582e-05, + "loss": 0.7512, "step": 6860 }, { "epoch": 8.491032776747062, - "grad_norm": 0.447265625, - "learning_rate": 9.427177552755374e-05, - "loss": 0.8601, + "grad_norm": 0.34765625, + "learning_rate": 1.3462334165678902e-05, + "loss": 0.7506, "step": 6865 }, { "epoch": 8.49721706864564, - "grad_norm": 0.443359375, - "learning_rate": 9.412801390409497e-05, - "loss": 0.8738, + "grad_norm": 0.36328125, + "learning_rate": 1.3354290733869979e-05, + "loss": 0.7614, "step": 6870 }, { "epoch": 8.503401360544217, - "grad_norm": 0.50390625, - "learning_rate": 9.398426445747171e-05, - "loss": 0.8707, + "grad_norm": 0.34765625, + "learning_rate": 1.3246651578941572e-05, + "loss": 0.7618, "step": 6875 }, { "epoch": 8.509585652442794, - "grad_norm": 0.60546875, - "learning_rate": 9.38405274857796e-05, - "loss": 0.8667, + "grad_norm": 0.3359375, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.7581, "step": 6880 }, { "epoch": 8.515769944341374, - "grad_norm": 0.48828125, - "learning_rate": 9.369680328708836e-05, - "loss": 0.876, + "grad_norm": 0.3359375, + "learning_rate": 1.3032588106755084e-05, + "loss": 0.7652, "step": 6885 }, { "epoch": 8.521954236239951, - "grad_norm": 0.439453125, - "learning_rate": 9.355309215944124e-05, - "loss": 0.8676, + "grad_norm": 0.341796875, + "learning_rate": 1.2926164788287543e-05, + "loss": 0.7585, "step": 6890 }, { "epoch": 8.528138528138529, - "grad_norm": 0.43359375, - "learning_rate": 9.340939440085445e-05, - "loss": 0.8488, + "grad_norm": 0.34375, + "learning_rate": 1.2820147744276866e-05, + "loss": 0.7428, "step": 6895 }, { "epoch": 8.534322820037106, - "grad_norm": 0.455078125, - "learning_rate": 9.326571030931637e-05, - "loss": 0.8724, + "grad_norm": 0.337890625, + "learning_rate": 1.2714537469383858e-05, + "loss": 0.7608, "step": 6900 }, { "epoch": 8.540507111935684, - "grad_norm": 0.447265625, - "learning_rate": 9.312204018278716e-05, - "loss": 0.8657, + "grad_norm": 0.33984375, + "learning_rate": 1.2609334456371514e-05, + "loss": 0.7555, "step": 6905 }, { "epoch": 8.546691403834261, - "grad_norm": 0.421875, - "learning_rate": 9.297838431919794e-05, - "loss": 0.8611, + "grad_norm": 0.337890625, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.753, "step": 6910 }, { "epoch": 8.552875695732839, - "grad_norm": 0.443359375, - "learning_rate": 9.283474301645026e-05, - "loss": 0.8634, + "grad_norm": 0.337890625, + "learning_rate": 1.240015217753685e-05, + "loss": 0.7556, "step": 6915 }, { "epoch": 8.559059987631416, - "grad_norm": 0.474609375, - "learning_rate": 9.269111657241548e-05, - "loss": 0.8627, + "grad_norm": 0.341796875, + "learning_rate": 1.2296173887730123e-05, + "loss": 0.755, "step": 6920 }, { "epoch": 8.565244279529994, - "grad_norm": 0.462890625, - "learning_rate": 9.254750528493417e-05, - "loss": 0.8572, + "grad_norm": 0.337890625, + "learning_rate": 1.2192604811830532e-05, + "loss": 0.7499, "step": 6925 }, { "epoch": 8.571428571428571, - "grad_norm": 0.46875, - "learning_rate": 9.240390945181543e-05, - "loss": 0.8648, + "grad_norm": 0.337890625, + "learning_rate": 1.2089445433077073e-05, + "loss": 0.7544, "step": 6930 }, { "epoch": 8.577612863327149, - "grad_norm": 0.490234375, - "learning_rate": 9.226032937083635e-05, - "loss": 0.8621, + "grad_norm": 0.34375, + "learning_rate": 1.1986696232797101e-05, + "loss": 0.7542, "step": 6935 }, { "epoch": 8.583797155225726, - "grad_norm": 0.427734375, - "learning_rate": 9.211676533974131e-05, - "loss": 0.8634, + "grad_norm": 0.33984375, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.7524, "step": 6940 }, { "epoch": 8.589981447124304, - "grad_norm": 0.4453125, - "learning_rate": 9.197321765624152e-05, - "loss": 0.8667, + "grad_norm": 0.34765625, + "learning_rate": 1.178243028339574e-05, + "loss": 0.7568, "step": 6945 }, { "epoch": 8.596165739022881, - "grad_norm": 0.45703125, - "learning_rate": 9.182968661801412e-05, - "loss": 0.8707, + "grad_norm": 0.345703125, + "learning_rate": 1.1680914487350959e-05, + "loss": 0.7597, "step": 6950 }, { "epoch": 8.602350030921459, - "grad_norm": 0.46875, - "learning_rate": 9.168617252270183e-05, - "loss": 0.8772, + "grad_norm": 0.337890625, + "learning_rate": 1.1579810775928502e-05, + "loss": 0.7645, "step": 6955 }, { "epoch": 8.608534322820038, - "grad_norm": 0.4453125, - "learning_rate": 9.154267566791223e-05, - "loss": 0.8584, + "grad_norm": 0.349609375, + "learning_rate": 1.1479119620864276e-05, + "loss": 0.7513, "step": 6960 }, { "epoch": 8.614718614718615, - "grad_norm": 0.470703125, - "learning_rate": 9.139919635121714e-05, - "loss": 0.8657, + "grad_norm": 0.3359375, + "learning_rate": 1.1378841491969239e-05, + "loss": 0.7533, "step": 6965 }, { "epoch": 8.620902906617193, - "grad_norm": 0.4375, - "learning_rate": 9.125573487015203e-05, - "loss": 0.8632, + "grad_norm": 0.333984375, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.7549, "step": 6970 }, { "epoch": 8.62708719851577, - "grad_norm": 0.45703125, - "learning_rate": 9.111229152221535e-05, - "loss": 0.8705, + "grad_norm": 0.33984375, + "learning_rate": 1.117952618229301e-05, + "loss": 0.7588, "step": 6975 }, { "epoch": 8.633271490414348, - "grad_norm": 0.458984375, - "learning_rate": 9.096886660486797e-05, - "loss": 0.869, + "grad_norm": 0.333984375, + "learning_rate": 1.1080489931489391e-05, + "loss": 0.7599, "step": 6980 }, { "epoch": 8.639455782312925, - "grad_norm": 0.4375, - "learning_rate": 9.082546041553253e-05, - "loss": 0.875, + "grad_norm": 0.341796875, + "learning_rate": 1.0981868566805942e-05, + "loss": 0.765, "step": 6985 }, { "epoch": 8.645640074211503, - "grad_norm": 0.470703125, - "learning_rate": 9.068207325159284e-05, - "loss": 0.8667, + "grad_norm": 0.349609375, + "learning_rate": 1.0883662548396257e-05, + "loss": 0.757, "step": 6990 }, { "epoch": 8.65182436611008, - "grad_norm": 0.439453125, - "learning_rate": 9.053870541039327e-05, - "loss": 0.8749, + "grad_norm": 0.34765625, + "learning_rate": 1.0785872334476033e-05, + "loss": 0.7657, "step": 6995 }, { "epoch": 8.658008658008658, - "grad_norm": 0.451171875, - "learning_rate": 9.039535718923804e-05, - "loss": 0.8608, + "grad_norm": 0.337890625, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.7525, "step": 7000 }, { "epoch": 8.664192949907235, - "grad_norm": 0.474609375, - "learning_rate": 9.02520288853908e-05, - "loss": 0.8621, + "grad_norm": 0.3515625, + "learning_rate": 1.0591541143264084e-05, + "loss": 0.7522, "step": 7005 }, { "epoch": 8.670377241805813, - "grad_norm": 0.42578125, - "learning_rate": 9.01087207960739e-05, - "loss": 0.8716, + "grad_norm": 0.34765625, + "learning_rate": 1.049500107269481e-05, + "loss": 0.7616, "step": 7010 }, { "epoch": 8.67656153370439, - "grad_norm": 0.4609375, - "learning_rate": 8.996543321846759e-05, - "loss": 0.8588, + "grad_norm": 0.3515625, + "learning_rate": 1.0398878620055618e-05, + "loss": 0.7494, "step": 7015 }, { "epoch": 8.682745825602968, - "grad_norm": 0.439453125, - "learning_rate": 8.982216644970979e-05, - "loss": 0.8595, + "grad_norm": 0.349609375, + "learning_rate": 1.0303174233840528e-05, + "loss": 0.7502, "step": 7020 }, { "epoch": 8.688930117501545, - "grad_norm": 0.46484375, - "learning_rate": 8.967892078689513e-05, - "loss": 0.8813, + "grad_norm": 0.34765625, + "learning_rate": 1.0207888360592998e-05, + "loss": 0.7725, "step": 7025 }, { "epoch": 8.695114409400123, - "grad_norm": 0.435546875, - "learning_rate": 8.953569652707459e-05, - "loss": 0.8618, + "grad_norm": 0.34765625, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.7546, "step": 7030 }, { "epoch": 8.7012987012987, - "grad_norm": 0.44921875, - "learning_rate": 8.939249396725467e-05, - "loss": 0.8558, + "grad_norm": 0.349609375, + "learning_rate": 1.0018573929408526e-05, + "loss": 0.746, "step": 7035 }, { "epoch": 8.70748299319728, - "grad_norm": 0.4765625, - "learning_rate": 8.924931340439694e-05, - "loss": 0.8712, + "grad_norm": 0.34375, + "learning_rate": 9.924546254786493e-06, + "loss": 0.7605, "step": 7040 }, { "epoch": 8.713667285095857, - "grad_norm": 0.44921875, - "learning_rate": 8.910615513541729e-05, - "loss": 0.8496, + "grad_norm": 0.3359375, + "learning_rate": 9.83093885975771e-06, + "loss": 0.7406, "step": 7045 }, { "epoch": 8.719851576994435, - "grad_norm": 0.439453125, - "learning_rate": 8.896301945718541e-05, - "loss": 0.8614, + "grad_norm": 0.341796875, + "learning_rate": 9.737752181081338e-06, + "loss": 0.7543, "step": 7050 }, { "epoch": 8.726035868893012, - "grad_norm": 0.486328125, - "learning_rate": 8.881990666652417e-05, - "loss": 0.8733, + "grad_norm": 0.361328125, + "learning_rate": 9.644986653553512e-06, + "loss": 0.7606, "step": 7055 }, { "epoch": 8.73222016079159, - "grad_norm": 0.486328125, - "learning_rate": 8.867681706020894e-05, - "loss": 0.8747, + "grad_norm": 0.345703125, + "learning_rate": 9.552642710005299e-06, + "loss": 0.7609, "step": 7060 }, { "epoch": 8.738404452690167, - "grad_norm": 0.41015625, - "learning_rate": 8.853375093496699e-05, - "loss": 0.8717, + "grad_norm": 0.337890625, + "learning_rate": 9.460720781300814e-06, + "loss": 0.7606, "step": 7065 }, { "epoch": 8.744588744588745, - "grad_norm": 0.458984375, - "learning_rate": 8.839070858747697e-05, - "loss": 0.8632, + "grad_norm": 0.345703125, + "learning_rate": 9.369221296335006e-06, + "loss": 0.7521, "step": 7070 }, { "epoch": 8.750773036487322, - "grad_norm": 0.40625, - "learning_rate": 8.824769031436822e-05, - "loss": 0.867, + "grad_norm": 0.33984375, + "learning_rate": 9.278144682031809e-06, + "loss": 0.7593, "step": 7075 }, { "epoch": 8.7569573283859, - "grad_norm": 0.4921875, - "learning_rate": 8.810469641222001e-05, - "loss": 0.8627, + "grad_norm": 0.333984375, + "learning_rate": 9.187491363342093e-06, + "loss": 0.7523, "step": 7080 }, { "epoch": 8.763141620284477, - "grad_norm": 0.498046875, - "learning_rate": 8.796172717756124e-05, - "loss": 0.8723, + "grad_norm": 0.353515625, + "learning_rate": 9.097261763241694e-06, + "loss": 0.7608, "step": 7085 }, { "epoch": 8.769325912183055, - "grad_norm": 0.5078125, - "learning_rate": 8.781878290686959e-05, - "loss": 0.878, + "grad_norm": 0.345703125, + "learning_rate": 9.0074563027294e-06, + "loss": 0.7652, "step": 7090 }, { "epoch": 8.775510204081632, - "grad_norm": 0.5078125, - "learning_rate": 8.767586389657098e-05, - "loss": 0.8723, + "grad_norm": 0.359375, + "learning_rate": 8.918075400825098e-06, + "loss": 0.7634, "step": 7095 }, { "epoch": 8.78169449598021, - "grad_norm": 0.49609375, - "learning_rate": 8.753297044303896e-05, - "loss": 0.8715, + "grad_norm": 0.34765625, + "learning_rate": 8.829119474567671e-06, + "loss": 0.7596, "step": 7100 }, { "epoch": 8.787878787878787, - "grad_norm": 0.439453125, - "learning_rate": 8.739010284259406e-05, - "loss": 0.8626, + "grad_norm": 0.34375, + "learning_rate": 8.740588939013173e-06, + "loss": 0.7522, "step": 7105 }, { "epoch": 8.794063079777365, - "grad_norm": 0.427734375, - "learning_rate": 8.724726139150318e-05, - "loss": 0.8616, + "grad_norm": 0.333984375, + "learning_rate": 8.652484207232803e-06, + "loss": 0.7516, "step": 7110 }, { "epoch": 8.800247371675944, - "grad_norm": 0.4375, - "learning_rate": 8.710444638597905e-05, - "loss": 0.8726, + "grad_norm": 0.33984375, + "learning_rate": 8.564805690311029e-06, + "loss": 0.7628, "step": 7115 }, { "epoch": 8.806431663574521, - "grad_norm": 0.498046875, - "learning_rate": 8.696165812217953e-05, - "loss": 0.8644, + "grad_norm": 0.34765625, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7541, "step": 7120 }, { "epoch": 8.812615955473099, - "grad_norm": 0.447265625, - "learning_rate": 8.6818896896207e-05, - "loss": 0.86, + "grad_norm": 0.359375, + "learning_rate": 8.390728935436088e-06, + "loss": 0.7502, "step": 7125 }, { "epoch": 8.818800247371676, - "grad_norm": 0.47265625, - "learning_rate": 8.667616300410778e-05, - "loss": 0.871, + "grad_norm": 0.345703125, + "learning_rate": 8.304331509700891e-06, + "loss": 0.7605, "step": 7130 }, { "epoch": 8.824984539270254, - "grad_norm": 0.4609375, - "learning_rate": 8.653345674187157e-05, - "loss": 0.8669, + "grad_norm": 0.345703125, + "learning_rate": 8.218361923256601e-06, + "loss": 0.7572, "step": 7135 }, { "epoch": 8.831168831168831, - "grad_norm": 0.458984375, - "learning_rate": 8.639077840543077e-05, - "loss": 0.8533, + "grad_norm": 0.36328125, + "learning_rate": 8.132820577225387e-06, + "loss": 0.7471, "step": 7140 }, { "epoch": 8.837353123067409, - "grad_norm": 0.44921875, - "learning_rate": 8.62481282906597e-05, - "loss": 0.8853, + "grad_norm": 0.341796875, + "learning_rate": 8.047707870731291e-06, + "loss": 0.7698, "step": 7145 }, { "epoch": 8.843537414965986, - "grad_norm": 0.4296875, - "learning_rate": 8.610550669337433e-05, - "loss": 0.864, + "grad_norm": 0.34375, + "learning_rate": 7.963024200898462e-06, + "loss": 0.755, "step": 7150 }, { "epoch": 8.849721706864564, - "grad_norm": 0.4375, - "learning_rate": 8.596291390933147e-05, - "loss": 0.8616, + "grad_norm": 0.341796875, + "learning_rate": 7.878769962849141e-06, + "loss": 0.7533, "step": 7155 }, { "epoch": 8.855905998763141, - "grad_norm": 0.4375, - "learning_rate": 8.582035023422815e-05, - "loss": 0.8634, + "grad_norm": 0.345703125, + "learning_rate": 7.794945549701993e-06, + "loss": 0.7536, "step": 7160 }, { "epoch": 8.862090290661719, - "grad_norm": 0.416015625, - "learning_rate": 8.567781596370104e-05, - "loss": 0.872, + "grad_norm": 0.337890625, + "learning_rate": 7.711551352570056e-06, + "loss": 0.7605, "step": 7165 }, { "epoch": 8.868274582560296, - "grad_norm": 0.42578125, - "learning_rate": 8.553531139332582e-05, - "loss": 0.8525, + "grad_norm": 0.34765625, + "learning_rate": 7.6285877605591135e-06, + "loss": 0.7442, "step": 7170 }, { "epoch": 8.874458874458874, - "grad_norm": 0.43359375, - "learning_rate": 8.539283681861661e-05, - "loss": 0.8559, + "grad_norm": 0.33984375, + "learning_rate": 7.546055160765819e-06, + "loss": 0.7463, "step": 7175 }, { "epoch": 8.880643166357451, - "grad_norm": 0.40625, - "learning_rate": 8.525039253502529e-05, - "loss": 0.8702, + "grad_norm": 0.3359375, + "learning_rate": 7.463953938275858e-06, + "loss": 0.7583, "step": 7180 }, { "epoch": 8.886827458256029, - "grad_norm": 0.45703125, - "learning_rate": 8.510797883794097e-05, - "loss": 0.867, + "grad_norm": 0.333984375, + "learning_rate": 7.382284476162127e-06, + "loss": 0.758, "step": 7185 }, { "epoch": 8.893011750154606, - "grad_norm": 0.44140625, - "learning_rate": 8.496559602268928e-05, - "loss": 0.8573, + "grad_norm": 0.3359375, + "learning_rate": 7.3010471554830766e-06, + "loss": 0.7482, "step": 7190 }, { "epoch": 8.899196042053186, - "grad_norm": 0.45703125, - "learning_rate": 8.482324438453187e-05, - "loss": 0.8687, + "grad_norm": 0.357421875, + "learning_rate": 7.220242355280771e-06, + "loss": 0.7571, "step": 7195 }, { "epoch": 8.905380333951763, - "grad_norm": 0.4765625, - "learning_rate": 8.468092421866573e-05, - "loss": 0.8658, + "grad_norm": 0.357421875, + "learning_rate": 7.1398704525792e-06, + "loss": 0.7541, "step": 7200 }, { "epoch": 8.91156462585034, - "grad_norm": 0.44921875, - "learning_rate": 8.45386358202225e-05, - "loss": 0.8594, + "grad_norm": 0.3359375, + "learning_rate": 7.0599318223825925e-06, + "loss": 0.752, "step": 7205 }, { "epoch": 8.917748917748918, - "grad_norm": 0.427734375, - "learning_rate": 8.439637948426801e-05, - "loss": 0.8699, + "grad_norm": 0.34375, + "learning_rate": 6.980426837673437e-06, + "loss": 0.7591, "step": 7210 }, { "epoch": 8.923933209647496, - "grad_norm": 0.447265625, - "learning_rate": 8.425415550580162e-05, - "loss": 0.865, + "grad_norm": 0.345703125, + "learning_rate": 6.901355869411053e-06, + "loss": 0.7568, "step": 7215 }, { "epoch": 8.930117501546073, - "grad_norm": 0.4453125, - "learning_rate": 8.411196417975558e-05, - "loss": 0.8646, + "grad_norm": 0.333984375, + "learning_rate": 6.8227192865295995e-06, + "loss": 0.752, "step": 7220 }, { "epoch": 8.93630179344465, - "grad_norm": 0.43359375, - "learning_rate": 8.396980580099438e-05, - "loss": 0.8553, + "grad_norm": 0.333984375, + "learning_rate": 6.744517455936483e-06, + "loss": 0.7447, "step": 7225 }, { "epoch": 8.942486085343228, - "grad_norm": 0.478515625, - "learning_rate": 8.382768066431425e-05, - "loss": 0.8629, + "grad_norm": 0.34375, + "learning_rate": 6.666750742510619e-06, + "loss": 0.7554, "step": 7230 }, { "epoch": 8.948670377241806, - "grad_norm": 0.427734375, - "learning_rate": 8.368558906444244e-05, - "loss": 0.8691, + "grad_norm": 0.341796875, + "learning_rate": 6.589419509100736e-06, + "loss": 0.7593, "step": 7235 }, { "epoch": 8.954854669140383, - "grad_norm": 0.4296875, - "learning_rate": 8.354353129603668e-05, - "loss": 0.8654, + "grad_norm": 0.341796875, + "learning_rate": 6.512524116523633e-06, + "loss": 0.7571, "step": 7240 }, { "epoch": 8.96103896103896, - "grad_norm": 0.421875, - "learning_rate": 8.340150765368452e-05, - "loss": 0.8607, + "grad_norm": 0.33984375, + "learning_rate": 6.436064923562601e-06, + "loss": 0.7516, "step": 7245 }, { "epoch": 8.967223252937538, - "grad_norm": 0.421875, - "learning_rate": 8.325951843190274e-05, - "loss": 0.8741, + "grad_norm": 0.341796875, + "learning_rate": 6.360042286965595e-06, + "loss": 0.7647, "step": 7250 }, { "epoch": 8.973407544836116, - "grad_norm": 0.447265625, - "learning_rate": 8.311756392513681e-05, - "loss": 0.8696, + "grad_norm": 0.3515625, + "learning_rate": 6.284456561443763e-06, + "loss": 0.7573, "step": 7255 }, { "epoch": 8.979591836734693, - "grad_norm": 0.59375, - "learning_rate": 8.297564442776014e-05, - "loss": 0.876, + "grad_norm": 0.359375, + "learning_rate": 6.209308099669597e-06, + "loss": 0.7641, "step": 7260 }, { "epoch": 8.98577612863327, - "grad_norm": 0.4921875, - "learning_rate": 8.283376023407357e-05, - "loss": 0.8813, + "grad_norm": 0.3515625, + "learning_rate": 6.134597252275409e-06, + "loss": 0.7687, "step": 7265 }, { "epoch": 8.99196042053185, - "grad_norm": 0.447265625, - "learning_rate": 8.269191163830467e-05, - "loss": 0.8682, + "grad_norm": 0.34375, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.7573, "step": 7270 }, { "epoch": 8.998144712430427, - "grad_norm": 0.439453125, - "learning_rate": 8.255009893460724e-05, - "loss": 0.8608, + "grad_norm": 0.333984375, + "learning_rate": 5.9864897929454374e-06, + "loss": 0.7532, "step": 7275 }, { "epoch": 8.999381570810142, - "eval_loss": 2.514324903488159, - "eval_runtime": 0.6521, - "eval_samples_per_second": 15.335, - "eval_steps_per_second": 1.534, + "eval_loss": 2.69946551322937, + "eval_runtime": 0.8232, + "eval_samples_per_second": 12.147, + "eval_steps_per_second": 1.215, "step": 7276 }, { "epoch": 9.004329004329005, - "grad_norm": 0.443359375, - "learning_rate": 8.240832241706068e-05, - "loss": 0.8585, + "grad_norm": 0.33984375, + "learning_rate": 5.913093872058528e-06, + "loss": 0.7539, "step": 7280 }, { "epoch": 9.010513296227582, - "grad_norm": 0.423828125, - "learning_rate": 8.226658237966933e-05, - "loss": 0.8596, + "grad_norm": 0.333984375, + "learning_rate": 5.84013694764618e-06, + "loss": 0.7551, "step": 7285 }, { "epoch": 9.01669758812616, - "grad_norm": 0.466796875, - "learning_rate": 8.212487911636184e-05, - "loss": 0.8587, + "grad_norm": 0.3515625, + "learning_rate": 5.767619360115295e-06, + "loss": 0.7517, "step": 7290 }, { "epoch": 9.022881880024737, - "grad_norm": 0.48828125, - "learning_rate": 8.198321292099064e-05, - "loss": 0.8649, + "grad_norm": 0.361328125, + "learning_rate": 5.695541447822905e-06, + "loss": 0.7582, "step": 7295 }, { "epoch": 9.029066171923315, - "grad_norm": 0.431640625, - "learning_rate": 8.184158408733131e-05, - "loss": 0.8721, + "grad_norm": 0.349609375, + "learning_rate": 5.623903547074549e-06, + "loss": 0.768, "step": 7300 }, { "epoch": 9.035250463821892, - "grad_norm": 0.44140625, - "learning_rate": 8.169999290908188e-05, - "loss": 0.861, + "grad_norm": 0.337890625, + "learning_rate": 5.552705992122742e-06, + "loss": 0.7567, "step": 7305 }, { "epoch": 9.04143475572047, - "grad_norm": 0.439453125, - "learning_rate": 8.155843967986236e-05, - "loss": 0.8489, + "grad_norm": 0.33984375, + "learning_rate": 5.481949115165452e-06, + "loss": 0.7477, "step": 7310 }, { "epoch": 9.047619047619047, - "grad_norm": 0.455078125, - "learning_rate": 8.141692469321403e-05, - "loss": 0.8582, + "grad_norm": 0.34765625, + "learning_rate": 5.41163324634445e-06, + "loss": 0.7538, "step": 7315 }, { "epoch": 9.053803339517625, - "grad_norm": 0.4296875, - "learning_rate": 8.127544824259889e-05, - "loss": 0.8654, + "grad_norm": 0.345703125, + "learning_rate": 5.341758713743828e-06, + "loss": 0.7573, "step": 7320 }, { "epoch": 9.059987631416202, - "grad_norm": 0.421875, - "learning_rate": 8.113401062139901e-05, - "loss": 0.8545, + "grad_norm": 0.33203125, + "learning_rate": 5.272325843388504e-06, + "loss": 0.7519, "step": 7325 }, { "epoch": 9.06617192331478, - "grad_norm": 0.439453125, - "learning_rate": 8.099261212291601e-05, - "loss": 0.8645, + "grad_norm": 0.345703125, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.7595, "step": 7330 }, { "epoch": 9.072356215213357, - "grad_norm": 0.498046875, - "learning_rate": 8.085125304037018e-05, - "loss": 0.869, + "grad_norm": 0.357421875, + "learning_rate": 5.134786383208112e-06, + "loss": 0.7615, "step": 7335 }, { "epoch": 9.078540507111935, - "grad_norm": 0.45703125, - "learning_rate": 8.070993366690029e-05, - "loss": 0.8699, + "grad_norm": 0.337890625, + "learning_rate": 5.066680435123106e-06, + "loss": 0.7608, "step": 7340 }, { "epoch": 9.084724799010512, - "grad_norm": 0.4609375, - "learning_rate": 8.056865429556267e-05, - "loss": 0.8558, + "grad_norm": 0.341796875, + "learning_rate": 4.9990174327605225e-06, + "loss": 0.7508, "step": 7345 }, { "epoch": 9.090909090909092, - "grad_norm": 0.453125, - "learning_rate": 8.042741521933071e-05, - "loss": 0.8524, + "grad_norm": 0.3359375, + "learning_rate": 4.931797691826601e-06, + "loss": 0.7466, "step": 7350 }, { "epoch": 9.09709338280767, - "grad_norm": 0.474609375, - "learning_rate": 8.028621673109425e-05, - "loss": 0.8664, + "grad_norm": 0.345703125, + "learning_rate": 4.865021525959323e-06, + "loss": 0.7615, "step": 7355 }, { "epoch": 9.103277674706247, - "grad_norm": 0.486328125, - "learning_rate": 8.014505912365893e-05, - "loss": 0.8638, + "grad_norm": 0.353515625, + "learning_rate": 4.798689246727006e-06, + "loss": 0.7569, "step": 7360 }, { "epoch": 9.109461966604824, - "grad_norm": 0.435546875, - "learning_rate": 8.000394268974563e-05, - "loss": 0.8599, + "grad_norm": 0.333984375, + "learning_rate": 4.732801163626921e-06, + "loss": 0.7551, "step": 7365 }, { "epoch": 9.115646258503402, - "grad_norm": 0.451171875, - "learning_rate": 7.986286772198986e-05, - "loss": 0.8649, + "grad_norm": 0.3515625, + "learning_rate": 4.667357584083721e-06, + "loss": 0.7565, "step": 7370 }, { "epoch": 9.12183055040198, - "grad_norm": 0.42578125, - "learning_rate": 7.972183451294112e-05, - "loss": 0.8691, + "grad_norm": 0.33984375, + "learning_rate": 4.602358813448093e-06, + "loss": 0.7618, "step": 7375 }, { "epoch": 9.128014842300557, - "grad_norm": 0.458984375, - "learning_rate": 7.958084335506239e-05, - "loss": 0.8663, + "grad_norm": 0.35546875, + "learning_rate": 4.537805154995278e-06, + "loss": 0.7622, "step": 7380 }, { "epoch": 9.134199134199134, - "grad_norm": 0.453125, - "learning_rate": 7.943989454072931e-05, - "loss": 0.8588, + "grad_norm": 0.3359375, + "learning_rate": 4.473696909923719e-06, + "loss": 0.7535, "step": 7385 }, { "epoch": 9.140383426097712, - "grad_norm": 0.462890625, - "learning_rate": 7.929898836222983e-05, - "loss": 0.8668, + "grad_norm": 0.341796875, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.7599, "step": 7390 }, { "epoch": 9.14656771799629, - "grad_norm": 0.4609375, - "learning_rate": 7.915812511176347e-05, - "loss": 0.8612, + "grad_norm": 0.337890625, + "learning_rate": 4.346817854325535e-06, + "loss": 0.7546, "step": 7395 }, { "epoch": 9.152752009894867, - "grad_norm": 0.462890625, - "learning_rate": 7.90173050814406e-05, - "loss": 0.8627, + "grad_norm": 0.33984375, + "learning_rate": 4.2840476357989825e-06, + "loss": 0.7529, "step": 7400 }, { "epoch": 9.158936301793444, - "grad_norm": 0.421875, - "learning_rate": 7.887652856328214e-05, - "loss": 0.8531, + "grad_norm": 0.330078125, + "learning_rate": 4.221724014651151e-06, + "loss": 0.7502, "step": 7405 }, { "epoch": 9.165120593692022, - "grad_norm": 0.4609375, - "learning_rate": 7.873579584921869e-05, - "loss": 0.8705, + "grad_norm": 0.341796875, + "learning_rate": 4.159847281675411e-06, + "loss": 0.7637, "step": 7410 }, { "epoch": 9.1713048855906, - "grad_norm": 0.45703125, - "learning_rate": 7.859510723109003e-05, - "loss": 0.8737, + "grad_norm": 0.34375, + "learning_rate": 4.098417725580006e-06, + "loss": 0.7642, "step": 7415 }, { "epoch": 9.177489177489177, - "grad_norm": 0.421875, - "learning_rate": 7.84544630006445e-05, - "loss": 0.8651, + "grad_norm": 0.333984375, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7588, "step": 7420 }, { "epoch": 9.183673469387756, - "grad_norm": 0.455078125, - "learning_rate": 7.831386344953836e-05, - "loss": 0.8513, + "grad_norm": 0.34375, + "learning_rate": 3.976901288429691e-06, + "loss": 0.7466, "step": 7425 }, { "epoch": 9.189857761286333, - "grad_norm": 0.447265625, - "learning_rate": 7.817330886933527e-05, - "loss": 0.8665, + "grad_norm": 0.349609375, + "learning_rate": 3.916814974353633e-06, + "loss": 0.7612, "step": 7430 }, { "epoch": 9.196042053184911, - "grad_norm": 0.447265625, - "learning_rate": 7.803279955150558e-05, - "loss": 0.8645, + "grad_norm": 0.34375, + "learning_rate": 3.857176971113019e-06, + "loss": 0.758, "step": 7435 }, { "epoch": 9.202226345083488, - "grad_norm": 0.4375, - "learning_rate": 7.789233578742582e-05, - "loss": 0.8693, + "grad_norm": 0.345703125, + "learning_rate": 3.797987556970495e-06, + "loss": 0.761, "step": 7440 }, { "epoch": 9.208410636982066, - "grad_norm": 0.4375, - "learning_rate": 7.775191786837807e-05, - "loss": 0.8573, + "grad_norm": 0.337890625, + "learning_rate": 3.7392470080957033e-06, + "loss": 0.7517, "step": 7445 }, { "epoch": 9.214594928880643, - "grad_norm": 0.427734375, - "learning_rate": 7.761154608554927e-05, - "loss": 0.8614, + "grad_norm": 0.34375, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.7572, "step": 7450 }, { "epoch": 9.220779220779221, - "grad_norm": 0.5078125, - "learning_rate": 7.747122073003075e-05, - "loss": 0.8628, + "grad_norm": 0.357421875, + "learning_rate": 3.6231136003547106e-06, + "loss": 0.7568, "step": 7455 }, { "epoch": 9.226963512677798, - "grad_norm": 0.439453125, - "learning_rate": 7.733094209281756e-05, - "loss": 0.8563, + "grad_norm": 0.3359375, + "learning_rate": 3.565721283350931e-06, + "loss": 0.7516, "step": 7460 }, { "epoch": 9.233147804576376, - "grad_norm": 0.44921875, - "learning_rate": 7.719071046480776e-05, - "loss": 0.8668, + "grad_norm": 0.34765625, + "learning_rate": 3.5087789153371187e-06, + "loss": 0.7611, "step": 7465 }, { "epoch": 9.239332096474953, - "grad_norm": 0.43359375, - "learning_rate": 7.705052613680211e-05, - "loss": 0.8607, + "grad_norm": 0.341796875, + "learning_rate": 3.452286761998491e-06, + "loss": 0.7555, "step": 7470 }, { "epoch": 9.245516388373531, - "grad_norm": 0.439453125, - "learning_rate": 7.691038939950316e-05, - "loss": 0.8543, + "grad_norm": 0.337890625, + "learning_rate": 3.396245086919636e-06, + "loss": 0.749, "step": 7475 }, { "epoch": 9.251700680272108, - "grad_norm": 0.44921875, - "learning_rate": 7.677030054351477e-05, - "loss": 0.858, + "grad_norm": 0.33984375, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.7539, "step": 7480 }, { "epoch": 9.257884972170686, - "grad_norm": 0.419921875, - "learning_rate": 7.663025985934158e-05, - "loss": 0.8605, + "grad_norm": 0.34375, + "learning_rate": 3.2855142153688457e-06, + "loss": 0.7569, "step": 7485 }, { "epoch": 9.264069264069263, - "grad_norm": 0.4609375, - "learning_rate": 7.649026763738827e-05, - "loss": 0.8628, + "grad_norm": 0.34375, + "learning_rate": 3.2308255355518403e-06, + "loss": 0.7589, "step": 7490 }, { "epoch": 9.270253555967841, - "grad_norm": 0.48046875, - "learning_rate": 7.635032416795905e-05, - "loss": 0.8618, + "grad_norm": 0.33984375, + "learning_rate": 3.1765883673019914e-06, + "loss": 0.7575, "step": 7495 }, { "epoch": 9.276437847866418, - "grad_norm": 0.455078125, - "learning_rate": 7.6210429741257e-05, - "loss": 0.8675, + "grad_norm": 0.345703125, + "learning_rate": 3.1228029636824475e-06, + "loss": 0.7598, "step": 7500 }, { "epoch": 9.282622139764998, - "grad_norm": 0.423828125, - "learning_rate": 7.607058464738357e-05, - "loss": 0.8614, + "grad_norm": 0.3359375, + "learning_rate": 3.0694695756484205e-06, + "loss": 0.7557, "step": 7505 }, { "epoch": 9.288806431663575, - "grad_norm": 0.453125, - "learning_rate": 7.593078917633787e-05, - "loss": 0.8658, + "grad_norm": 0.34765625, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.7577, "step": 7510 }, { "epoch": 9.294990723562153, - "grad_norm": 0.443359375, - "learning_rate": 7.579104361801605e-05, - "loss": 0.8552, + "grad_norm": 0.34375, + "learning_rate": 2.96415983961158e-06, + "loss": 0.7531, "step": 7515 }, { "epoch": 9.30117501546073, - "grad_norm": 0.4765625, - "learning_rate": 7.565134826221083e-05, - "loss": 0.8667, + "grad_norm": 0.34765625, + "learning_rate": 2.912183982969385e-06, + "loss": 0.7611, "step": 7520 }, { "epoch": 9.307359307359308, - "grad_norm": 0.4453125, - "learning_rate": 7.551170339861083e-05, - "loss": 0.8498, + "grad_norm": 0.337890625, + "learning_rate": 2.860661124631725e-06, + "loss": 0.7458, "step": 7525 }, { "epoch": 9.313543599257885, - "grad_norm": 0.48828125, - "learning_rate": 7.537210931679987e-05, - "loss": 0.8498, + "grad_norm": 0.34375, + "learning_rate": 2.809591504997111e-06, + "loss": 0.7474, "step": 7530 }, { "epoch": 9.319727891156463, - "grad_norm": 0.466796875, - "learning_rate": 7.523256630625657e-05, - "loss": 0.8632, + "grad_norm": 0.341796875, + "learning_rate": 2.7589753623493142e-06, + "loss": 0.758, "step": 7535 }, { "epoch": 9.32591218305504, - "grad_norm": 0.412109375, - "learning_rate": 7.509307465635358e-05, - "loss": 0.8497, + "grad_norm": 0.33203125, + "learning_rate": 2.708812932856253e-06, + "loss": 0.7467, "step": 7540 }, { "epoch": 9.332096474953618, - "grad_norm": 0.466796875, - "learning_rate": 7.495363465635708e-05, - "loss": 0.8685, + "grad_norm": 0.33984375, + "learning_rate": 2.6591044505688833e-06, + "loss": 0.7613, "step": 7545 }, { "epoch": 9.338280766852195, - "grad_norm": 0.423828125, - "learning_rate": 7.481424659542609e-05, - "loss": 0.8521, + "grad_norm": 0.357421875, + "learning_rate": 2.6098501474200787e-06, + "loss": 0.7486, "step": 7550 }, { "epoch": 9.344465058750773, - "grad_norm": 0.41796875, - "learning_rate": 7.467491076261197e-05, - "loss": 0.8644, + "grad_norm": 0.33984375, + "learning_rate": 2.561050253223618e-06, + "loss": 0.7575, "step": 7555 }, { "epoch": 9.35064935064935, - "grad_norm": 0.431640625, - "learning_rate": 7.453562744685778e-05, - "loss": 0.8676, + "grad_norm": 0.3515625, + "learning_rate": 2.5127049956730207e-06, + "loss": 0.7614, "step": 7560 }, { "epoch": 9.356833642547928, - "grad_norm": 0.44140625, - "learning_rate": 7.439639693699763e-05, - "loss": 0.8689, + "grad_norm": 0.3359375, + "learning_rate": 2.4648146003405925e-06, + "loss": 0.7606, "step": 7565 }, { "epoch": 9.363017934446505, - "grad_norm": 0.451171875, - "learning_rate": 7.425721952175618e-05, - "loss": 0.8636, + "grad_norm": 0.341796875, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.7555, "step": 7570 }, { "epoch": 9.369202226345083, - "grad_norm": 0.427734375, - "learning_rate": 7.411809548974792e-05, - "loss": 0.8494, + "grad_norm": 0.337890625, + "learning_rate": 2.3703992880066638e-06, + "loss": 0.7457, "step": 7575 }, { "epoch": 9.375386518243662, - "grad_norm": 0.431640625, - "learning_rate": 7.39790251294767e-05, - "loss": 0.8693, + "grad_norm": 0.337890625, + "learning_rate": 2.3238748115339324e-06, + "loss": 0.7602, "step": 7580 }, { "epoch": 9.38157081014224, - "grad_norm": 0.45703125, - "learning_rate": 7.384000872933506e-05, - "loss": 0.8667, + "grad_norm": 0.345703125, + "learning_rate": 2.277806078334843e-06, + "loss": 0.7638, "step": 7585 }, { "epoch": 9.387755102040817, - "grad_norm": 0.4375, - "learning_rate": 7.370104657760361e-05, - "loss": 0.8572, + "grad_norm": 0.33984375, + "learning_rate": 2.232193303359742e-06, + "loss": 0.7501, "step": 7590 }, { "epoch": 9.393939393939394, - "grad_norm": 0.47265625, - "learning_rate": 7.356213896245046e-05, - "loss": 0.8636, + "grad_norm": 0.353515625, + "learning_rate": 2.1870366994315106e-06, + "loss": 0.759, "step": 7595 }, { "epoch": 9.400123685837972, - "grad_norm": 0.43359375, - "learning_rate": 7.342328617193067e-05, - "loss": 0.8729, + "grad_norm": 0.34375, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.7654, "step": 7600 }, { "epoch": 9.40630797773655, - "grad_norm": 0.435546875, - "learning_rate": 7.328448849398558e-05, - "loss": 0.8612, + "grad_norm": 0.3359375, + "learning_rate": 2.0980928453640637e-06, + "loss": 0.7575, "step": 7605 }, { "epoch": 9.412492269635127, - "grad_norm": 0.439453125, - "learning_rate": 7.314574621644225e-05, - "loss": 0.8672, + "grad_norm": 0.34375, + "learning_rate": 2.0543060102245717e-06, + "loss": 0.7584, "step": 7610 }, { "epoch": 9.418676561533704, - "grad_norm": 0.431640625, - "learning_rate": 7.300705962701287e-05, - "loss": 0.8604, + "grad_norm": 0.33203125, + "learning_rate": 2.0109761761294087e-06, + "loss": 0.7539, "step": 7615 }, { "epoch": 9.424860853432282, - "grad_norm": 0.4921875, - "learning_rate": 7.286842901329412e-05, - "loss": 0.854, + "grad_norm": 0.34765625, + "learning_rate": 1.968103545249611e-06, + "loss": 0.7491, "step": 7620 }, { "epoch": 9.43104514533086, - "grad_norm": 0.458984375, - "learning_rate": 7.272985466276661e-05, - "loss": 0.8513, + "grad_norm": 0.337890625, + "learning_rate": 1.9256883176229202e-06, + "loss": 0.7501, "step": 7625 }, { "epoch": 9.437229437229437, - "grad_norm": 0.4375, - "learning_rate": 7.259133686279429e-05, - "loss": 0.8654, + "grad_norm": 0.341796875, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.7571, "step": 7630 }, { "epoch": 9.443413729128014, - "grad_norm": 0.443359375, - "learning_rate": 7.245287590062384e-05, - "loss": 0.8533, + "grad_norm": 0.341796875, + "learning_rate": 1.8422308616080853e-06, + "loss": 0.7482, "step": 7635 }, { "epoch": 9.449598021026592, - "grad_norm": 0.50390625, - "learning_rate": 7.231447206338407e-05, - "loss": 0.8777, + "grad_norm": 0.34375, + "learning_rate": 1.8011890226208527e-06, + "loss": 0.7693, "step": 7640 }, { "epoch": 9.45578231292517, - "grad_norm": 0.4453125, - "learning_rate": 7.217612563808528e-05, - "loss": 0.8644, + "grad_norm": 0.34765625, + "learning_rate": 1.760605365686785e-06, + "loss": 0.7588, "step": 7645 }, { "epoch": 9.461966604823747, - "grad_norm": 0.4453125, - "learning_rate": 7.203783691161883e-05, - "loss": 0.8646, + "grad_norm": 0.34375, + "learning_rate": 1.7204800801636e-06, + "loss": 0.7564, "step": 7650 }, { "epoch": 9.468150896722324, - "grad_norm": 0.4453125, - "learning_rate": 7.189960617075633e-05, - "loss": 0.8791, + "grad_norm": 0.34765625, + "learning_rate": 1.6808133532703163e-06, + "loss": 0.7676, "step": 7655 }, { "epoch": 9.474335188620904, - "grad_norm": 0.412109375, - "learning_rate": 7.176143370214914e-05, - "loss": 0.8591, + "grad_norm": 0.341796875, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.754, "step": 7660 }, { "epoch": 9.480519480519481, - "grad_norm": 0.45703125, - "learning_rate": 7.162331979232783e-05, - "loss": 0.8641, + "grad_norm": 0.3515625, + "learning_rate": 1.602856313550849e-06, + "loss": 0.7572, "step": 7665 }, { "epoch": 9.486703772418059, - "grad_norm": 0.43359375, - "learning_rate": 7.148526472770154e-05, - "loss": 0.8763, + "grad_norm": 0.33203125, + "learning_rate": 1.5645663644614172e-06, + "loss": 0.7688, "step": 7670 }, { "epoch": 9.492888064316636, - "grad_norm": 0.50390625, - "learning_rate": 7.134726879455734e-05, - "loss": 0.8668, + "grad_norm": 0.3515625, + "learning_rate": 1.5267357014737027e-06, + "loss": 0.7603, "step": 7675 }, { "epoch": 9.499072356215214, - "grad_norm": 0.43359375, - "learning_rate": 7.12093322790597e-05, - "loss": 0.8619, + "grad_norm": 0.3359375, + "learning_rate": 1.489364501100332e-06, + "loss": 0.7567, "step": 7680 }, { "epoch": 9.505256648113791, - "grad_norm": 0.427734375, - "learning_rate": 7.107145546724989e-05, - "loss": 0.8695, + "grad_norm": 0.33984375, + "learning_rate": 1.4524529377101358e-06, + "loss": 0.7631, "step": 7685 }, { "epoch": 9.511440940012369, - "grad_norm": 0.4921875, - "learning_rate": 7.09336386450453e-05, - "loss": 0.8523, + "grad_norm": 0.34765625, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.7482, "step": 7690 }, { "epoch": 9.517625231910946, - "grad_norm": 0.455078125, - "learning_rate": 7.079588209823906e-05, - "loss": 0.8676, + "grad_norm": 0.345703125, + "learning_rate": 1.3800094086309112e-06, + "loss": 0.7595, "step": 7695 }, { "epoch": 9.523809523809524, - "grad_norm": 0.44140625, - "learning_rate": 7.065818611249915e-05, - "loss": 0.8555, + "grad_norm": 0.33984375, + "learning_rate": 1.344477780953346e-06, + "loss": 0.7519, "step": 7700 }, { "epoch": 9.529993815708101, - "grad_norm": 0.4296875, - "learning_rate": 7.052055097336805e-05, - "loss": 0.8612, + "grad_norm": 0.34375, + "learning_rate": 1.3094064662803385e-06, + "loss": 0.7549, "step": 7705 }, { "epoch": 9.536178107606679, - "grad_norm": 0.447265625, - "learning_rate": 7.038297696626206e-05, - "loss": 0.8602, + "grad_norm": 0.341796875, + "learning_rate": 1.274795628249792e-06, + "loss": 0.7529, "step": 7710 }, { "epoch": 9.542362399505256, - "grad_norm": 0.431640625, - "learning_rate": 7.02454643764707e-05, - "loss": 0.8636, + "grad_norm": 0.34375, + "learning_rate": 1.2406454283510948e-06, + "loss": 0.7552, "step": 7715 }, { "epoch": 9.548546691403834, - "grad_norm": 0.423828125, - "learning_rate": 7.010801348915608e-05, - "loss": 0.8566, + "grad_norm": 0.333984375, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.749, "step": 7720 }, { "epoch": 9.554730983302411, - "grad_norm": 0.45703125, - "learning_rate": 6.99706245893524e-05, - "loss": 0.8603, + "grad_norm": 0.33984375, + "learning_rate": 1.173727578159589e-06, + "loss": 0.7532, "step": 7725 }, { "epoch": 9.560915275200989, - "grad_norm": 0.515625, - "learning_rate": 6.983329796196534e-05, - "loss": 0.858, + "grad_norm": 0.349609375, + "learning_rate": 1.1409602400962227e-06, + "loss": 0.7513, "step": 7730 }, { "epoch": 9.567099567099568, - "grad_norm": 0.455078125, - "learning_rate": 6.969603389177142e-05, - "loss": 0.858, + "grad_norm": 0.341796875, + "learning_rate": 1.1086541646220693e-06, + "loss": 0.7514, "step": 7735 }, { "epoch": 9.573283858998145, - "grad_norm": 0.421875, - "learning_rate": 6.955883266341741e-05, - "loss": 0.8738, + "grad_norm": 0.333984375, + "learning_rate": 1.076809502472831e-06, + "loss": 0.7645, "step": 7740 }, { "epoch": 9.579468150896723, - "grad_norm": 0.421875, - "learning_rate": 6.94216945614198e-05, - "loss": 0.8643, + "grad_norm": 0.333984375, + "learning_rate": 1.0454264022312644e-06, + "loss": 0.7566, "step": 7745 }, { "epoch": 9.5856524427953, - "grad_norm": 0.494140625, - "learning_rate": 6.928461987016413e-05, - "loss": 0.8708, + "grad_norm": 0.353515625, + "learning_rate": 1.014505010326583e-06, + "loss": 0.7652, "step": 7750 }, { "epoch": 9.591836734693878, - "grad_norm": 0.44921875, - "learning_rate": 6.914760887390452e-05, - "loss": 0.8582, + "grad_norm": 0.3515625, + "learning_rate": 9.840454710337122e-07, + "loss": 0.7521, "step": 7755 }, { "epoch": 9.598021026592455, - "grad_norm": 0.42578125, - "learning_rate": 6.901066185676295e-05, - "loss": 0.8593, + "grad_norm": 0.3359375, + "learning_rate": 9.540479264726676e-07, + "loss": 0.7529, "step": 7760 }, { "epoch": 9.604205318491033, - "grad_norm": 0.44140625, - "learning_rate": 6.887377910272869e-05, - "loss": 0.8639, + "grad_norm": 0.3359375, + "learning_rate": 9.245125166078005e-07, + "loss": 0.7574, "step": 7765 }, { "epoch": 9.61038961038961, - "grad_norm": 0.451171875, - "learning_rate": 6.873696089565786e-05, - "loss": 0.8507, + "grad_norm": 0.34375, + "learning_rate": 8.954393792472649e-07, + "loss": 0.7483, "step": 7770 }, { "epoch": 9.616573902288188, - "grad_norm": 0.45703125, - "learning_rate": 6.860020751927259e-05, - "loss": 0.8571, + "grad_norm": 0.341796875, + "learning_rate": 8.668286500422951e-07, + "loss": 0.7509, "step": 7775 }, { "epoch": 9.622758194186765, - "grad_norm": 0.474609375, - "learning_rate": 6.846351925716068e-05, - "loss": 0.8637, + "grad_norm": 0.33984375, + "learning_rate": 8.386804624865851e-07, + "loss": 0.7564, "step": 7780 }, { "epoch": 9.628942486085343, - "grad_norm": 0.443359375, - "learning_rate": 6.832689639277484e-05, - "loss": 0.8621, + "grad_norm": 0.34765625, + "learning_rate": 8.109949479156886e-07, + "loss": 0.7577, "step": 7785 }, { "epoch": 9.63512677798392, - "grad_norm": 0.419921875, - "learning_rate": 6.819033920943219e-05, - "loss": 0.8631, + "grad_norm": 0.345703125, + "learning_rate": 7.837722355063637e-07, + "loss": 0.7574, "step": 7790 }, { "epoch": 9.641311069882498, - "grad_norm": 0.4375, - "learning_rate": 6.805384799031361e-05, - "loss": 0.8645, + "grad_norm": 0.341796875, + "learning_rate": 7.570124522760402e-07, + "loss": 0.7579, "step": 7795 }, { "epoch": 9.647495361781075, - "grad_norm": 0.439453125, - "learning_rate": 6.791742301846326e-05, - "loss": 0.8617, + "grad_norm": 0.341796875, + "learning_rate": 7.307157230821426e-07, + "loss": 0.7525, "step": 7800 }, { "epoch": 9.653679653679653, - "grad_norm": 0.44140625, - "learning_rate": 6.778106457678785e-05, - "loss": 0.8613, + "grad_norm": 0.33984375, + "learning_rate": 7.048821706215792e-07, + "loss": 0.7549, "step": 7805 }, { "epoch": 9.65986394557823, - "grad_norm": 0.447265625, - "learning_rate": 6.764477294805615e-05, - "loss": 0.8543, + "grad_norm": 0.333984375, + "learning_rate": 6.7951191543012e-07, + "loss": 0.7468, "step": 7810 }, { "epoch": 9.666048237476808, - "grad_norm": 0.42578125, - "learning_rate": 6.750854841489842e-05, - "loss": 0.8599, + "grad_norm": 0.337890625, + "learning_rate": 6.546050758818756e-07, + "loss": 0.7565, "step": 7815 }, { "epoch": 9.672232529375387, - "grad_norm": 0.427734375, - "learning_rate": 6.737239125980573e-05, - "loss": 0.8622, + "grad_norm": 0.3515625, + "learning_rate": 6.301617681886863e-07, + "loss": 0.7556, "step": 7820 }, { "epoch": 9.678416821273965, - "grad_norm": 0.4296875, - "learning_rate": 6.723630176512944e-05, - "loss": 0.8587, + "grad_norm": 0.33203125, + "learning_rate": 6.061821063996665e-07, + "loss": 0.7546, "step": 7825 }, { "epoch": 9.684601113172542, - "grad_norm": 0.451171875, - "learning_rate": 6.710028021308061e-05, - "loss": 0.8655, + "grad_norm": 0.34375, + "learning_rate": 5.826662024005835e-07, + "loss": 0.7565, "step": 7830 }, { "epoch": 9.69078540507112, - "grad_norm": 0.4375, - "learning_rate": 6.696432688572937e-05, - "loss": 0.8626, + "grad_norm": 0.341796875, + "learning_rate": 5.596141659133913e-07, + "loss": 0.7576, "step": 7835 }, { "epoch": 9.696969696969697, - "grad_norm": 0.44921875, - "learning_rate": 6.682844206500445e-05, - "loss": 0.8685, + "grad_norm": 0.337890625, + "learning_rate": 5.370261044956971e-07, + "loss": 0.7583, "step": 7840 }, { "epoch": 9.703153988868275, - "grad_norm": 0.435546875, - "learning_rate": 6.669262603269246e-05, - "loss": 0.8599, + "grad_norm": 0.345703125, + "learning_rate": 5.149021235402729e-07, + "loss": 0.7512, "step": 7845 }, { "epoch": 9.709338280766852, - "grad_norm": 0.439453125, - "learning_rate": 6.655687907043734e-05, - "loss": 0.8549, + "grad_norm": 0.337890625, + "learning_rate": 4.932423262745456e-07, + "loss": 0.7478, "step": 7850 }, { "epoch": 9.71552257266543, - "grad_norm": 0.447265625, - "learning_rate": 6.642120145973985e-05, - "loss": 0.8692, + "grad_norm": 0.345703125, + "learning_rate": 4.7204681376014084e-07, + "loss": 0.7614, "step": 7855 }, { "epoch": 9.721706864564007, - "grad_norm": 0.447265625, - "learning_rate": 6.62855934819569e-05, - "loss": 0.8663, + "grad_norm": 0.34765625, + "learning_rate": 4.5131568489236166e-07, + "loss": 0.7562, "step": 7860 }, { "epoch": 9.727891156462585, - "grad_norm": 0.486328125, - "learning_rate": 6.615005541830103e-05, - "loss": 0.8571, + "grad_norm": 0.365234375, + "learning_rate": 4.3104903639981097e-07, + "loss": 0.7494, "step": 7865 }, { "epoch": 9.734075448361162, - "grad_norm": 0.4453125, - "learning_rate": 6.601458754983978e-05, - "loss": 0.862, + "grad_norm": 0.337890625, + "learning_rate": 4.112469628438365e-07, + "loss": 0.7569, "step": 7870 }, { "epoch": 9.74025974025974, - "grad_norm": 0.419921875, - "learning_rate": 6.587919015749511e-05, - "loss": 0.8637, + "grad_norm": 0.341796875, + "learning_rate": 3.919095566181974e-07, + "loss": 0.7585, "step": 7875 }, { "epoch": 9.746444032158317, - "grad_norm": 0.4609375, - "learning_rate": 6.574386352204289e-05, - "loss": 0.8586, + "grad_norm": 0.3515625, + "learning_rate": 3.73036907948543e-07, + "loss": 0.7532, "step": 7880 }, { "epoch": 9.752628324056895, - "grad_norm": 0.44140625, - "learning_rate": 6.560860792411219e-05, - "loss": 0.8576, + "grad_norm": 0.3359375, + "learning_rate": 3.546291048920347e-07, + "loss": 0.7544, "step": 7885 }, { "epoch": 9.758812615955474, - "grad_norm": 0.435546875, - "learning_rate": 6.547342364418481e-05, - "loss": 0.8503, + "grad_norm": 0.34375, + "learning_rate": 3.366862333369358e-07, + "loss": 0.7473, "step": 7890 }, { "epoch": 9.764996907854051, - "grad_norm": 0.419921875, - "learning_rate": 6.533831096259467e-05, - "loss": 0.8615, + "grad_norm": 0.337890625, + "learning_rate": 3.192083770021892e-07, + "loss": 0.7525, "step": 7895 }, { "epoch": 9.771181199752629, - "grad_norm": 0.44140625, - "learning_rate": 6.520327015952713e-05, - "loss": 0.8564, + "grad_norm": 0.34765625, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.7504, "step": 7900 }, { "epoch": 9.777365491651206, - "grad_norm": 0.447265625, - "learning_rate": 6.506830151501861e-05, - "loss": 0.8571, + "grad_norm": 0.3515625, + "learning_rate": 2.856480340207579e-07, + "loss": 0.7495, "step": 7905 }, { "epoch": 9.783549783549784, - "grad_norm": 0.439453125, - "learning_rate": 6.493340530895583e-05, - "loss": 0.8671, + "grad_norm": 0.337890625, + "learning_rate": 2.6956570396197143e-07, + "loss": 0.7595, "step": 7910 }, { "epoch": 9.789734075448361, - "grad_norm": 0.451171875, - "learning_rate": 6.479858182107527e-05, - "loss": 0.8657, + "grad_norm": 0.3359375, + "learning_rate": 2.539487022986453e-07, + "loss": 0.7584, "step": 7915 }, { "epoch": 9.795918367346939, - "grad_norm": 0.47265625, - "learning_rate": 6.466383133096267e-05, - "loss": 0.8556, + "grad_norm": 0.34375, + "learning_rate": 2.3879710189753656e-07, + "loss": 0.7476, "step": 7920 }, { "epoch": 9.802102659245516, - "grad_norm": 0.4296875, - "learning_rate": 6.452915411805238e-05, - "loss": 0.8502, + "grad_norm": 0.34375, + "learning_rate": 2.2411097345392818e-07, + "loss": 0.7463, "step": 7925 }, { "epoch": 9.808286951144094, - "grad_norm": 0.42578125, - "learning_rate": 6.439455046162677e-05, - "loss": 0.8599, + "grad_norm": 0.337890625, + "learning_rate": 2.098903854912515e-07, + "loss": 0.7549, "step": 7930 }, { "epoch": 9.814471243042671, - "grad_norm": 0.44921875, - "learning_rate": 6.426002064081565e-05, - "loss": 0.8641, + "grad_norm": 0.34375, + "learning_rate": 1.9613540436080878e-07, + "loss": 0.7567, "step": 7935 }, { "epoch": 9.820655534941249, - "grad_norm": 0.5078125, - "learning_rate": 6.412556493459581e-05, - "loss": 0.8644, + "grad_norm": 0.353515625, + "learning_rate": 1.8284609424142895e-07, + "loss": 0.7578, "step": 7940 }, { "epoch": 9.826839826839826, - "grad_norm": 0.43359375, - "learning_rate": 6.399118362179028e-05, - "loss": 0.8574, + "grad_norm": 0.33984375, + "learning_rate": 1.7002251713920114e-07, + "loss": 0.7542, "step": 7945 }, { "epoch": 9.833024118738404, - "grad_norm": 0.451171875, - "learning_rate": 6.385687698106781e-05, - "loss": 0.8651, + "grad_norm": 0.345703125, + "learning_rate": 1.5766473288715278e-07, + "loss": 0.7565, "step": 7950 }, { "epoch": 9.839208410636981, - "grad_norm": 0.494140625, - "learning_rate": 6.372264529094233e-05, - "loss": 0.8756, + "grad_norm": 0.3515625, + "learning_rate": 1.457727991449942e-07, + "loss": 0.7649, "step": 7955 }, { "epoch": 9.845392702535559, - "grad_norm": 0.46484375, - "learning_rate": 6.358848882977233e-05, - "loss": 0.8606, + "grad_norm": 0.345703125, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.753, "step": 7960 }, { "epoch": 9.851576994434136, - "grad_norm": 0.470703125, - "learning_rate": 6.345440787576031e-05, - "loss": 0.8615, + "grad_norm": 0.349609375, + "learning_rate": 1.2338670296097034e-07, + "loss": 0.7556, "step": 7965 }, { "epoch": 9.857761286332714, - "grad_norm": 0.4453125, - "learning_rate": 6.332040270695219e-05, - "loss": 0.8724, + "grad_norm": 0.349609375, + "learning_rate": 1.1289264496953111e-07, + "loss": 0.7629, "step": 7970 }, { "epoch": 9.863945578231293, - "grad_norm": 0.44921875, - "learning_rate": 6.31864736012367e-05, - "loss": 0.8641, + "grad_norm": 0.345703125, + "learning_rate": 1.0286464638834536e-07, + "loss": 0.7587, "step": 7975 }, { "epoch": 9.87012987012987, - "grad_norm": 0.451171875, - "learning_rate": 6.305262083634488e-05, - "loss": 0.8693, + "grad_norm": 0.34765625, + "learning_rate": 9.330275400666332e-08, + "loss": 0.7626, "step": 7980 }, { "epoch": 9.876314162028448, - "grad_norm": 0.43359375, - "learning_rate": 6.291884468984941e-05, - "loss": 0.8602, + "grad_norm": 0.33984375, + "learning_rate": 8.420701243895268e-08, + "loss": 0.7528, "step": 7985 }, { "epoch": 9.882498453927026, - "grad_norm": 0.443359375, - "learning_rate": 6.278514543916415e-05, - "loss": 0.8574, + "grad_norm": 0.34765625, + "learning_rate": 7.557746412468758e-08, + "loss": 0.7496, "step": 7990 }, { "epoch": 9.888682745825603, - "grad_norm": 0.431640625, - "learning_rate": 6.265152336154345e-05, - "loss": 0.8529, + "grad_norm": 0.3359375, + "learning_rate": 6.741414932813773e-08, + "loss": 0.7468, "step": 7995 }, { "epoch": 9.89486703772418, - "grad_norm": 0.482421875, - "learning_rate": 6.251797873408161e-05, - "loss": 0.8701, + "grad_norm": 0.34765625, + "learning_rate": 5.971710613821291e-08, + "loss": 0.7611, "step": 8000 }, { "epoch": 9.901051329622758, - "grad_norm": 0.45703125, - "learning_rate": 6.238451183371241e-05, - "loss": 0.8587, + "grad_norm": 0.333984375, + "learning_rate": 5.248637046824101e-08, + "loss": 0.753, "step": 8005 }, { "epoch": 9.907235621521336, - "grad_norm": 0.44140625, - "learning_rate": 6.225112293720836e-05, - "loss": 0.858, + "grad_norm": 0.33984375, + "learning_rate": 4.572197605583473e-08, + "loss": 0.7515, "step": 8010 }, { "epoch": 9.913419913419913, - "grad_norm": 0.466796875, - "learning_rate": 6.211781232118025e-05, - "loss": 0.8736, + "grad_norm": 0.365234375, + "learning_rate": 3.9423954462713964e-08, + "loss": 0.7673, "step": 8015 }, { "epoch": 9.91960420531849, - "grad_norm": 0.4453125, - "learning_rate": 6.198458026207652e-05, - "loss": 0.8559, + "grad_norm": 0.3359375, + "learning_rate": 3.359233507459481e-08, + "loss": 0.7515, "step": 8020 }, { "epoch": 9.925788497217068, - "grad_norm": 0.458984375, - "learning_rate": 6.18514270361827e-05, - "loss": 0.8657, + "grad_norm": 0.349609375, + "learning_rate": 2.8227145100989672e-08, + "loss": 0.7569, "step": 8025 }, { "epoch": 9.931972789115646, - "grad_norm": 0.431640625, - "learning_rate": 6.171835291962088e-05, - "loss": 0.8569, + "grad_norm": 0.341796875, + "learning_rate": 2.3328409575129608e-08, + "loss": 0.7495, "step": 8030 }, { "epoch": 9.938157081014223, - "grad_norm": 0.431640625, - "learning_rate": 6.158535818834906e-05, - "loss": 0.8585, + "grad_norm": 0.341796875, + "learning_rate": 1.8896151353853253e-08, + "loss": 0.7516, "step": 8035 }, { "epoch": 9.9443413729128, - "grad_norm": 0.451171875, - "learning_rate": 6.145244311816063e-05, - "loss": 0.863, + "grad_norm": 0.3515625, + "learning_rate": 1.4930391117451426e-08, + "loss": 0.7551, "step": 8040 }, { "epoch": 9.95052566481138, - "grad_norm": 0.46484375, - "learning_rate": 6.13196079846838e-05, - "loss": 0.8619, + "grad_norm": 0.34375, + "learning_rate": 1.1431147369611595e-08, + "loss": 0.7553, "step": 8045 }, { "epoch": 9.956709956709958, - "grad_norm": 0.4375, - "learning_rate": 6.1186853063381e-05, - "loss": 0.8664, + "grad_norm": 0.33203125, + "learning_rate": 8.398436437317969e-09, + "loss": 0.759, "step": 8050 }, { "epoch": 9.962894248608535, - "grad_norm": 0.43359375, - "learning_rate": 6.105417862954828e-05, - "loss": 0.8636, + "grad_norm": 0.3359375, + "learning_rate": 5.832272470795985e-09, + "loss": 0.7562, "step": 8055 }, { "epoch": 9.969078540507113, - "grad_norm": 0.431640625, - "learning_rate": 6.092158495831486e-05, - "loss": 0.8536, + "grad_norm": 0.3359375, + "learning_rate": 3.732667443390181e-09, + "loss": 0.7497, "step": 8060 }, { "epoch": 9.97526283240569, - "grad_norm": 0.44921875, - "learning_rate": 6.078907232464248e-05, - "loss": 0.8701, + "grad_norm": 0.353515625, + "learning_rate": 2.099631151586401e-09, + "loss": 0.7637, "step": 8065 }, { "epoch": 9.981447124304268, - "grad_norm": 0.421875, - "learning_rate": 6.065664100332478e-05, - "loss": 0.8686, + "grad_norm": 0.33984375, + "learning_rate": 9.33171214889672e-10, + "loss": 0.7618, "step": 8070 }, { "epoch": 9.987631416202845, - "grad_norm": 0.421875, - "learning_rate": 6.0524291268986766e-05, - "loss": 0.8626, + "grad_norm": 0.337890625, + "learning_rate": 2.3329307584640804e-10, + "loss": 0.7574, "step": 8075 }, { "epoch": 9.993815708101423, - "grad_norm": 0.419921875, - "learning_rate": 6.039202339608432e-05, - "loss": 0.8695, + "grad_norm": 0.337890625, + "learning_rate": 0.0, + "loss": 0.7647, "step": 8080 }, { - "epoch": 10.0, - "grad_norm": 0.462890625, - "learning_rate": 6.025983765890353e-05, - "loss": 0.8421, - "step": 8085 - }, - { - "epoch": 10.0, - "eval_loss": 2.518721103668213, - "eval_runtime": 0.5383, - "eval_samples_per_second": 18.576, - "eval_steps_per_second": 1.858, - "step": 8085 + "epoch": 9.993815708101423, + "eval_loss": 2.699876308441162, + "eval_runtime": 0.5372, + "eval_samples_per_second": 18.614, + "eval_steps_per_second": 1.861, + "step": 8080 }, { - "epoch": 10.006184291898577, - "grad_norm": 0.443359375, - "learning_rate": 6.012773433156017e-05, - "loss": 0.8602, - "step": 8090 - }, - { - "epoch": 10.012368583797155, - "grad_norm": 0.46484375, - "learning_rate": 5.99957136879991e-05, - "loss": 0.8548, - "step": 8095 - }, - { - "epoch": 10.018552875695732, - "grad_norm": 0.451171875, - "learning_rate": 5.986377600199371e-05, - "loss": 0.8597, - "step": 8100 - }, - { - "epoch": 10.02473716759431, - "grad_norm": 0.423828125, - "learning_rate": 5.973192154714547e-05, - "loss": 0.8643, - "step": 8105 - }, - { - "epoch": 10.030921459492887, - "grad_norm": 0.4453125, - "learning_rate": 5.9600150596883066e-05, - "loss": 0.8591, - "step": 8110 - }, - { - "epoch": 10.037105751391465, - "grad_norm": 0.462890625, - "learning_rate": 5.946846342446214e-05, - "loss": 0.8699, - "step": 8115 - }, - { - "epoch": 10.043290043290042, - "grad_norm": 0.427734375, - "learning_rate": 5.933686030296459e-05, - "loss": 0.8558, - "step": 8120 - }, - { - "epoch": 10.049474335188622, - "grad_norm": 0.439453125, - "learning_rate": 5.920534150529797e-05, - "loss": 0.8567, - "step": 8125 - }, - { - "epoch": 10.0556586270872, - "grad_norm": 0.46875, - "learning_rate": 5.907390730419507e-05, - "loss": 0.8566, - "step": 8130 - }, - { - "epoch": 10.061842918985777, - "grad_norm": 0.427734375, - "learning_rate": 5.894255797221313e-05, - "loss": 0.8462, - "step": 8135 - }, - { - "epoch": 10.068027210884354, - "grad_norm": 0.4375, - "learning_rate": 5.881129378173347e-05, - "loss": 0.8521, - "step": 8140 - }, - { - "epoch": 10.074211502782932, - "grad_norm": 0.490234375, - "learning_rate": 5.868011500496084e-05, - "loss": 0.858, - "step": 8145 - }, - { - "epoch": 10.08039579468151, - "grad_norm": 0.453125, - "learning_rate": 5.854902191392284e-05, - "loss": 0.8568, - "step": 8150 - }, - { - "epoch": 10.086580086580087, - "grad_norm": 0.474609375, - "learning_rate": 5.84180147804694e-05, - "loss": 0.864, - "step": 8155 - }, - { - "epoch": 10.092764378478664, - "grad_norm": 0.474609375, - "learning_rate": 5.828709387627218e-05, - "loss": 0.8633, - "step": 8160 - }, - { - "epoch": 10.098948670377242, - "grad_norm": 0.44921875, - "learning_rate": 5.8156259472824124e-05, - "loss": 0.8608, - "step": 8165 - }, - { - "epoch": 10.10513296227582, - "grad_norm": 0.447265625, - "learning_rate": 5.802551184143865e-05, - "loss": 0.8512, - "step": 8170 - }, - { - "epoch": 10.111317254174397, - "grad_norm": 0.44921875, - "learning_rate": 5.789485125324926e-05, - "loss": 0.8572, - "step": 8175 - }, - { - "epoch": 10.117501546072974, - "grad_norm": 0.4453125, - "learning_rate": 5.7764277979209094e-05, - "loss": 0.8498, - "step": 8180 - }, - { - "epoch": 10.123685837971552, - "grad_norm": 0.4453125, - "learning_rate": 5.763379229009003e-05, - "loss": 0.8596, - "step": 8185 - }, - { - "epoch": 10.12987012987013, - "grad_norm": 0.427734375, - "learning_rate": 5.750339445648252e-05, - "loss": 0.8667, - "step": 8190 - }, - { - "epoch": 10.136054421768707, - "grad_norm": 0.453125, - "learning_rate": 5.7373084748794626e-05, - "loss": 0.8661, - "step": 8195 - }, - { - "epoch": 10.142238713667284, - "grad_norm": 0.451171875, - "learning_rate": 5.724286343725185e-05, - "loss": 0.8522, - "step": 8200 - }, - { - "epoch": 10.148423005565864, - "grad_norm": 0.4375, - "learning_rate": 5.7112730791896207e-05, - "loss": 0.863, - "step": 8205 - }, - { - "epoch": 10.154607297464441, - "grad_norm": 0.43359375, - "learning_rate": 5.6982687082585994e-05, - "loss": 0.8652, - "step": 8210 - }, - { - "epoch": 10.160791589363019, - "grad_norm": 0.43359375, - "learning_rate": 5.685273257899505e-05, - "loss": 0.8557, - "step": 8215 - }, - { - "epoch": 10.166975881261596, - "grad_norm": 0.431640625, - "learning_rate": 5.6722867550612116e-05, - "loss": 0.8567, - "step": 8220 - }, - { - "epoch": 10.173160173160174, - "grad_norm": 0.45703125, - "learning_rate": 5.6593092266740545e-05, - "loss": 0.8569, - "step": 8225 - }, - { - "epoch": 10.179344465058751, - "grad_norm": 0.44921875, - "learning_rate": 5.6463406996497456e-05, - "loss": 0.8591, - "step": 8230 - }, - { - "epoch": 10.185528756957329, - "grad_norm": 0.4453125, - "learning_rate": 5.633381200881335e-05, - "loss": 0.8581, - "step": 8235 - }, - { - "epoch": 10.191713048855906, - "grad_norm": 0.427734375, - "learning_rate": 5.620430757243156e-05, - "loss": 0.8579, - "step": 8240 - }, - { - "epoch": 10.197897340754484, - "grad_norm": 0.458984375, - "learning_rate": 5.6074893955907535e-05, - "loss": 0.8621, - "step": 8245 - }, - { - "epoch": 10.204081632653061, - "grad_norm": 0.4140625, - "learning_rate": 5.5945571427608526e-05, - "loss": 0.8627, - "step": 8250 - }, - { - "epoch": 10.210265924551639, - "grad_norm": 0.421875, - "learning_rate": 5.581634025571274e-05, - "loss": 0.8577, - "step": 8255 - }, - { - "epoch": 10.216450216450216, - "grad_norm": 0.45703125, - "learning_rate": 5.5687200708209076e-05, - "loss": 0.8604, - "step": 8260 - }, - { - "epoch": 10.222634508348794, - "grad_norm": 0.451171875, - "learning_rate": 5.555815305289631e-05, - "loss": 0.8684, - "step": 8265 - }, - { - "epoch": 10.228818800247371, - "grad_norm": 0.44140625, - "learning_rate": 5.542919755738275e-05, - "loss": 0.8599, - "step": 8270 - }, - { - "epoch": 10.235003092145949, - "grad_norm": 0.515625, - "learning_rate": 5.5300334489085595e-05, - "loss": 0.8535, - "step": 8275 - }, - { - "epoch": 10.241187384044528, - "grad_norm": 0.421875, - "learning_rate": 5.5171564115230254e-05, - "loss": 0.8628, - "step": 8280 - }, - { - "epoch": 10.247371675943105, - "grad_norm": 0.42578125, - "learning_rate": 5.504288670285008e-05, - "loss": 0.8571, - "step": 8285 - }, - { - "epoch": 10.253555967841683, - "grad_norm": 0.43359375, - "learning_rate": 5.491430251878551e-05, - "loss": 0.8539, - "step": 8290 - }, - { - "epoch": 10.25974025974026, - "grad_norm": 0.443359375, - "learning_rate": 5.4785811829683764e-05, - "loss": 0.8602, - "step": 8295 - }, - { - "epoch": 10.265924551638838, - "grad_norm": 0.44921875, - "learning_rate": 5.4657414901998095e-05, - "loss": 0.8619, - "step": 8300 - }, - { - "epoch": 10.272108843537415, - "grad_norm": 0.427734375, - "learning_rate": 5.4529112001987314e-05, - "loss": 0.8605, - "step": 8305 - }, - { - "epoch": 10.278293135435993, - "grad_norm": 0.416015625, - "learning_rate": 5.4400903395715366e-05, - "loss": 0.8549, - "step": 8310 - }, - { - "epoch": 10.28447742733457, - "grad_norm": 0.486328125, - "learning_rate": 5.427278934905049e-05, - "loss": 0.8649, - "step": 8315 - }, - { - "epoch": 10.290661719233148, - "grad_norm": 0.451171875, - "learning_rate": 5.4144770127665024e-05, - "loss": 0.8582, - "step": 8320 - }, - { - "epoch": 10.296846011131725, - "grad_norm": 0.4140625, - "learning_rate": 5.401684599703445e-05, - "loss": 0.8613, - "step": 8325 - }, - { - "epoch": 10.303030303030303, - "grad_norm": 0.44140625, - "learning_rate": 5.388901722243724e-05, - "loss": 0.8498, - "step": 8330 - }, - { - "epoch": 10.30921459492888, - "grad_norm": 0.43359375, - "learning_rate": 5.376128406895408e-05, - "loss": 0.8547, - "step": 8335 - }, - { - "epoch": 10.315398886827458, - "grad_norm": 0.43359375, - "learning_rate": 5.363364680146725e-05, - "loss": 0.8501, - "step": 8340 - }, - { - "epoch": 10.321583178726035, - "grad_norm": 0.4453125, - "learning_rate": 5.350610568466039e-05, - "loss": 0.8616, - "step": 8345 - }, - { - "epoch": 10.327767470624613, - "grad_norm": 0.44140625, - "learning_rate": 5.3378660983017536e-05, - "loss": 0.8573, - "step": 8350 - }, - { - "epoch": 10.33395176252319, - "grad_norm": 0.4375, - "learning_rate": 5.325131296082298e-05, - "loss": 0.8596, - "step": 8355 - }, - { - "epoch": 10.34013605442177, - "grad_norm": 0.423828125, - "learning_rate": 5.31240618821604e-05, - "loss": 0.8661, - "step": 8360 - }, - { - "epoch": 10.346320346320347, - "grad_norm": 0.431640625, - "learning_rate": 5.2996908010912437e-05, - "loss": 0.8436, - "step": 8365 - }, - { - "epoch": 10.352504638218925, - "grad_norm": 0.4296875, - "learning_rate": 5.286985161076029e-05, - "loss": 0.8591, - "step": 8370 - }, - { - "epoch": 10.358688930117502, - "grad_norm": 0.4375, - "learning_rate": 5.274289294518283e-05, - "loss": 0.8619, - "step": 8375 - }, - { - "epoch": 10.36487322201608, - "grad_norm": 0.45703125, - "learning_rate": 5.2616032277456463e-05, - "loss": 0.8588, - "step": 8380 - }, - { - "epoch": 10.371057513914657, - "grad_norm": 0.4375, - "learning_rate": 5.248926987065417e-05, - "loss": 0.8616, - "step": 8385 - }, - { - "epoch": 10.377241805813235, - "grad_norm": 0.435546875, - "learning_rate": 5.236260598764535e-05, - "loss": 0.8631, - "step": 8390 - }, - { - "epoch": 10.383426097711812, - "grad_norm": 0.44140625, - "learning_rate": 5.223604089109495e-05, - "loss": 0.8628, - "step": 8395 - }, - { - "epoch": 10.38961038961039, - "grad_norm": 0.453125, - "learning_rate": 5.210957484346314e-05, - "loss": 0.8553, - "step": 8400 - }, - { - "epoch": 10.395794681508967, - "grad_norm": 0.41796875, - "learning_rate": 5.198320810700472e-05, - "loss": 0.8551, - "step": 8405 - }, - { - "epoch": 10.401978973407545, - "grad_norm": 0.42578125, - "learning_rate": 5.185694094376843e-05, - "loss": 0.8565, - "step": 8410 - }, - { - "epoch": 10.408163265306122, - "grad_norm": 0.439453125, - "learning_rate": 5.173077361559665e-05, - "loss": 0.8714, - "step": 8415 - }, - { - "epoch": 10.4143475572047, - "grad_norm": 0.4609375, - "learning_rate": 5.160470638412461e-05, - "loss": 0.8585, - "step": 8420 - }, - { - "epoch": 10.420531849103277, - "grad_norm": 0.431640625, - "learning_rate": 5.1478739510780104e-05, - "loss": 0.8488, - "step": 8425 - }, - { - "epoch": 10.426716141001855, - "grad_norm": 0.462890625, - "learning_rate": 5.135287325678271e-05, - "loss": 0.8691, - "step": 8430 - }, - { - "epoch": 10.432900432900432, - "grad_norm": 0.4453125, - "learning_rate": 5.122710788314331e-05, - "loss": 0.8578, - "step": 8435 - }, - { - "epoch": 10.439084724799011, - "grad_norm": 0.44921875, - "learning_rate": 5.1101443650663764e-05, - "loss": 0.8591, - "step": 8440 - }, - { - "epoch": 10.445269016697589, - "grad_norm": 0.4296875, - "learning_rate": 5.0975880819936004e-05, - "loss": 0.8588, - "step": 8445 - }, - { - "epoch": 10.451453308596166, - "grad_norm": 0.4453125, - "learning_rate": 5.085041965134183e-05, - "loss": 0.8542, - "step": 8450 - }, - { - "epoch": 10.457637600494744, - "grad_norm": 0.41796875, - "learning_rate": 5.072506040505208e-05, - "loss": 0.8595, - "step": 8455 - }, - { - "epoch": 10.463821892393321, - "grad_norm": 0.44921875, - "learning_rate": 5.059980334102637e-05, - "loss": 0.8456, - "step": 8460 - }, - { - "epoch": 10.470006184291899, - "grad_norm": 0.439453125, - "learning_rate": 5.04746487190124e-05, - "loss": 0.855, - "step": 8465 - }, - { - "epoch": 10.476190476190476, - "grad_norm": 0.435546875, - "learning_rate": 5.034959679854532e-05, - "loss": 0.8573, - "step": 8470 - }, - { - "epoch": 10.482374768089054, - "grad_norm": 0.4375, - "learning_rate": 5.022464783894744e-05, - "loss": 0.8549, - "step": 8475 - }, - { - "epoch": 10.488559059987631, - "grad_norm": 0.46484375, - "learning_rate": 5.009980209932743e-05, - "loss": 0.8708, - "step": 8480 - }, - { - "epoch": 10.494743351886209, - "grad_norm": 0.453125, - "learning_rate": 4.9975059838580083e-05, - "loss": 0.8615, - "step": 8485 - }, - { - "epoch": 10.500927643784786, - "grad_norm": 0.421875, - "learning_rate": 4.985042131538545e-05, - "loss": 0.8629, - "step": 8490 - }, - { - "epoch": 10.507111935683364, - "grad_norm": 0.455078125, - "learning_rate": 4.9725886788208474e-05, - "loss": 0.8657, - "step": 8495 - }, - { - "epoch": 10.513296227581941, - "grad_norm": 0.439453125, - "learning_rate": 4.960145651529856e-05, - "loss": 0.8473, - "step": 8500 - }, - { - "epoch": 10.519480519480519, - "grad_norm": 0.419921875, - "learning_rate": 4.9477130754688775e-05, - "loss": 0.8669, - "step": 8505 - }, - { - "epoch": 10.525664811379096, - "grad_norm": 0.435546875, - "learning_rate": 4.9352909764195576e-05, - "loss": 0.8594, - "step": 8510 - }, - { - "epoch": 10.531849103277676, - "grad_norm": 0.46484375, - "learning_rate": 4.922879380141805e-05, - "loss": 0.8501, - "step": 8515 - }, - { - "epoch": 10.538033395176253, - "grad_norm": 0.466796875, - "learning_rate": 4.9104783123737566e-05, - "loss": 0.8597, - "step": 8520 - }, - { - "epoch": 10.54421768707483, - "grad_norm": 0.431640625, - "learning_rate": 4.898087798831716e-05, - "loss": 0.8558, - "step": 8525 - }, - { - "epoch": 10.550401978973408, - "grad_norm": 0.4375, - "learning_rate": 4.885707865210093e-05, - "loss": 0.8599, - "step": 8530 - }, - { - "epoch": 10.556586270871986, - "grad_norm": 0.427734375, - "learning_rate": 4.873338537181368e-05, - "loss": 0.8601, - "step": 8535 - }, - { - "epoch": 10.562770562770563, - "grad_norm": 0.44921875, - "learning_rate": 4.860979840396016e-05, - "loss": 0.8651, - "step": 8540 - }, - { - "epoch": 10.56895485466914, - "grad_norm": 0.44140625, - "learning_rate": 4.8486318004824794e-05, - "loss": 0.8599, - "step": 8545 - }, - { - "epoch": 10.575139146567718, - "grad_norm": 0.453125, - "learning_rate": 4.836294443047088e-05, - "loss": 0.8638, - "step": 8550 - }, - { - "epoch": 10.581323438466296, - "grad_norm": 0.435546875, - "learning_rate": 4.823967793674033e-05, - "loss": 0.8507, - "step": 8555 - }, - { - "epoch": 10.587507730364873, - "grad_norm": 0.4453125, - "learning_rate": 4.8116518779252885e-05, - "loss": 0.8627, - "step": 8560 - }, - { - "epoch": 10.59369202226345, - "grad_norm": 0.427734375, - "learning_rate": 4.7993467213405706e-05, - "loss": 0.8496, - "step": 8565 - }, - { - "epoch": 10.599876314162028, - "grad_norm": 0.421875, - "learning_rate": 4.787052349437295e-05, - "loss": 0.8644, - "step": 8570 - }, - { - "epoch": 10.606060606060606, - "grad_norm": 0.45703125, - "learning_rate": 4.774768787710501e-05, - "loss": 0.8613, - "step": 8575 - }, - { - "epoch": 10.612244897959183, - "grad_norm": 0.42578125, - "learning_rate": 4.762496061632814e-05, - "loss": 0.8637, - "step": 8580 - }, - { - "epoch": 10.61842918985776, - "grad_norm": 0.44140625, - "learning_rate": 4.7502341966544e-05, - "loss": 0.8614, - "step": 8585 - }, - { - "epoch": 10.624613481756338, - "grad_norm": 0.44140625, - "learning_rate": 4.7379832182028814e-05, - "loss": 0.8562, - "step": 8590 - }, - { - "epoch": 10.630797773654917, - "grad_norm": 0.4375, - "learning_rate": 4.725743151683325e-05, - "loss": 0.8573, - "step": 8595 - }, - { - "epoch": 10.636982065553495, - "grad_norm": 0.43359375, - "learning_rate": 4.713514022478155e-05, - "loss": 0.8611, - "step": 8600 - }, - { - "epoch": 10.643166357452072, - "grad_norm": 0.42578125, - "learning_rate": 4.701295855947126e-05, - "loss": 0.8604, - "step": 8605 - }, - { - "epoch": 10.64935064935065, - "grad_norm": 0.423828125, - "learning_rate": 4.689088677427249e-05, - "loss": 0.8561, - "step": 8610 - }, - { - "epoch": 10.655534941249227, - "grad_norm": 0.46484375, - "learning_rate": 4.676892512232758e-05, - "loss": 0.8619, - "step": 8615 - }, - { - "epoch": 10.661719233147805, - "grad_norm": 0.42578125, - "learning_rate": 4.6647073856550415e-05, - "loss": 0.8636, - "step": 8620 - }, - { - "epoch": 10.667903525046382, - "grad_norm": 0.48046875, - "learning_rate": 4.652533322962597e-05, - "loss": 0.8545, - "step": 8625 - }, - { - "epoch": 10.67408781694496, - "grad_norm": 0.484375, - "learning_rate": 4.6403703494009875e-05, - "loss": 0.8494, - "step": 8630 - }, - { - "epoch": 10.680272108843537, - "grad_norm": 0.44921875, - "learning_rate": 4.6282184901927674e-05, - "loss": 0.8562, - "step": 8635 - }, - { - "epoch": 10.686456400742115, - "grad_norm": 0.427734375, - "learning_rate": 4.6160777705374524e-05, - "loss": 0.8659, - "step": 8640 - }, - { - "epoch": 10.692640692640692, - "grad_norm": 0.46484375, - "learning_rate": 4.603948215611461e-05, - "loss": 0.8543, - "step": 8645 - }, - { - "epoch": 10.69882498453927, - "grad_norm": 0.44140625, - "learning_rate": 4.591829850568046e-05, - "loss": 0.8608, - "step": 8650 - }, - { - "epoch": 10.705009276437847, - "grad_norm": 0.427734375, - "learning_rate": 4.579722700537268e-05, - "loss": 0.8566, - "step": 8655 - }, - { - "epoch": 10.711193568336425, - "grad_norm": 0.4296875, - "learning_rate": 4.567626790625921e-05, - "loss": 0.8532, - "step": 8660 - }, - { - "epoch": 10.717377860235002, - "grad_norm": 0.427734375, - "learning_rate": 4.555542145917501e-05, - "loss": 0.8549, - "step": 8665 - }, - { - "epoch": 10.723562152133582, - "grad_norm": 0.453125, - "learning_rate": 4.543468791472131e-05, - "loss": 0.8586, - "step": 8670 - }, - { - "epoch": 10.729746444032159, - "grad_norm": 0.443359375, - "learning_rate": 4.5314067523265333e-05, - "loss": 0.8684, - "step": 8675 - }, - { - "epoch": 10.735930735930737, - "grad_norm": 0.42578125, - "learning_rate": 4.519356053493958e-05, - "loss": 0.8558, - "step": 8680 - }, - { - "epoch": 10.742115027829314, - "grad_norm": 0.412109375, - "learning_rate": 4.5073167199641367e-05, - "loss": 0.8604, - "step": 8685 - }, - { - "epoch": 10.748299319727892, - "grad_norm": 0.41796875, - "learning_rate": 4.495288776703241e-05, - "loss": 0.855, - "step": 8690 - }, - { - "epoch": 10.754483611626469, - "grad_norm": 0.439453125, - "learning_rate": 4.483272248653811e-05, - "loss": 0.8472, - "step": 8695 - }, - { - "epoch": 10.760667903525047, - "grad_norm": 0.44140625, - "learning_rate": 4.471267160734731e-05, - "loss": 0.8542, - "step": 8700 - }, - { - "epoch": 10.766852195423624, - "grad_norm": 0.4453125, - "learning_rate": 4.459273537841141e-05, - "loss": 0.8558, - "step": 8705 - }, - { - "epoch": 10.773036487322202, - "grad_norm": 0.44921875, - "learning_rate": 4.447291404844424e-05, - "loss": 0.849, - "step": 8710 - }, - { - "epoch": 10.779220779220779, - "grad_norm": 0.4296875, - "learning_rate": 4.43532078659213e-05, - "loss": 0.8582, - "step": 8715 - }, - { - "epoch": 10.785405071119357, - "grad_norm": 0.451171875, - "learning_rate": 4.4233617079079236e-05, - "loss": 0.8583, - "step": 8720 - }, - { - "epoch": 10.791589363017934, - "grad_norm": 0.466796875, - "learning_rate": 4.4114141935915534e-05, - "loss": 0.8683, - "step": 8725 - }, - { - "epoch": 10.797773654916512, - "grad_norm": 0.43359375, - "learning_rate": 4.399478268418771e-05, - "loss": 0.87, - "step": 8730 - }, - { - "epoch": 10.803957946815089, - "grad_norm": 0.4375, - "learning_rate": 4.3875539571413106e-05, - "loss": 0.8518, - "step": 8735 - }, - { - "epoch": 10.810142238713667, - "grad_norm": 0.4296875, - "learning_rate": 4.375641284486808e-05, - "loss": 0.8577, - "step": 8740 - }, - { - "epoch": 10.816326530612244, - "grad_norm": 0.4921875, - "learning_rate": 4.36374027515878e-05, - "loss": 0.8692, - "step": 8745 - }, - { - "epoch": 10.822510822510823, - "grad_norm": 0.451171875, - "learning_rate": 4.3518509538365425e-05, - "loss": 0.866, - "step": 8750 - }, - { - "epoch": 10.8286951144094, - "grad_norm": 0.462890625, - "learning_rate": 4.3399733451751776e-05, - "loss": 0.859, - "step": 8755 - }, - { - "epoch": 10.834879406307978, - "grad_norm": 0.458984375, - "learning_rate": 4.328107473805487e-05, - "loss": 0.8584, - "step": 8760 - }, - { - "epoch": 10.841063698206556, - "grad_norm": 0.478515625, - "learning_rate": 4.3162533643339185e-05, - "loss": 0.8585, - "step": 8765 - }, - { - "epoch": 10.847247990105133, - "grad_norm": 0.443359375, - "learning_rate": 4.3044110413425395e-05, - "loss": 0.867, - "step": 8770 - }, - { - "epoch": 10.85343228200371, - "grad_norm": 0.4375, - "learning_rate": 4.2925805293889786e-05, - "loss": 0.854, - "step": 8775 - }, - { - "epoch": 10.859616573902288, - "grad_norm": 0.42578125, - "learning_rate": 4.2807618530063565e-05, - "loss": 0.8525, - "step": 8780 - }, - { - "epoch": 10.865800865800866, - "grad_norm": 0.43359375, - "learning_rate": 4.268955036703267e-05, - "loss": 0.8495, - "step": 8785 - }, - { - "epoch": 10.871985157699443, - "grad_norm": 0.443359375, - "learning_rate": 4.257160104963696e-05, - "loss": 0.863, - "step": 8790 - }, - { - "epoch": 10.87816944959802, - "grad_norm": 0.4296875, - "learning_rate": 4.245377082246995e-05, - "loss": 0.8614, - "step": 8795 - }, - { - "epoch": 10.884353741496598, - "grad_norm": 0.4453125, - "learning_rate": 4.23360599298781e-05, - "loss": 0.8674, - "step": 8800 - }, - { - "epoch": 10.890538033395176, - "grad_norm": 0.435546875, - "learning_rate": 4.2218468615960484e-05, - "loss": 0.8588, - "step": 8805 - }, - { - "epoch": 10.896722325293753, - "grad_norm": 0.4609375, - "learning_rate": 4.210099712456822e-05, - "loss": 0.8665, - "step": 8810 - }, - { - "epoch": 10.90290661719233, - "grad_norm": 0.451171875, - "learning_rate": 4.1983645699303786e-05, - "loss": 0.8527, - "step": 8815 - }, - { - "epoch": 10.909090909090908, - "grad_norm": 0.431640625, - "learning_rate": 4.1866414583520877e-05, - "loss": 0.8574, - "step": 8820 - }, - { - "epoch": 10.915275200989488, - "grad_norm": 0.439453125, - "learning_rate": 4.174930402032354e-05, - "loss": 0.8598, - "step": 8825 - }, - { - "epoch": 10.921459492888065, - "grad_norm": 0.458984375, - "learning_rate": 4.163231425256595e-05, - "loss": 0.8656, - "step": 8830 - }, - { - "epoch": 10.927643784786643, - "grad_norm": 0.455078125, - "learning_rate": 4.1515445522851784e-05, - "loss": 0.8586, - "step": 8835 - }, - { - "epoch": 10.93382807668522, - "grad_norm": 0.453125, - "learning_rate": 4.139869807353357e-05, - "loss": 0.8615, - "step": 8840 - }, - { - "epoch": 10.940012368583798, - "grad_norm": 0.447265625, - "learning_rate": 4.128207214671255e-05, - "loss": 0.8655, - "step": 8845 - }, - { - "epoch": 10.946196660482375, - "grad_norm": 0.435546875, - "learning_rate": 4.1165567984237764e-05, - "loss": 0.8549, - "step": 8850 - }, - { - "epoch": 10.952380952380953, - "grad_norm": 0.443359375, - "learning_rate": 4.1049185827705904e-05, - "loss": 0.859, - "step": 8855 - }, - { - "epoch": 10.95856524427953, - "grad_norm": 0.4375, - "learning_rate": 4.0932925918460516e-05, - "loss": 0.8618, - "step": 8860 - }, - { - "epoch": 10.964749536178108, - "grad_norm": 0.453125, - "learning_rate": 4.081678849759181e-05, - "loss": 0.8544, - "step": 8865 - }, - { - "epoch": 10.970933828076685, - "grad_norm": 0.42578125, - "learning_rate": 4.070077380593579e-05, - "loss": 0.8579, - "step": 8870 - }, - { - "epoch": 10.977118119975263, - "grad_norm": 0.439453125, - "learning_rate": 4.058488208407415e-05, - "loss": 0.8697, - "step": 8875 - }, - { - "epoch": 10.98330241187384, - "grad_norm": 0.423828125, - "learning_rate": 4.046911357233343e-05, - "loss": 0.8579, - "step": 8880 - }, - { - "epoch": 10.989486703772418, - "grad_norm": 0.439453125, - "learning_rate": 4.035346851078471e-05, - "loss": 0.8545, - "step": 8885 - }, - { - "epoch": 10.995670995670995, - "grad_norm": 0.42578125, - "learning_rate": 4.02379471392431e-05, - "loss": 0.8555, - "step": 8890 - }, - { - "epoch": 10.999381570810142, - "eval_loss": 2.519637107849121, - "eval_runtime": 0.6376, - "eval_samples_per_second": 15.684, - "eval_steps_per_second": 1.568, - "step": 8893 - }, - { - "epoch": 11.001855287569573, - "grad_norm": 0.47265625, - "learning_rate": 4.0122549697267244e-05, - "loss": 0.8561, - "step": 8895 - }, - { - "epoch": 11.00803957946815, - "grad_norm": 0.41796875, - "learning_rate": 4.000727642415867e-05, - "loss": 0.8568, - "step": 8900 - }, - { - "epoch": 11.01422387136673, - "grad_norm": 0.470703125, - "learning_rate": 3.9892127558961546e-05, - "loss": 0.8552, - "step": 8905 - }, - { - "epoch": 11.020408163265307, - "grad_norm": 0.44921875, - "learning_rate": 3.977710334046193e-05, - "loss": 0.8449, - "step": 8910 - }, - { - "epoch": 11.026592455163884, - "grad_norm": 0.453125, - "learning_rate": 3.9662204007187534e-05, - "loss": 0.8568, - "step": 8915 - }, - { - "epoch": 11.032776747062462, - "grad_norm": 0.44921875, - "learning_rate": 3.954742979740695e-05, - "loss": 0.8545, - "step": 8920 - }, - { - "epoch": 11.03896103896104, - "grad_norm": 0.427734375, - "learning_rate": 3.943278094912946e-05, - "loss": 0.8591, - "step": 8925 - }, - { - "epoch": 11.045145330859617, - "grad_norm": 0.443359375, - "learning_rate": 3.9318257700104174e-05, - "loss": 0.8521, - "step": 8930 - }, - { - "epoch": 11.051329622758194, - "grad_norm": 0.43359375, - "learning_rate": 3.920386028781995e-05, - "loss": 0.8529, - "step": 8935 - }, - { - "epoch": 11.057513914656772, - "grad_norm": 0.466796875, - "learning_rate": 3.9089588949504655e-05, - "loss": 0.8505, - "step": 8940 - }, - { - "epoch": 11.06369820655535, - "grad_norm": 0.443359375, - "learning_rate": 3.897544392212453e-05, - "loss": 0.8584, - "step": 8945 - }, - { - "epoch": 11.069882498453927, - "grad_norm": 0.443359375, - "learning_rate": 3.8861425442384135e-05, - "loss": 0.8614, - "step": 8950 - }, - { - "epoch": 11.076066790352504, - "grad_norm": 0.4375, - "learning_rate": 3.874753374672542e-05, - "loss": 0.8564, - "step": 8955 - }, - { - "epoch": 11.082251082251082, - "grad_norm": 0.4375, - "learning_rate": 3.863376907132752e-05, - "loss": 0.8547, - "step": 8960 - }, - { - "epoch": 11.08843537414966, - "grad_norm": 0.431640625, - "learning_rate": 3.8520131652106186e-05, - "loss": 0.8568, - "step": 8965 - }, - { - "epoch": 11.094619666048237, - "grad_norm": 0.431640625, - "learning_rate": 3.840662172471315e-05, - "loss": 0.8578, - "step": 8970 - }, - { - "epoch": 11.100803957946814, - "grad_norm": 0.43359375, - "learning_rate": 3.8293239524535915e-05, - "loss": 0.8555, - "step": 8975 - }, - { - "epoch": 11.106988249845394, - "grad_norm": 0.4453125, - "learning_rate": 3.8179985286696986e-05, - "loss": 0.8503, - "step": 8980 - }, - { - "epoch": 11.113172541743971, - "grad_norm": 0.439453125, - "learning_rate": 3.806685924605361e-05, - "loss": 0.8591, - "step": 8985 - }, - { - "epoch": 11.119356833642549, - "grad_norm": 0.439453125, - "learning_rate": 3.7953861637197085e-05, - "loss": 0.8569, - "step": 8990 - }, - { - "epoch": 11.125541125541126, - "grad_norm": 0.431640625, - "learning_rate": 3.784099269445247e-05, - "loss": 0.8432, - "step": 8995 - }, - { - "epoch": 11.131725417439704, - "grad_norm": 0.435546875, - "learning_rate": 3.772825265187802e-05, - "loss": 0.8495, - "step": 9000 - }, - { - "epoch": 11.137909709338281, - "grad_norm": 0.419921875, - "learning_rate": 3.7615641743264586e-05, - "loss": 0.8622, - "step": 9005 - }, - { - "epoch": 11.144094001236859, - "grad_norm": 0.4296875, - "learning_rate": 3.75031602021353e-05, - "loss": 0.8522, - "step": 9010 - }, - { - "epoch": 11.150278293135436, - "grad_norm": 0.40625, - "learning_rate": 3.739080826174498e-05, - "loss": 0.863, - "step": 9015 - }, - { - "epoch": 11.156462585034014, - "grad_norm": 0.423828125, - "learning_rate": 3.727858615507974e-05, - "loss": 0.8574, - "step": 9020 - }, - { - "epoch": 11.162646876932591, - "grad_norm": 0.42578125, - "learning_rate": 3.716649411485649e-05, - "loss": 0.8516, - "step": 9025 - }, - { - "epoch": 11.168831168831169, - "grad_norm": 0.4296875, - "learning_rate": 3.705453237352227e-05, - "loss": 0.8488, - "step": 9030 - }, - { - "epoch": 11.175015460729746, - "grad_norm": 0.4453125, - "learning_rate": 3.694270116325409e-05, - "loss": 0.8661, - "step": 9035 - }, - { - "epoch": 11.181199752628324, - "grad_norm": 0.439453125, - "learning_rate": 3.683100071595813e-05, - "loss": 0.8578, - "step": 9040 - }, - { - "epoch": 11.187384044526901, - "grad_norm": 0.416015625, - "learning_rate": 3.6719431263269533e-05, - "loss": 0.8548, - "step": 9045 - }, - { - "epoch": 11.193568336425479, - "grad_norm": 0.427734375, - "learning_rate": 3.660799303655166e-05, - "loss": 0.8551, - "step": 9050 - }, - { - "epoch": 11.199752628324056, - "grad_norm": 0.44140625, - "learning_rate": 3.6496686266895874e-05, - "loss": 0.8744, - "step": 9055 - }, - { - "epoch": 11.205936920222635, - "grad_norm": 0.421875, - "learning_rate": 3.638551118512089e-05, - "loss": 0.8612, - "step": 9060 - }, - { - "epoch": 11.212121212121213, - "grad_norm": 0.421875, - "learning_rate": 3.6274468021772323e-05, - "loss": 0.8442, - "step": 9065 - }, - { - "epoch": 11.21830550401979, - "grad_norm": 0.435546875, - "learning_rate": 3.616355700712221e-05, - "loss": 0.8621, - "step": 9070 - }, - { - "epoch": 11.224489795918368, - "grad_norm": 0.419921875, - "learning_rate": 3.605277837116854e-05, - "loss": 0.8561, - "step": 9075 - }, - { - "epoch": 11.230674087816945, - "grad_norm": 0.439453125, - "learning_rate": 3.594213234363486e-05, - "loss": 0.8508, - "step": 9080 - }, - { - "epoch": 11.236858379715523, - "grad_norm": 0.40625, - "learning_rate": 3.583161915396971e-05, - "loss": 0.8477, - "step": 9085 - }, - { - "epoch": 11.2430426716141, - "grad_norm": 0.421875, - "learning_rate": 3.5721239031346066e-05, - "loss": 0.8554, - "step": 9090 - }, - { - "epoch": 11.249226963512678, - "grad_norm": 0.419921875, - "learning_rate": 3.561099220466111e-05, - "loss": 0.8456, - "step": 9095 - }, - { - "epoch": 11.255411255411255, - "grad_norm": 0.435546875, - "learning_rate": 3.550087890253544e-05, - "loss": 0.8625, - "step": 9100 - }, - { - "epoch": 11.261595547309833, - "grad_norm": 0.427734375, - "learning_rate": 3.539089935331294e-05, - "loss": 0.8504, - "step": 9105 - }, - { - "epoch": 11.26777983920841, - "grad_norm": 0.419921875, - "learning_rate": 3.5281053785059925e-05, - "loss": 0.8567, - "step": 9110 - }, - { - "epoch": 11.273964131106988, - "grad_norm": 0.439453125, - "learning_rate": 3.5171342425565055e-05, - "loss": 0.8728, - "step": 9115 - }, - { - "epoch": 11.280148423005565, - "grad_norm": 0.439453125, - "learning_rate": 3.506176550233863e-05, - "loss": 0.855, - "step": 9120 - }, - { - "epoch": 11.286332714904143, - "grad_norm": 0.443359375, - "learning_rate": 3.495232324261206e-05, - "loss": 0.8537, - "step": 9125 - }, - { - "epoch": 11.29251700680272, - "grad_norm": 0.42578125, - "learning_rate": 3.484301587333772e-05, - "loss": 0.8524, - "step": 9130 - }, - { - "epoch": 11.2987012987013, - "grad_norm": 0.44140625, - "learning_rate": 3.473384362118794e-05, - "loss": 0.8689, - "step": 9135 - }, - { - "epoch": 11.304885590599877, - "grad_norm": 0.435546875, - "learning_rate": 3.462480671255515e-05, - "loss": 0.8683, - "step": 9140 - }, - { - "epoch": 11.311069882498455, - "grad_norm": 0.447265625, - "learning_rate": 3.4515905373551016e-05, - "loss": 0.863, - "step": 9145 - }, - { - "epoch": 11.317254174397032, - "grad_norm": 0.443359375, - "learning_rate": 3.440713983000601e-05, - "loss": 0.8485, - "step": 9150 - }, - { - "epoch": 11.32343846629561, - "grad_norm": 0.41796875, - "learning_rate": 3.42985103074691e-05, - "loss": 0.8589, - "step": 9155 - }, - { - "epoch": 11.329622758194187, - "grad_norm": 0.43359375, - "learning_rate": 3.419001703120709e-05, - "loss": 0.8566, - "step": 9160 - }, - { - "epoch": 11.335807050092765, - "grad_norm": 0.42578125, - "learning_rate": 3.4081660226204357e-05, - "loss": 0.8617, - "step": 9165 - }, - { - "epoch": 11.341991341991342, - "grad_norm": 0.45703125, - "learning_rate": 3.397344011716216e-05, - "loss": 0.8579, - "step": 9170 - }, - { - "epoch": 11.34817563388992, - "grad_norm": 0.443359375, - "learning_rate": 3.386535692849838e-05, - "loss": 0.8511, - "step": 9175 - }, - { - "epoch": 11.354359925788497, - "grad_norm": 0.478515625, - "learning_rate": 3.3757410884346894e-05, - "loss": 0.8612, - "step": 9180 - }, - { - "epoch": 11.360544217687075, - "grad_norm": 0.43359375, - "learning_rate": 3.364960220855723e-05, - "loss": 0.8714, - "step": 9185 - }, - { - "epoch": 11.366728509585652, - "grad_norm": 0.447265625, - "learning_rate": 3.354193112469407e-05, - "loss": 0.8527, - "step": 9190 - }, - { - "epoch": 11.37291280148423, - "grad_norm": 0.43359375, - "learning_rate": 3.34343978560367e-05, - "loss": 0.859, - "step": 9195 - }, - { - "epoch": 11.379097093382807, - "grad_norm": 0.431640625, - "learning_rate": 3.332700262557864e-05, - "loss": 0.857, - "step": 9200 - }, - { - "epoch": 11.385281385281385, - "grad_norm": 0.416015625, - "learning_rate": 3.321974565602722e-05, - "loss": 0.861, - "step": 9205 - }, - { - "epoch": 11.391465677179962, - "grad_norm": 0.423828125, - "learning_rate": 3.3112627169802946e-05, - "loss": 0.8597, - "step": 9210 - }, - { - "epoch": 11.397649969078541, - "grad_norm": 0.435546875, - "learning_rate": 3.300564738903926e-05, - "loss": 0.8571, - "step": 9215 - }, - { - "epoch": 11.403834260977119, - "grad_norm": 0.44140625, - "learning_rate": 3.289880653558188e-05, - "loss": 0.8648, - "step": 9220 - }, - { - "epoch": 11.410018552875696, - "grad_norm": 0.431640625, - "learning_rate": 3.2792104830988515e-05, - "loss": 0.8598, - "step": 9225 - }, - { - "epoch": 11.416202844774274, - "grad_norm": 0.419921875, - "learning_rate": 3.2685542496528185e-05, - "loss": 0.8615, - "step": 9230 - }, - { - "epoch": 11.422387136672851, - "grad_norm": 0.44140625, - "learning_rate": 3.257911975318109e-05, - "loss": 0.8524, - "step": 9235 - }, - { - "epoch": 11.428571428571429, - "grad_norm": 0.466796875, - "learning_rate": 3.2472836821637744e-05, - "loss": 0.8651, - "step": 9240 - }, - { - "epoch": 11.434755720470006, - "grad_norm": 0.423828125, - "learning_rate": 3.236669392229888e-05, - "loss": 0.8592, - "step": 9245 - }, - { - "epoch": 11.440940012368584, - "grad_norm": 0.423828125, - "learning_rate": 3.2260691275274835e-05, - "loss": 0.8623, - "step": 9250 - }, - { - "epoch": 11.447124304267161, - "grad_norm": 0.4453125, - "learning_rate": 3.2154829100385e-05, - "loss": 0.8588, - "step": 9255 - }, - { - "epoch": 11.453308596165739, - "grad_norm": 0.443359375, - "learning_rate": 3.204910761715763e-05, - "loss": 0.8525, - "step": 9260 - }, - { - "epoch": 11.459492888064316, - "grad_norm": 0.423828125, - "learning_rate": 3.194352704482899e-05, - "loss": 0.8494, - "step": 9265 - }, - { - "epoch": 11.465677179962894, - "grad_norm": 0.42578125, - "learning_rate": 3.1838087602343344e-05, - "loss": 0.8531, - "step": 9270 - }, - { - "epoch": 11.471861471861471, - "grad_norm": 0.431640625, - "learning_rate": 3.173278950835227e-05, - "loss": 0.8532, - "step": 9275 - }, - { - "epoch": 11.478045763760049, - "grad_norm": 0.443359375, - "learning_rate": 3.162763298121408e-05, - "loss": 0.8603, - "step": 9280 - }, - { - "epoch": 11.484230055658626, - "grad_norm": 0.423828125, - "learning_rate": 3.1522618238993725e-05, - "loss": 0.8561, - "step": 9285 - }, - { - "epoch": 11.490414347557206, - "grad_norm": 0.42578125, - "learning_rate": 3.1417745499461934e-05, - "loss": 0.8618, - "step": 9290 - }, - { - "epoch": 11.496598639455783, - "grad_norm": 0.44921875, - "learning_rate": 3.131301498009514e-05, - "loss": 0.8649, - "step": 9295 - }, - { - "epoch": 11.50278293135436, - "grad_norm": 0.43359375, - "learning_rate": 3.120842689807468e-05, - "loss": 0.8618, - "step": 9300 - }, - { - "epoch": 11.508967223252938, - "grad_norm": 0.4296875, - "learning_rate": 3.110398147028666e-05, - "loss": 0.8535, - "step": 9305 - }, - { - "epoch": 11.515151515151516, - "grad_norm": 0.427734375, - "learning_rate": 3.099967891332132e-05, - "loss": 0.8454, - "step": 9310 - }, - { - "epoch": 11.521335807050093, - "grad_norm": 0.43359375, - "learning_rate": 3.089551944347255e-05, - "loss": 0.8551, - "step": 9315 - }, - { - "epoch": 11.52752009894867, - "grad_norm": 0.447265625, - "learning_rate": 3.079150327673766e-05, - "loss": 0.8603, - "step": 9320 - }, - { - "epoch": 11.533704390847248, - "grad_norm": 0.435546875, - "learning_rate": 3.0687630628816656e-05, - "loss": 0.8571, - "step": 9325 - }, - { - "epoch": 11.539888682745826, - "grad_norm": 0.431640625, - "learning_rate": 3.058390171511196e-05, - "loss": 0.8548, - "step": 9330 - }, - { - "epoch": 11.546072974644403, - "grad_norm": 0.4140625, - "learning_rate": 3.0480316750728012e-05, - "loss": 0.8556, - "step": 9335 - }, - { - "epoch": 11.55225726654298, - "grad_norm": 0.421875, - "learning_rate": 3.0376875950470617e-05, - "loss": 0.8581, - "step": 9340 - }, - { - "epoch": 11.558441558441558, - "grad_norm": 0.42578125, - "learning_rate": 3.0273579528846762e-05, - "loss": 0.8545, - "step": 9345 - }, - { - "epoch": 11.564625850340136, - "grad_norm": 0.431640625, - "learning_rate": 3.0170427700063873e-05, - "loss": 0.8601, - "step": 9350 - }, - { - "epoch": 11.570810142238713, - "grad_norm": 0.4296875, - "learning_rate": 3.0067420678029702e-05, - "loss": 0.8641, - "step": 9355 - }, - { - "epoch": 11.57699443413729, - "grad_norm": 0.41796875, - "learning_rate": 2.996455867635155e-05, - "loss": 0.8537, - "step": 9360 - }, - { - "epoch": 11.583178726035868, - "grad_norm": 0.419921875, - "learning_rate": 2.9861841908336098e-05, - "loss": 0.8596, - "step": 9365 - }, - { - "epoch": 11.589363017934446, - "grad_norm": 0.416015625, - "learning_rate": 2.9759270586988865e-05, - "loss": 0.8543, - "step": 9370 - }, - { - "epoch": 11.595547309833025, - "grad_norm": 0.43359375, - "learning_rate": 2.9656844925013637e-05, - "loss": 0.8673, - "step": 9375 - }, - { - "epoch": 11.601731601731602, - "grad_norm": 0.447265625, - "learning_rate": 2.9554565134812294e-05, - "loss": 0.8609, - "step": 9380 - }, - { - "epoch": 11.60791589363018, - "grad_norm": 0.431640625, - "learning_rate": 2.9452431428484062e-05, - "loss": 0.853, - "step": 9385 - }, - { - "epoch": 11.614100185528757, - "grad_norm": 0.431640625, - "learning_rate": 2.9350444017825385e-05, - "loss": 0.8552, - "step": 9390 - }, - { - "epoch": 11.620284477427335, - "grad_norm": 0.431640625, - "learning_rate": 2.9248603114329222e-05, - "loss": 0.8633, - "step": 9395 - }, - { - "epoch": 11.626468769325912, - "grad_norm": 0.439453125, - "learning_rate": 2.9146908929184713e-05, - "loss": 0.8658, - "step": 9400 - }, - { - "epoch": 11.63265306122449, - "grad_norm": 0.427734375, - "learning_rate": 2.9045361673276872e-05, - "loss": 0.8517, - "step": 9405 - }, - { - "epoch": 11.638837353123067, - "grad_norm": 0.4296875, - "learning_rate": 2.894396155718585e-05, - "loss": 0.8599, - "step": 9410 - }, - { - "epoch": 11.645021645021645, - "grad_norm": 0.435546875, - "learning_rate": 2.8842708791186835e-05, - "loss": 0.8648, - "step": 9415 - }, - { - "epoch": 11.651205936920222, - "grad_norm": 0.4140625, - "learning_rate": 2.874160358524931e-05, - "loss": 0.8556, - "step": 9420 - }, - { - "epoch": 11.6573902288188, - "grad_norm": 0.431640625, - "learning_rate": 2.8640646149036898e-05, - "loss": 0.8598, - "step": 9425 - }, - { - "epoch": 11.663574520717377, - "grad_norm": 0.46875, - "learning_rate": 2.853983669190664e-05, - "loss": 0.8665, - "step": 9430 - }, - { - "epoch": 11.669758812615955, - "grad_norm": 0.421875, - "learning_rate": 2.8439175422908824e-05, - "loss": 0.8515, - "step": 9435 - }, - { - "epoch": 11.675943104514532, - "grad_norm": 0.42578125, - "learning_rate": 2.8338662550786443e-05, - "loss": 0.8591, - "step": 9440 - }, - { - "epoch": 11.682127396413112, - "grad_norm": 0.416015625, - "learning_rate": 2.823829828397465e-05, - "loss": 0.8331, - "step": 9445 - }, - { - "epoch": 11.688311688311689, - "grad_norm": 0.44921875, - "learning_rate": 2.8138082830600554e-05, - "loss": 0.8499, - "step": 9450 - }, - { - "epoch": 11.694495980210267, - "grad_norm": 0.443359375, - "learning_rate": 2.8038016398482593e-05, - "loss": 0.857, - "step": 9455 - }, - { - "epoch": 11.700680272108844, - "grad_norm": 0.431640625, - "learning_rate": 2.7938099195130153e-05, - "loss": 0.8609, - "step": 9460 - }, - { - "epoch": 11.706864564007422, - "grad_norm": 0.44140625, - "learning_rate": 2.7838331427743282e-05, - "loss": 0.853, - "step": 9465 - }, - { - "epoch": 11.713048855905999, - "grad_norm": 0.435546875, - "learning_rate": 2.7738713303211982e-05, - "loss": 0.8501, - "step": 9470 - }, - { - "epoch": 11.719233147804577, - "grad_norm": 0.4375, - "learning_rate": 2.763924502811609e-05, - "loss": 0.8579, - "step": 9475 - }, - { - "epoch": 11.725417439703154, - "grad_norm": 0.4375, - "learning_rate": 2.753992680872457e-05, - "loss": 0.8647, - "step": 9480 - }, - { - "epoch": 11.731601731601732, - "grad_norm": 0.427734375, - "learning_rate": 2.7440758850995318e-05, - "loss": 0.8558, - "step": 9485 - }, - { - "epoch": 11.737786023500309, - "grad_norm": 0.423828125, - "learning_rate": 2.7341741360574548e-05, - "loss": 0.8538, - "step": 9490 - }, - { - "epoch": 11.743970315398887, - "grad_norm": 0.43359375, - "learning_rate": 2.7242874542796482e-05, - "loss": 0.8588, - "step": 9495 - }, - { - "epoch": 11.750154607297464, - "grad_norm": 0.451171875, - "learning_rate": 2.7144158602682924e-05, - "loss": 0.8642, - "step": 9500 - }, - { - "epoch": 11.756338899196042, - "grad_norm": 0.451171875, - "learning_rate": 2.704559374494272e-05, - "loss": 0.8658, - "step": 9505 - }, - { - "epoch": 11.762523191094619, - "grad_norm": 0.43359375, - "learning_rate": 2.6947180173971508e-05, - "loss": 0.8672, - "step": 9510 - }, - { - "epoch": 11.768707482993197, - "grad_norm": 0.41015625, - "learning_rate": 2.6848918093851104e-05, - "loss": 0.8565, - "step": 9515 - }, - { - "epoch": 11.774891774891774, - "grad_norm": 0.4140625, - "learning_rate": 2.6750807708349267e-05, - "loss": 0.8646, - "step": 9520 - }, - { - "epoch": 11.781076066790352, - "grad_norm": 0.423828125, - "learning_rate": 2.665284922091912e-05, - "loss": 0.8566, - "step": 9525 - }, - { - "epoch": 11.78726035868893, - "grad_norm": 0.431640625, - "learning_rate": 2.6555042834698773e-05, - "loss": 0.853, - "step": 9530 - }, - { - "epoch": 11.793444650587508, - "grad_norm": 0.42578125, - "learning_rate": 2.6457388752511025e-05, - "loss": 0.8578, - "step": 9535 - }, - { - "epoch": 11.799628942486086, - "grad_norm": 0.435546875, - "learning_rate": 2.6359887176862718e-05, - "loss": 0.8506, - "step": 9540 - }, - { - "epoch": 11.805813234384663, - "grad_norm": 0.431640625, - "learning_rate": 2.626253830994455e-05, - "loss": 0.8466, - "step": 9545 - }, - { - "epoch": 11.81199752628324, - "grad_norm": 0.455078125, - "learning_rate": 2.6165342353630428e-05, - "loss": 0.8668, - "step": 9550 - }, - { - "epoch": 11.818181818181818, - "grad_norm": 0.439453125, - "learning_rate": 2.6068299509477266e-05, - "loss": 0.8625, - "step": 9555 - }, - { - "epoch": 11.824366110080396, - "grad_norm": 0.43359375, - "learning_rate": 2.5971409978724458e-05, - "loss": 0.8743, - "step": 9560 - }, - { - "epoch": 11.830550401978973, - "grad_norm": 0.416015625, - "learning_rate": 2.5874673962293373e-05, - "loss": 0.8559, - "step": 9565 - }, - { - "epoch": 11.83673469387755, - "grad_norm": 0.419921875, - "learning_rate": 2.577809166078716e-05, - "loss": 0.8542, - "step": 9570 - }, - { - "epoch": 11.842918985776128, - "grad_norm": 0.419921875, - "learning_rate": 2.5681663274490107e-05, - "loss": 0.8476, - "step": 9575 - }, - { - "epoch": 11.849103277674706, - "grad_norm": 0.421875, - "learning_rate": 2.558538900336741e-05, - "loss": 0.8544, - "step": 9580 - }, - { - "epoch": 11.855287569573283, - "grad_norm": 0.416015625, - "learning_rate": 2.548926904706459e-05, - "loss": 0.8608, - "step": 9585 - }, - { - "epoch": 11.86147186147186, - "grad_norm": 0.46484375, - "learning_rate": 2.5393303604907205e-05, - "loss": 0.8591, - "step": 9590 - }, - { - "epoch": 11.867656153370438, - "grad_norm": 0.435546875, - "learning_rate": 2.529749287590042e-05, - "loss": 0.8516, - "step": 9595 - }, - { - "epoch": 11.873840445269018, - "grad_norm": 0.423828125, - "learning_rate": 2.5201837058728505e-05, - "loss": 0.8549, - "step": 9600 - }, - { - "epoch": 11.880024737167595, - "grad_norm": 0.443359375, - "learning_rate": 2.5106336351754578e-05, - "loss": 0.8498, - "step": 9605 - }, - { - "epoch": 11.886209029066173, - "grad_norm": 0.421875, - "learning_rate": 2.5010990953019975e-05, - "loss": 0.8661, - "step": 9610 - }, - { - "epoch": 11.89239332096475, - "grad_norm": 0.455078125, - "learning_rate": 2.4915801060244092e-05, - "loss": 0.8668, - "step": 9615 - }, - { - "epoch": 11.898577612863328, - "grad_norm": 0.392578125, - "learning_rate": 2.4820766870823807e-05, - "loss": 0.8547, - "step": 9620 - }, - { - "epoch": 11.904761904761905, - "grad_norm": 0.439453125, - "learning_rate": 2.4725888581833058e-05, - "loss": 0.852, - "step": 9625 - }, - { - "epoch": 11.910946196660483, - "grad_norm": 0.431640625, - "learning_rate": 2.4631166390022574e-05, - "loss": 0.8616, - "step": 9630 - }, - { - "epoch": 11.91713048855906, - "grad_norm": 0.43359375, - "learning_rate": 2.4536600491819318e-05, - "loss": 0.8677, - "step": 9635 - }, - { - "epoch": 11.923314780457638, - "grad_norm": 0.408203125, - "learning_rate": 2.4442191083326195e-05, - "loss": 0.8516, - "step": 9640 - }, - { - "epoch": 11.929499072356215, - "grad_norm": 0.42578125, - "learning_rate": 2.4347938360321566e-05, - "loss": 0.8636, - "step": 9645 - }, - { - "epoch": 11.935683364254793, - "grad_norm": 0.4296875, - "learning_rate": 2.425384251825882e-05, - "loss": 0.8504, - "step": 9650 - }, - { - "epoch": 11.94186765615337, - "grad_norm": 0.447265625, - "learning_rate": 2.4159903752266156e-05, - "loss": 0.8598, - "step": 9655 - }, - { - "epoch": 11.948051948051948, - "grad_norm": 0.4375, - "learning_rate": 2.4066122257145894e-05, - "loss": 0.8374, - "step": 9660 - }, - { - "epoch": 11.954236239950525, - "grad_norm": 0.423828125, - "learning_rate": 2.3972498227374342e-05, - "loss": 0.8554, - "step": 9665 - }, - { - "epoch": 11.960420531849103, - "grad_norm": 0.419921875, - "learning_rate": 2.387903185710115e-05, - "loss": 0.8525, - "step": 9670 - }, - { - "epoch": 11.96660482374768, - "grad_norm": 0.462890625, - "learning_rate": 2.3785723340149134e-05, - "loss": 0.8618, - "step": 9675 - }, - { - "epoch": 11.972789115646258, - "grad_norm": 0.43359375, - "learning_rate": 2.3692572870013718e-05, - "loss": 0.8645, - "step": 9680 - }, - { - "epoch": 11.978973407544837, - "grad_norm": 0.435546875, - "learning_rate": 2.359958063986256e-05, - "loss": 0.8472, - "step": 9685 - }, - { - "epoch": 11.985157699443414, - "grad_norm": 0.41796875, - "learning_rate": 2.3506746842535242e-05, - "loss": 0.8537, - "step": 9690 - }, - { - "epoch": 11.991341991341992, - "grad_norm": 0.431640625, - "learning_rate": 2.3414071670542703e-05, - "loss": 0.8494, - "step": 9695 - }, - { - "epoch": 11.99752628324057, - "grad_norm": 0.435546875, - "learning_rate": 2.3321555316067045e-05, - "loss": 0.8442, - "step": 9700 - }, - { - "epoch": 12.0, - "eval_loss": 2.5192575454711914, - "eval_runtime": 0.5399, - "eval_samples_per_second": 18.52, - "eval_steps_per_second": 1.852, - "step": 9702 - }, - { - "epoch": 12.003710575139147, - "grad_norm": 0.42578125, - "learning_rate": 2.3229197970960924e-05, - "loss": 0.8504, - "step": 9705 - }, - { - "epoch": 12.009894867037724, - "grad_norm": 0.40625, - "learning_rate": 2.313699982674736e-05, - "loss": 0.8496, - "step": 9710 - }, - { - "epoch": 12.016079158936302, - "grad_norm": 0.447265625, - "learning_rate": 2.3044961074619165e-05, - "loss": 0.8635, - "step": 9715 - }, - { - "epoch": 12.02226345083488, - "grad_norm": 0.46484375, - "learning_rate": 2.295308190543859e-05, - "loss": 0.8556, - "step": 9720 - }, - { - "epoch": 12.028447742733457, - "grad_norm": 0.43359375, - "learning_rate": 2.2861362509737072e-05, - "loss": 0.8588, - "step": 9725 - }, - { - "epoch": 12.034632034632034, - "grad_norm": 0.412109375, - "learning_rate": 2.276980307771458e-05, - "loss": 0.858, - "step": 9730 - }, - { - "epoch": 12.040816326530612, - "grad_norm": 0.416015625, - "learning_rate": 2.26784037992395e-05, - "loss": 0.848, - "step": 9735 - }, - { - "epoch": 12.04700061842919, - "grad_norm": 0.43359375, - "learning_rate": 2.2587164863847975e-05, - "loss": 0.8624, - "step": 9740 - }, - { - "epoch": 12.053184910327767, - "grad_norm": 0.4375, - "learning_rate": 2.249608646074375e-05, - "loss": 0.8623, - "step": 9745 - }, - { - "epoch": 12.059369202226344, - "grad_norm": 0.431640625, - "learning_rate": 2.2405168778797646e-05, - "loss": 0.8504, - "step": 9750 - }, - { - "epoch": 12.065553494124922, - "grad_norm": 0.478515625, - "learning_rate": 2.2314412006547125e-05, - "loss": 0.8403, - "step": 9755 - }, - { - "epoch": 12.071737786023501, - "grad_norm": 0.4453125, - "learning_rate": 2.222381633219608e-05, - "loss": 0.859, - "step": 9760 - }, - { - "epoch": 12.077922077922079, - "grad_norm": 0.416015625, - "learning_rate": 2.2133381943614207e-05, - "loss": 0.8639, - "step": 9765 - }, - { - "epoch": 12.084106369820656, - "grad_norm": 0.447265625, - "learning_rate": 2.204310902833685e-05, - "loss": 0.859, - "step": 9770 - }, - { - "epoch": 12.090290661719234, - "grad_norm": 0.451171875, - "learning_rate": 2.1952997773564467e-05, - "loss": 0.8598, - "step": 9775 - }, - { - "epoch": 12.096474953617811, - "grad_norm": 0.423828125, - "learning_rate": 2.1863048366162208e-05, - "loss": 0.8472, - "step": 9780 - }, - { - "epoch": 12.102659245516389, - "grad_norm": 0.435546875, - "learning_rate": 2.1773260992659728e-05, - "loss": 0.8557, - "step": 9785 - }, - { - "epoch": 12.108843537414966, - "grad_norm": 0.443359375, - "learning_rate": 2.1683635839250537e-05, - "loss": 0.8597, - "step": 9790 - }, - { - "epoch": 12.115027829313544, - "grad_norm": 0.41796875, - "learning_rate": 2.159417309179189e-05, - "loss": 0.8594, - "step": 9795 - }, - { - "epoch": 12.121212121212121, - "grad_norm": 0.43359375, - "learning_rate": 2.15048729358041e-05, - "loss": 0.859, - "step": 9800 - }, - { - "epoch": 12.127396413110699, - "grad_norm": 0.42578125, - "learning_rate": 2.141573555647042e-05, - "loss": 0.8495, - "step": 9805 - }, - { - "epoch": 12.133580705009276, - "grad_norm": 0.43359375, - "learning_rate": 2.1326761138636553e-05, - "loss": 0.8567, - "step": 9810 - }, - { - "epoch": 12.139764996907854, - "grad_norm": 0.46875, - "learning_rate": 2.123794986681017e-05, - "loss": 0.8475, - "step": 9815 - }, - { - "epoch": 12.145949288806431, - "grad_norm": 0.447265625, - "learning_rate": 2.114930192516076e-05, - "loss": 0.8616, - "step": 9820 - }, - { - "epoch": 12.152133580705009, - "grad_norm": 0.421875, - "learning_rate": 2.106081749751897e-05, - "loss": 0.8602, - "step": 9825 - }, - { - "epoch": 12.158317872603586, - "grad_norm": 0.427734375, - "learning_rate": 2.097249676737648e-05, - "loss": 0.8583, - "step": 9830 - }, - { - "epoch": 12.164502164502165, - "grad_norm": 0.431640625, - "learning_rate": 2.0884339917885433e-05, - "loss": 0.8533, - "step": 9835 - }, - { - "epoch": 12.170686456400743, - "grad_norm": 0.427734375, - "learning_rate": 2.0796347131858186e-05, - "loss": 0.8532, - "step": 9840 - }, - { - "epoch": 12.17687074829932, - "grad_norm": 0.42578125, - "learning_rate": 2.0708518591766825e-05, - "loss": 0.8622, - "step": 9845 - }, - { - "epoch": 12.183055040197898, - "grad_norm": 0.41015625, - "learning_rate": 2.0620854479742834e-05, - "loss": 0.8548, - "step": 9850 - }, - { - "epoch": 12.189239332096475, - "grad_norm": 0.427734375, - "learning_rate": 2.05333549775768e-05, - "loss": 0.8512, - "step": 9855 - }, - { - "epoch": 12.195423623995053, - "grad_norm": 0.4296875, - "learning_rate": 2.044602026671786e-05, - "loss": 0.8632, - "step": 9860 - }, - { - "epoch": 12.20160791589363, - "grad_norm": 0.427734375, - "learning_rate": 2.0358850528273455e-05, - "loss": 0.856, - "step": 9865 - }, - { - "epoch": 12.207792207792208, - "grad_norm": 0.419921875, - "learning_rate": 2.027184594300898e-05, - "loss": 0.856, - "step": 9870 - }, - { - "epoch": 12.213976499690785, - "grad_norm": 0.4375, - "learning_rate": 2.018500669134723e-05, - "loss": 0.8541, - "step": 9875 - }, - { - "epoch": 12.220160791589363, - "grad_norm": 0.42578125, - "learning_rate": 2.0098332953368272e-05, - "loss": 0.86, - "step": 9880 - }, - { - "epoch": 12.22634508348794, - "grad_norm": 0.421875, - "learning_rate": 2.0011824908808808e-05, - "loss": 0.8557, - "step": 9885 - }, - { - "epoch": 12.232529375386518, - "grad_norm": 0.4140625, - "learning_rate": 1.9925482737062085e-05, - "loss": 0.8758, - "step": 9890 - }, - { - "epoch": 12.238713667285095, - "grad_norm": 0.412109375, - "learning_rate": 1.9839306617177243e-05, - "loss": 0.8628, - "step": 9895 - }, - { - "epoch": 12.244897959183673, - "grad_norm": 0.40625, - "learning_rate": 1.9753296727859195e-05, - "loss": 0.854, - "step": 9900 - }, - { - "epoch": 12.25108225108225, - "grad_norm": 0.46484375, - "learning_rate": 1.966745324746806e-05, - "loss": 0.8571, - "step": 9905 - }, - { - "epoch": 12.257266542980828, - "grad_norm": 0.431640625, - "learning_rate": 1.9581776354018854e-05, - "loss": 0.8629, - "step": 9910 - }, - { - "epoch": 12.263450834879407, - "grad_norm": 0.43359375, - "learning_rate": 1.9496266225181248e-05, - "loss": 0.8587, - "step": 9915 - }, - { - "epoch": 12.269635126777985, - "grad_norm": 0.427734375, - "learning_rate": 1.941092303827896e-05, - "loss": 0.8526, - "step": 9920 - }, - { - "epoch": 12.275819418676562, - "grad_norm": 0.435546875, - "learning_rate": 1.9325746970289627e-05, - "loss": 0.8631, - "step": 9925 - }, - { - "epoch": 12.28200371057514, - "grad_norm": 0.41796875, - "learning_rate": 1.9240738197844278e-05, - "loss": 0.8637, - "step": 9930 - }, - { - "epoch": 12.288188002473717, - "grad_norm": 0.421875, - "learning_rate": 1.9155896897226988e-05, - "loss": 0.8544, - "step": 9935 - }, - { - "epoch": 12.294372294372295, - "grad_norm": 0.453125, - "learning_rate": 1.9071223244374614e-05, - "loss": 0.8676, - "step": 9940 - }, - { - "epoch": 12.300556586270872, - "grad_norm": 0.431640625, - "learning_rate": 1.89867174148763e-05, - "loss": 0.8686, - "step": 9945 - }, - { - "epoch": 12.30674087816945, - "grad_norm": 0.416015625, - "learning_rate": 1.8902379583973208e-05, - "loss": 0.8476, - "step": 9950 - }, - { - "epoch": 12.312925170068027, - "grad_norm": 0.4375, - "learning_rate": 1.8818209926558082e-05, - "loss": 0.8516, - "step": 9955 - }, - { - "epoch": 12.319109461966605, - "grad_norm": 0.43359375, - "learning_rate": 1.8734208617174988e-05, - "loss": 0.8567, - "step": 9960 - }, - { - "epoch": 12.325293753865182, - "grad_norm": 0.42578125, - "learning_rate": 1.8650375830018785e-05, - "loss": 0.8539, - "step": 9965 - }, - { - "epoch": 12.33147804576376, - "grad_norm": 0.42578125, - "learning_rate": 1.856671173893497e-05, - "loss": 0.8477, - "step": 9970 - }, - { - "epoch": 12.337662337662337, - "grad_norm": 0.431640625, - "learning_rate": 1.8483216517419142e-05, - "loss": 0.8546, - "step": 9975 - }, - { - "epoch": 12.343846629560915, - "grad_norm": 0.421875, - "learning_rate": 1.839989033861673e-05, - "loss": 0.8508, - "step": 9980 - }, - { - "epoch": 12.350030921459492, - "grad_norm": 0.42578125, - "learning_rate": 1.8316733375322637e-05, - "loss": 0.8376, - "step": 9985 - }, - { - "epoch": 12.35621521335807, - "grad_norm": 0.443359375, - "learning_rate": 1.8233745799980817e-05, - "loss": 0.8456, - "step": 9990 - }, - { - "epoch": 12.362399505256649, - "grad_norm": 0.4375, - "learning_rate": 1.8150927784684e-05, - "loss": 0.8578, - "step": 9995 - }, - { - "epoch": 12.368583797155226, - "grad_norm": 0.447265625, - "learning_rate": 1.8068279501173335e-05, - "loss": 0.8569, - "step": 10000 - }, - { - "epoch": 12.374768089053804, - "grad_norm": 0.4375, - "learning_rate": 1.7985801120837865e-05, - "loss": 0.8492, - "step": 10005 - }, - { - "epoch": 12.380952380952381, - "grad_norm": 0.447265625, - "learning_rate": 1.790349281471445e-05, - "loss": 0.8583, - "step": 10010 - }, - { - "epoch": 12.387136672850959, - "grad_norm": 0.43359375, - "learning_rate": 1.782135475348714e-05, - "loss": 0.8493, - "step": 10015 - }, - { - "epoch": 12.393320964749536, - "grad_norm": 0.4453125, - "learning_rate": 1.773938710748706e-05, - "loss": 0.8574, - "step": 10020 - }, - { - "epoch": 12.399505256648114, - "grad_norm": 0.43359375, - "learning_rate": 1.765759004669183e-05, - "loss": 0.8448, - "step": 10025 - }, - { - "epoch": 12.405689548546691, - "grad_norm": 0.41796875, - "learning_rate": 1.757596374072543e-05, - "loss": 0.8523, - "step": 10030 - }, - { - "epoch": 12.411873840445269, - "grad_norm": 0.443359375, - "learning_rate": 1.7494508358857677e-05, - "loss": 0.8663, - "step": 10035 - }, - { - "epoch": 12.418058132343846, - "grad_norm": 0.41796875, - "learning_rate": 1.741322407000391e-05, - "loss": 0.8499, - "step": 10040 - }, - { - "epoch": 12.424242424242424, - "grad_norm": 0.416015625, - "learning_rate": 1.7332111042724775e-05, - "loss": 0.8595, - "step": 10045 - }, - { - "epoch": 12.430426716141001, - "grad_norm": 0.427734375, - "learning_rate": 1.7251169445225657e-05, - "loss": 0.8644, - "step": 10050 - }, - { - "epoch": 12.436611008039579, - "grad_norm": 0.408203125, - "learning_rate": 1.7170399445356532e-05, - "loss": 0.8636, - "step": 10055 - }, - { - "epoch": 12.442795299938156, - "grad_norm": 0.42578125, - "learning_rate": 1.70898012106115e-05, - "loss": 0.8547, - "step": 10060 - }, - { - "epoch": 12.448979591836734, - "grad_norm": 0.4140625, - "learning_rate": 1.700937490812844e-05, - "loss": 0.8547, - "step": 10065 - }, - { - "epoch": 12.455163883735313, - "grad_norm": 0.41796875, - "learning_rate": 1.692912070468874e-05, - "loss": 0.8514, - "step": 10070 - }, - { - "epoch": 12.46134817563389, - "grad_norm": 0.431640625, - "learning_rate": 1.684903876671685e-05, - "loss": 0.849, - "step": 10075 - }, - { - "epoch": 12.467532467532468, - "grad_norm": 0.44140625, - "learning_rate": 1.676912926028007e-05, - "loss": 0.8497, - "step": 10080 - }, - { - "epoch": 12.473716759431046, - "grad_norm": 0.443359375, - "learning_rate": 1.668939235108802e-05, - "loss": 0.8668, - "step": 10085 - }, - { - "epoch": 12.479901051329623, - "grad_norm": 0.4140625, - "learning_rate": 1.660982820449247e-05, - "loss": 0.8532, - "step": 10090 - }, - { - "epoch": 12.4860853432282, - "grad_norm": 0.419921875, - "learning_rate": 1.6530436985486996e-05, - "loss": 0.8464, - "step": 10095 - }, - { - "epoch": 12.492269635126778, - "grad_norm": 0.427734375, - "learning_rate": 1.6451218858706374e-05, - "loss": 0.8589, - "step": 10100 - }, - { - "epoch": 12.498453927025356, - "grad_norm": 0.412109375, - "learning_rate": 1.637217398842663e-05, - "loss": 0.8584, - "step": 10105 - }, - { - "epoch": 12.504638218923933, - "grad_norm": 0.435546875, - "learning_rate": 1.6293302538564382e-05, - "loss": 0.8587, - "step": 10110 - }, - { - "epoch": 12.51082251082251, - "grad_norm": 0.40234375, - "learning_rate": 1.6214604672676704e-05, - "loss": 0.8626, - "step": 10115 - }, - { - "epoch": 12.517006802721088, - "grad_norm": 0.421875, - "learning_rate": 1.6136080553960687e-05, - "loss": 0.8634, - "step": 10120 - }, - { - "epoch": 12.523191094619666, - "grad_norm": 0.44140625, - "learning_rate": 1.6057730345253065e-05, - "loss": 0.8508, - "step": 10125 - }, - { - "epoch": 12.529375386518243, - "grad_norm": 0.4453125, - "learning_rate": 1.5979554209030024e-05, - "loss": 0.8541, - "step": 10130 - }, - { - "epoch": 12.53555967841682, - "grad_norm": 0.423828125, - "learning_rate": 1.5901552307406653e-05, - "loss": 0.8523, - "step": 10135 - }, - { - "epoch": 12.541743970315398, - "grad_norm": 0.416015625, - "learning_rate": 1.5823724802136865e-05, - "loss": 0.8674, - "step": 10140 - }, - { - "epoch": 12.547928262213976, - "grad_norm": 0.453125, - "learning_rate": 1.5746071854612797e-05, - "loss": 0.859, - "step": 10145 - }, - { - "epoch": 12.554112554112555, - "grad_norm": 0.4375, - "learning_rate": 1.5668593625864715e-05, - "loss": 0.8582, - "step": 10150 - }, - { - "epoch": 12.560296846011132, - "grad_norm": 0.427734375, - "learning_rate": 1.5591290276560466e-05, - "loss": 0.857, - "step": 10155 - }, - { - "epoch": 12.56648113790971, - "grad_norm": 0.408203125, - "learning_rate": 1.5514161967005337e-05, - "loss": 0.8556, - "step": 10160 - }, - { - "epoch": 12.572665429808287, - "grad_norm": 0.443359375, - "learning_rate": 1.543720885714157e-05, - "loss": 0.854, - "step": 10165 - }, - { - "epoch": 12.578849721706865, - "grad_norm": 0.435546875, - "learning_rate": 1.536043110654809e-05, - "loss": 0.8486, - "step": 10170 - }, - { - "epoch": 12.585034013605442, - "grad_norm": 0.4375, - "learning_rate": 1.528382887444022e-05, - "loss": 0.8531, - "step": 10175 - }, - { - "epoch": 12.59121830550402, - "grad_norm": 0.419921875, - "learning_rate": 1.5207402319669306e-05, - "loss": 0.8527, - "step": 10180 - }, - { - "epoch": 12.597402597402597, - "grad_norm": 0.423828125, - "learning_rate": 1.5131151600722337e-05, - "loss": 0.8534, - "step": 10185 - }, - { - "epoch": 12.603586889301175, - "grad_norm": 0.439453125, - "learning_rate": 1.505507687572173e-05, - "loss": 0.8519, - "step": 10190 - }, - { - "epoch": 12.609771181199752, - "grad_norm": 0.421875, - "learning_rate": 1.4979178302424867e-05, - "loss": 0.8637, - "step": 10195 - }, - { - "epoch": 12.61595547309833, - "grad_norm": 0.43359375, - "learning_rate": 1.4903456038223939e-05, - "loss": 0.8663, - "step": 10200 - }, - { - "epoch": 12.622139764996907, - "grad_norm": 0.4296875, - "learning_rate": 1.4827910240145426e-05, - "loss": 0.858, - "step": 10205 - }, - { - "epoch": 12.628324056895485, - "grad_norm": 0.4296875, - "learning_rate": 1.4752541064849946e-05, - "loss": 0.8537, - "step": 10210 - }, - { - "epoch": 12.634508348794062, - "grad_norm": 0.412109375, - "learning_rate": 1.4677348668631763e-05, - "loss": 0.8485, - "step": 10215 - }, - { - "epoch": 12.64069264069264, - "grad_norm": 0.412109375, - "learning_rate": 1.4602333207418651e-05, - "loss": 0.8529, - "step": 10220 - }, - { - "epoch": 12.64687693259122, - "grad_norm": 0.439453125, - "learning_rate": 1.4527494836771438e-05, - "loss": 0.8639, - "step": 10225 - }, - { - "epoch": 12.653061224489797, - "grad_norm": 0.451171875, - "learning_rate": 1.4452833711883628e-05, - "loss": 0.8531, - "step": 10230 - }, - { - "epoch": 12.659245516388374, - "grad_norm": 0.412109375, - "learning_rate": 1.4378349987581307e-05, - "loss": 0.8619, - "step": 10235 - }, - { - "epoch": 12.665429808286952, - "grad_norm": 0.42578125, - "learning_rate": 1.4304043818322565e-05, - "loss": 0.8556, - "step": 10240 - }, - { - "epoch": 12.67161410018553, - "grad_norm": 0.416015625, - "learning_rate": 1.4229915358197377e-05, - "loss": 0.8543, - "step": 10245 - }, - { - "epoch": 12.677798392084107, - "grad_norm": 0.427734375, - "learning_rate": 1.4155964760927176e-05, - "loss": 0.8553, - "step": 10250 - }, - { - "epoch": 12.683982683982684, - "grad_norm": 0.427734375, - "learning_rate": 1.4082192179864518e-05, - "loss": 0.8439, - "step": 10255 - }, - { - "epoch": 12.690166975881262, - "grad_norm": 0.421875, - "learning_rate": 1.4008597767992871e-05, - "loss": 0.8558, - "step": 10260 - }, - { - "epoch": 12.69635126777984, - "grad_norm": 0.447265625, - "learning_rate": 1.3935181677926156e-05, - "loss": 0.858, - "step": 10265 - }, - { - "epoch": 12.702535559678417, - "grad_norm": 0.4453125, - "learning_rate": 1.3861944061908583e-05, - "loss": 0.8561, - "step": 10270 - }, - { - "epoch": 12.708719851576994, - "grad_norm": 0.47265625, - "learning_rate": 1.3788885071814172e-05, - "loss": 0.8531, - "step": 10275 - }, - { - "epoch": 12.714904143475572, - "grad_norm": 0.44140625, - "learning_rate": 1.3716004859146592e-05, - "loss": 0.8465, - "step": 10280 - }, - { - "epoch": 12.72108843537415, - "grad_norm": 0.423828125, - "learning_rate": 1.3643303575038779e-05, - "loss": 0.8433, - "step": 10285 - }, - { - "epoch": 12.727272727272727, - "grad_norm": 0.4375, - "learning_rate": 1.3570781370252582e-05, - "loss": 0.8566, - "step": 10290 - }, - { - "epoch": 12.733457019171304, - "grad_norm": 0.4296875, - "learning_rate": 1.3498438395178492e-05, - "loss": 0.8578, - "step": 10295 - }, - { - "epoch": 12.739641311069882, - "grad_norm": 0.423828125, - "learning_rate": 1.3426274799835337e-05, - "loss": 0.8606, - "step": 10300 - }, - { - "epoch": 12.745825602968461, - "grad_norm": 0.44140625, - "learning_rate": 1.3354290733869979e-05, - "loss": 0.8597, - "step": 10305 - }, - { - "epoch": 12.752009894867038, - "grad_norm": 0.41796875, - "learning_rate": 1.328248634655701e-05, - "loss": 0.8595, - "step": 10310 - }, - { - "epoch": 12.758194186765616, - "grad_norm": 0.439453125, - "learning_rate": 1.3210861786798335e-05, - "loss": 0.8615, - "step": 10315 - }, - { - "epoch": 12.764378478664193, - "grad_norm": 0.453125, - "learning_rate": 1.3139417203123027e-05, - "loss": 0.8588, - "step": 10320 - }, - { - "epoch": 12.770562770562771, - "grad_norm": 0.43359375, - "learning_rate": 1.306815274368689e-05, - "loss": 0.8572, - "step": 10325 - }, - { - "epoch": 12.776747062461348, - "grad_norm": 0.423828125, - "learning_rate": 1.2997068556272263e-05, - "loss": 0.8521, - "step": 10330 - }, - { - "epoch": 12.782931354359926, - "grad_norm": 0.416015625, - "learning_rate": 1.2926164788287543e-05, - "loss": 0.852, - "step": 10335 - }, - { - "epoch": 12.789115646258503, - "grad_norm": 0.4453125, - "learning_rate": 1.2855441586767113e-05, - "loss": 0.8662, - "step": 10340 - }, - { - "epoch": 12.79529993815708, - "grad_norm": 0.44140625, - "learning_rate": 1.278489909837085e-05, - "loss": 0.8589, - "step": 10345 - }, - { - "epoch": 12.801484230055658, - "grad_norm": 0.423828125, - "learning_rate": 1.2714537469383858e-05, - "loss": 0.8578, - "step": 10350 - }, - { - "epoch": 12.807668521954236, - "grad_norm": 0.443359375, - "learning_rate": 1.2644356845716287e-05, - "loss": 0.8596, - "step": 10355 - }, - { - "epoch": 12.813852813852813, - "grad_norm": 0.46484375, - "learning_rate": 1.2574357372902767e-05, - "loss": 0.8585, - "step": 10360 - }, - { - "epoch": 12.82003710575139, - "grad_norm": 0.443359375, - "learning_rate": 1.2504539196102439e-05, - "loss": 0.8529, - "step": 10365 - }, - { - "epoch": 12.826221397649968, - "grad_norm": 0.41796875, - "learning_rate": 1.243490246009842e-05, - "loss": 0.8488, - "step": 10370 - }, - { - "epoch": 12.832405689548546, - "grad_norm": 0.447265625, - "learning_rate": 1.2365447309297529e-05, - "loss": 0.8562, - "step": 10375 - }, - { - "epoch": 12.838589981447125, - "grad_norm": 0.4375, - "learning_rate": 1.2296173887730123e-05, - "loss": 0.8609, - "step": 10380 - }, - { - "epoch": 12.844774273345703, - "grad_norm": 0.412109375, - "learning_rate": 1.2227082339049612e-05, - "loss": 0.862, - "step": 10385 - }, - { - "epoch": 12.85095856524428, - "grad_norm": 0.4296875, - "learning_rate": 1.215817280653232e-05, - "loss": 0.8521, - "step": 10390 - }, - { - "epoch": 12.857142857142858, - "grad_norm": 0.4375, - "learning_rate": 1.2089445433077073e-05, - "loss": 0.8468, - "step": 10395 - }, - { - "epoch": 12.863327149041435, - "grad_norm": 0.423828125, - "learning_rate": 1.2020900361204968e-05, - "loss": 0.855, - "step": 10400 - }, - { - "epoch": 12.869511440940013, - "grad_norm": 0.44140625, - "learning_rate": 1.19525377330591e-05, - "loss": 0.8517, - "step": 10405 - }, - { - "epoch": 12.87569573283859, - "grad_norm": 0.40625, - "learning_rate": 1.1884357690404158e-05, - "loss": 0.8631, - "step": 10410 - }, - { - "epoch": 12.881880024737168, - "grad_norm": 0.4296875, - "learning_rate": 1.1816360374626245e-05, - "loss": 0.8458, - "step": 10415 - }, - { - "epoch": 12.888064316635745, - "grad_norm": 0.41015625, - "learning_rate": 1.1748545926732535e-05, - "loss": 0.8588, - "step": 10420 - }, - { - "epoch": 12.894248608534323, - "grad_norm": 0.408203125, - "learning_rate": 1.1680914487350959e-05, - "loss": 0.8574, - "step": 10425 - }, - { - "epoch": 12.9004329004329, - "grad_norm": 0.4296875, - "learning_rate": 1.1613466196729984e-05, - "loss": 0.8485, - "step": 10430 - }, - { - "epoch": 12.906617192331478, - "grad_norm": 0.4296875, - "learning_rate": 1.1546201194738227e-05, - "loss": 0.8718, - "step": 10435 - }, - { - "epoch": 12.912801484230055, - "grad_norm": 0.4296875, - "learning_rate": 1.1479119620864276e-05, - "loss": 0.851, - "step": 10440 - }, - { - "epoch": 12.918985776128633, - "grad_norm": 0.419921875, - "learning_rate": 1.1412221614216278e-05, - "loss": 0.846, - "step": 10445 - }, - { - "epoch": 12.92517006802721, - "grad_norm": 0.416015625, - "learning_rate": 1.1345507313521786e-05, - "loss": 0.8575, - "step": 10450 - }, - { - "epoch": 12.931354359925788, - "grad_norm": 0.427734375, - "learning_rate": 1.1278976857127311e-05, - "loss": 0.8535, - "step": 10455 - }, - { - "epoch": 12.937538651824367, - "grad_norm": 0.443359375, - "learning_rate": 1.1212630382998213e-05, - "loss": 0.8522, - "step": 10460 - }, - { - "epoch": 12.943722943722944, - "grad_norm": 0.439453125, - "learning_rate": 1.1146468028718237e-05, - "loss": 0.8622, - "step": 10465 - }, - { - "epoch": 12.949907235621522, - "grad_norm": 0.43359375, - "learning_rate": 1.1080489931489391e-05, - "loss": 0.8574, - "step": 10470 - }, - { - "epoch": 12.9560915275201, - "grad_norm": 0.419921875, - "learning_rate": 1.101469622813157e-05, - "loss": 0.867, - "step": 10475 - }, - { - "epoch": 12.962275819418677, - "grad_norm": 0.439453125, - "learning_rate": 1.0949087055082252e-05, - "loss": 0.8508, - "step": 10480 - }, - { - "epoch": 12.968460111317254, - "grad_norm": 0.44140625, - "learning_rate": 1.0883662548396257e-05, - "loss": 0.8491, - "step": 10485 - }, - { - "epoch": 12.974644403215832, - "grad_norm": 0.4375, - "learning_rate": 1.0818422843745512e-05, - "loss": 0.8596, - "step": 10490 - }, - { - "epoch": 12.98082869511441, - "grad_norm": 0.431640625, - "learning_rate": 1.0753368076418647e-05, - "loss": 0.8611, - "step": 10495 - }, - { - "epoch": 12.987012987012987, - "grad_norm": 0.40625, - "learning_rate": 1.0688498381320855e-05, - "loss": 0.8466, - "step": 10500 - }, - { - "epoch": 12.993197278911564, - "grad_norm": 0.431640625, - "learning_rate": 1.0623813892973455e-05, - "loss": 0.8582, - "step": 10505 - }, - { - "epoch": 12.999381570810142, - "grad_norm": 0.44140625, - "learning_rate": 1.0559314745513805e-05, - "loss": 0.8621, - "step": 10510 - }, - { - "epoch": 12.999381570810142, - "eval_loss": 2.520648717880249, - "eval_runtime": 0.6408, - "eval_samples_per_second": 15.605, - "eval_steps_per_second": 1.56, - "step": 10510 - }, - { - "epoch": 13.00556586270872, - "grad_norm": 0.439453125, - "learning_rate": 1.049500107269481e-05, - "loss": 0.8552, - "step": 10515 - }, - { - "epoch": 13.011750154607297, - "grad_norm": 0.4296875, - "learning_rate": 1.0430873007884857e-05, - "loss": 0.8558, - "step": 10520 - }, - { - "epoch": 13.017934446505874, - "grad_norm": 0.419921875, - "learning_rate": 1.0366930684067333e-05, - "loss": 0.8587, - "step": 10525 - }, - { - "epoch": 13.024118738404452, - "grad_norm": 0.4375, - "learning_rate": 1.0303174233840528e-05, - "loss": 0.861, - "step": 10530 - }, - { - "epoch": 13.030303030303031, - "grad_norm": 0.427734375, - "learning_rate": 1.0239603789417274e-05, - "loss": 0.8619, - "step": 10535 - }, - { - "epoch": 13.036487322201609, - "grad_norm": 0.431640625, - "learning_rate": 1.0176219482624616e-05, - "loss": 0.8493, - "step": 10540 - }, - { - "epoch": 13.042671614100186, - "grad_norm": 0.4140625, - "learning_rate": 1.0113021444903726e-05, - "loss": 0.8582, - "step": 10545 - }, - { - "epoch": 13.048855905998764, - "grad_norm": 0.42578125, - "learning_rate": 1.0050009807309325e-05, - "loss": 0.8486, - "step": 10550 - }, - { - "epoch": 13.055040197897341, - "grad_norm": 0.421875, - "learning_rate": 9.987184700509755e-06, - "loss": 0.8461, - "step": 10555 - }, - { - "epoch": 13.061224489795919, - "grad_norm": 0.427734375, - "learning_rate": 9.924546254786493e-06, - "loss": 0.8626, - "step": 10560 - }, - { - "epoch": 13.067408781694496, - "grad_norm": 0.412109375, - "learning_rate": 9.862094600033912e-06, - "loss": 0.861, - "step": 10565 - }, - { - "epoch": 13.073593073593074, - "grad_norm": 0.416015625, - "learning_rate": 9.799829865759069e-06, - "loss": 0.8576, - "step": 10570 - }, - { - "epoch": 13.079777365491651, - "grad_norm": 0.423828125, - "learning_rate": 9.737752181081338e-06, - "loss": 0.8471, - "step": 10575 - }, - { - "epoch": 13.085961657390229, - "grad_norm": 0.423828125, - "learning_rate": 9.675861674732312e-06, - "loss": 0.8518, - "step": 10580 - }, - { - "epoch": 13.092145949288806, - "grad_norm": 0.42578125, - "learning_rate": 9.614158475055302e-06, - "loss": 0.859, - "step": 10585 - }, - { - "epoch": 13.098330241187384, - "grad_norm": 0.439453125, - "learning_rate": 9.552642710005299e-06, - "loss": 0.8624, - "step": 10590 - }, - { - "epoch": 13.104514533085961, - "grad_norm": 0.427734375, - "learning_rate": 9.491314507148597e-06, - "loss": 0.8538, - "step": 10595 - }, - { - "epoch": 13.110698824984539, - "grad_norm": 0.421875, - "learning_rate": 9.430173993662451e-06, - "loss": 0.8567, - "step": 10600 - }, - { - "epoch": 13.116883116883116, - "grad_norm": 0.42578125, - "learning_rate": 9.369221296335006e-06, - "loss": 0.8544, - "step": 10605 - }, - { - "epoch": 13.123067408781694, - "grad_norm": 0.412109375, - "learning_rate": 9.308456541564881e-06, - "loss": 0.8461, - "step": 10610 - }, - { - "epoch": 13.129251700680273, - "grad_norm": 0.408203125, - "learning_rate": 9.24787985536094e-06, - "loss": 0.8612, - "step": 10615 - }, - { - "epoch": 13.13543599257885, - "grad_norm": 0.4140625, - "learning_rate": 9.187491363342093e-06, - "loss": 0.8727, - "step": 10620 - }, - { - "epoch": 13.141620284477428, - "grad_norm": 0.42578125, - "learning_rate": 9.127291190736943e-06, - "loss": 0.8599, - "step": 10625 - }, - { - "epoch": 13.147804576376005, - "grad_norm": 0.408203125, - "learning_rate": 9.067279462383615e-06, - "loss": 0.8478, - "step": 10630 - }, - { - "epoch": 13.153988868274583, - "grad_norm": 0.427734375, - "learning_rate": 9.0074563027294e-06, - "loss": 0.8527, - "step": 10635 - }, - { - "epoch": 13.16017316017316, - "grad_norm": 0.4140625, - "learning_rate": 8.947821835830616e-06, - "loss": 0.8508, - "step": 10640 - }, - { - "epoch": 13.166357452071738, - "grad_norm": 0.447265625, - "learning_rate": 8.88837618535222e-06, - "loss": 0.8556, - "step": 10645 - }, - { - "epoch": 13.172541743970315, - "grad_norm": 0.44140625, - "learning_rate": 8.829119474567671e-06, - "loss": 0.8484, - "step": 10650 - }, - { - "epoch": 13.178726035868893, - "grad_norm": 0.4140625, - "learning_rate": 8.770051826358594e-06, - "loss": 0.8635, - "step": 10655 - }, - { - "epoch": 13.18491032776747, - "grad_norm": 0.44140625, - "learning_rate": 8.711173363214553e-06, - "loss": 0.849, - "step": 10660 - }, - { - "epoch": 13.191094619666048, - "grad_norm": 0.421875, - "learning_rate": 8.652484207232803e-06, - "loss": 0.855, - "step": 10665 - }, - { - "epoch": 13.197278911564625, - "grad_norm": 0.4296875, - "learning_rate": 8.593984480118011e-06, - "loss": 0.8637, - "step": 10670 - }, - { - "epoch": 13.203463203463203, - "grad_norm": 0.419921875, - "learning_rate": 8.535674303182061e-06, - "loss": 0.855, - "step": 10675 - }, - { - "epoch": 13.20964749536178, - "grad_norm": 0.427734375, - "learning_rate": 8.47755379734373e-06, - "loss": 0.8649, - "step": 10680 - }, - { - "epoch": 13.215831787260358, - "grad_norm": 0.42578125, - "learning_rate": 8.419623083128458e-06, - "loss": 0.8694, - "step": 10685 - }, - { - "epoch": 13.222016079158937, - "grad_norm": 0.431640625, - "learning_rate": 8.361882280668165e-06, - "loss": 0.8572, - "step": 10690 - }, - { - "epoch": 13.228200371057515, - "grad_norm": 0.416015625, - "learning_rate": 8.304331509700891e-06, - "loss": 0.8529, - "step": 10695 - }, - { - "epoch": 13.234384662956092, - "grad_norm": 0.439453125, - "learning_rate": 8.24697088957066e-06, - "loss": 0.863, - "step": 10700 - }, - { - "epoch": 13.24056895485467, - "grad_norm": 0.41015625, - "learning_rate": 8.189800539227111e-06, - "loss": 0.86, - "step": 10705 - }, - { - "epoch": 13.246753246753247, - "grad_norm": 0.4140625, - "learning_rate": 8.132820577225387e-06, - "loss": 0.8673, - "step": 10710 - }, - { - "epoch": 13.252937538651825, - "grad_norm": 0.44140625, - "learning_rate": 8.076031121725746e-06, - "loss": 0.8537, - "step": 10715 - }, - { - "epoch": 13.259121830550402, - "grad_norm": 0.421875, - "learning_rate": 8.019432290493457e-06, - "loss": 0.8516, - "step": 10720 - }, - { - "epoch": 13.26530612244898, - "grad_norm": 0.431640625, - "learning_rate": 7.963024200898462e-06, - "loss": 0.859, - "step": 10725 - }, - { - "epoch": 13.271490414347557, - "grad_norm": 0.41796875, - "learning_rate": 7.906806969915148e-06, - "loss": 0.8595, - "step": 10730 - }, - { - "epoch": 13.277674706246135, - "grad_norm": 0.431640625, - "learning_rate": 7.85078071412213e-06, - "loss": 0.8483, - "step": 10735 - }, - { - "epoch": 13.283858998144712, - "grad_norm": 0.439453125, - "learning_rate": 7.794945549701993e-06, - "loss": 0.8498, - "step": 10740 - }, - { - "epoch": 13.29004329004329, - "grad_norm": 0.423828125, - "learning_rate": 7.739301592441017e-06, - "loss": 0.859, - "step": 10745 - }, - { - "epoch": 13.296227581941867, - "grad_norm": 0.412109375, - "learning_rate": 7.683848957729056e-06, - "loss": 0.8533, - "step": 10750 - }, - { - "epoch": 13.302411873840445, - "grad_norm": 0.421875, - "learning_rate": 7.6285877605591135e-06, - "loss": 0.8526, - "step": 10755 - }, - { - "epoch": 13.308596165739022, - "grad_norm": 0.421875, - "learning_rate": 7.573518115527289e-06, - "loss": 0.8536, - "step": 10760 - }, - { - "epoch": 13.3147804576376, - "grad_norm": 0.435546875, - "learning_rate": 7.5186401368324e-06, - "loss": 0.8591, - "step": 10765 - }, - { - "epoch": 13.320964749536179, - "grad_norm": 0.41796875, - "learning_rate": 7.463953938275858e-06, - "loss": 0.8542, - "step": 10770 - }, - { - "epoch": 13.327149041434756, - "grad_norm": 0.412109375, - "learning_rate": 7.409459633261307e-06, - "loss": 0.8495, - "step": 10775 - }, - { - "epoch": 13.333333333333334, - "grad_norm": 0.435546875, - "learning_rate": 7.355157334794516e-06, - "loss": 0.8547, - "step": 10780 - }, - { - "epoch": 13.339517625231911, - "grad_norm": 0.431640625, - "learning_rate": 7.3010471554830766e-06, - "loss": 0.8581, - "step": 10785 - }, - { - "epoch": 13.345701917130489, - "grad_norm": 0.451171875, - "learning_rate": 7.247129207536152e-06, - "loss": 0.8583, - "step": 10790 - }, - { - "epoch": 13.351886209029066, - "grad_norm": 0.4375, - "learning_rate": 7.193403602764315e-06, - "loss": 0.8665, - "step": 10795 - }, - { - "epoch": 13.358070500927644, - "grad_norm": 0.443359375, - "learning_rate": 7.1398704525792e-06, - "loss": 0.8528, - "step": 10800 - }, - { - "epoch": 13.364254792826221, - "grad_norm": 0.431640625, - "learning_rate": 7.086529867993453e-06, - "loss": 0.8538, - "step": 10805 - }, - { - "epoch": 13.370439084724799, - "grad_norm": 0.41015625, - "learning_rate": 7.0333819596203e-06, - "loss": 0.8579, - "step": 10810 - }, - { - "epoch": 13.376623376623376, - "grad_norm": 0.435546875, - "learning_rate": 6.980426837673437e-06, - "loss": 0.8534, - "step": 10815 - }, - { - "epoch": 13.382807668521954, - "grad_norm": 0.458984375, - "learning_rate": 6.927664611966811e-06, - "loss": 0.8529, - "step": 10820 - }, - { - "epoch": 13.388991960420531, - "grad_norm": 0.412109375, - "learning_rate": 6.875095391914299e-06, - "loss": 0.8577, - "step": 10825 - }, - { - "epoch": 13.395176252319109, - "grad_norm": 0.4296875, - "learning_rate": 6.8227192865295995e-06, - "loss": 0.8588, - "step": 10830 - }, - { - "epoch": 13.401360544217686, - "grad_norm": 0.431640625, - "learning_rate": 6.770536404425887e-06, - "loss": 0.8484, - "step": 10835 - }, - { - "epoch": 13.407544836116264, - "grad_norm": 0.431640625, - "learning_rate": 6.718546853815688e-06, - "loss": 0.8584, - "step": 10840 - }, - { - "epoch": 13.413729128014843, - "grad_norm": 0.423828125, - "learning_rate": 6.666750742510619e-06, - "loss": 0.8484, - "step": 10845 - }, - { - "epoch": 13.41991341991342, - "grad_norm": 0.423828125, - "learning_rate": 6.6151481779211155e-06, - "loss": 0.8478, - "step": 10850 - }, - { - "epoch": 13.426097711811998, - "grad_norm": 0.421875, - "learning_rate": 6.56373926705629e-06, - "loss": 0.8502, - "step": 10855 - }, - { - "epoch": 13.432282003710576, - "grad_norm": 0.419921875, - "learning_rate": 6.512524116523633e-06, - "loss": 0.8595, - "step": 10860 - }, - { - "epoch": 13.438466295609153, - "grad_norm": 0.439453125, - "learning_rate": 6.461502832528887e-06, - "loss": 0.8675, - "step": 10865 - }, - { - "epoch": 13.44465058750773, - "grad_norm": 0.435546875, - "learning_rate": 6.410675520875742e-06, - "loss": 0.8641, - "step": 10870 - }, - { - "epoch": 13.450834879406308, - "grad_norm": 0.42578125, - "learning_rate": 6.360042286965595e-06, - "loss": 0.8618, - "step": 10875 - }, - { - "epoch": 13.457019171304886, - "grad_norm": 0.4609375, - "learning_rate": 6.30960323579749e-06, - "loss": 0.8606, - "step": 10880 - }, - { - "epoch": 13.463203463203463, - "grad_norm": 0.43359375, - "learning_rate": 6.2593584719676805e-06, - "loss": 0.8486, - "step": 10885 - }, - { - "epoch": 13.46938775510204, - "grad_norm": 0.431640625, - "learning_rate": 6.209308099669597e-06, - "loss": 0.855, - "step": 10890 - }, - { - "epoch": 13.475572047000618, - "grad_norm": 0.427734375, - "learning_rate": 6.159452222693507e-06, - "loss": 0.8553, - "step": 10895 - }, - { - "epoch": 13.481756338899196, - "grad_norm": 0.431640625, - "learning_rate": 6.109790944426397e-06, - "loss": 0.8564, - "step": 10900 - }, - { - "epoch": 13.487940630797773, - "grad_norm": 0.421875, - "learning_rate": 6.0603243678516995e-06, - "loss": 0.8541, - "step": 10905 - }, - { - "epoch": 13.49412492269635, - "grad_norm": 0.40234375, - "learning_rate": 6.011052595549038e-06, - "loss": 0.8528, - "step": 10910 - }, - { - "epoch": 13.500309214594928, - "grad_norm": 0.427734375, - "learning_rate": 5.961975729694158e-06, - "loss": 0.8538, - "step": 10915 - }, - { - "epoch": 13.506493506493506, - "grad_norm": 0.42578125, - "learning_rate": 5.913093872058528e-06, - "loss": 0.8546, - "step": 10920 - }, - { - "epoch": 13.512677798392085, - "grad_norm": 0.427734375, - "learning_rate": 5.864407124009297e-06, - "loss": 0.8573, - "step": 10925 - }, - { - "epoch": 13.518862090290662, - "grad_norm": 0.42578125, - "learning_rate": 5.81591558650898e-06, - "loss": 0.8529, - "step": 10930 - }, - { - "epoch": 13.52504638218924, - "grad_norm": 0.416015625, - "learning_rate": 5.767619360115295e-06, - "loss": 0.8528, - "step": 10935 - }, - { - "epoch": 13.531230674087817, - "grad_norm": 0.443359375, - "learning_rate": 5.719518544980929e-06, - "loss": 0.8482, - "step": 10940 - }, - { - "epoch": 13.537414965986395, - "grad_norm": 0.435546875, - "learning_rate": 5.6716132408533355e-06, - "loss": 0.852, - "step": 10945 - }, - { - "epoch": 13.543599257884972, - "grad_norm": 0.443359375, - "learning_rate": 5.623903547074549e-06, - "loss": 0.8517, - "step": 10950 - }, - { - "epoch": 13.54978354978355, - "grad_norm": 0.41796875, - "learning_rate": 5.5763895625809415e-06, - "loss": 0.8595, - "step": 10955 - }, - { - "epoch": 13.555967841682127, - "grad_norm": 0.451171875, - "learning_rate": 5.529071385903084e-06, - "loss": 0.8475, - "step": 10960 - }, - { - "epoch": 13.562152133580705, - "grad_norm": 0.416015625, - "learning_rate": 5.481949115165452e-06, - "loss": 0.8665, - "step": 10965 - }, - { - "epoch": 13.568336425479282, - "grad_norm": 0.443359375, - "learning_rate": 5.43502284808628e-06, - "loss": 0.8565, - "step": 10970 - }, - { - "epoch": 13.57452071737786, - "grad_norm": 0.435546875, - "learning_rate": 5.38829268197738e-06, - "loss": 0.8591, - "step": 10975 - }, - { - "epoch": 13.580705009276437, - "grad_norm": 0.447265625, - "learning_rate": 5.341758713743828e-06, - "loss": 0.8511, - "step": 10980 - }, - { - "epoch": 13.586889301175015, - "grad_norm": 0.443359375, - "learning_rate": 5.295421039883941e-06, - "loss": 0.8608, - "step": 10985 - }, - { - "epoch": 13.593073593073592, - "grad_norm": 0.4609375, - "learning_rate": 5.249279756488878e-06, - "loss": 0.8418, - "step": 10990 - }, - { - "epoch": 13.59925788497217, - "grad_norm": 0.431640625, - "learning_rate": 5.2033349592426335e-06, - "loss": 0.8517, - "step": 10995 - }, - { - "epoch": 13.60544217687075, - "grad_norm": 0.443359375, - "learning_rate": 5.157586743421672e-06, - "loss": 0.8641, - "step": 11000 - }, - { - "epoch": 13.611626468769327, - "grad_norm": 0.451171875, - "learning_rate": 5.112035203894827e-06, - "loss": 0.846, - "step": 11005 - }, - { - "epoch": 13.617810760667904, - "grad_norm": 0.439453125, - "learning_rate": 5.066680435123106e-06, - "loss": 0.8578, - "step": 11010 - }, - { - "epoch": 13.623995052566482, - "grad_norm": 0.447265625, - "learning_rate": 5.021522531159428e-06, - "loss": 0.8662, - "step": 11015 - }, - { - "epoch": 13.63017934446506, - "grad_norm": 0.435546875, - "learning_rate": 4.976561585648509e-06, - "loss": 0.8479, - "step": 11020 - }, - { - "epoch": 13.636363636363637, - "grad_norm": 0.443359375, - "learning_rate": 4.931797691826601e-06, - "loss": 0.8492, - "step": 11025 - }, - { - "epoch": 13.642547928262214, - "grad_norm": 0.423828125, - "learning_rate": 4.887230942521337e-06, - "loss": 0.8523, - "step": 11030 - }, - { - "epoch": 13.648732220160792, - "grad_norm": 0.44921875, - "learning_rate": 4.842861430151557e-06, - "loss": 0.8563, - "step": 11035 - }, - { - "epoch": 13.65491651205937, - "grad_norm": 0.416015625, - "learning_rate": 4.798689246727006e-06, - "loss": 0.8535, - "step": 11040 - }, - { - "epoch": 13.661100803957947, - "grad_norm": 0.416015625, - "learning_rate": 4.754714483848333e-06, - "loss": 0.8502, - "step": 11045 - }, - { - "epoch": 13.667285095856524, - "grad_norm": 0.4375, - "learning_rate": 4.710937232706691e-06, - "loss": 0.8562, - "step": 11050 - }, - { - "epoch": 13.673469387755102, - "grad_norm": 0.427734375, - "learning_rate": 4.667357584083721e-06, - "loss": 0.8523, - "step": 11055 - }, - { - "epoch": 13.67965367965368, - "grad_norm": 0.427734375, - "learning_rate": 4.623975628351273e-06, - "loss": 0.8623, - "step": 11060 - }, - { - "epoch": 13.685837971552257, - "grad_norm": 0.4140625, - "learning_rate": 4.5807914554712005e-06, - "loss": 0.8535, - "step": 11065 - }, - { - "epoch": 13.692022263450834, - "grad_norm": 0.4296875, - "learning_rate": 4.537805154995278e-06, - "loss": 0.8433, - "step": 11070 - }, - { - "epoch": 13.698206555349412, - "grad_norm": 0.419921875, - "learning_rate": 4.49501681606489e-06, - "loss": 0.8533, - "step": 11075 - }, - { - "epoch": 13.70439084724799, - "grad_norm": 0.419921875, - "learning_rate": 4.452426527410947e-06, - "loss": 0.851, - "step": 11080 - }, - { - "epoch": 13.710575139146568, - "grad_norm": 0.423828125, - "learning_rate": 4.4100343773536225e-06, - "loss": 0.8634, - "step": 11085 - }, - { - "epoch": 13.716759431045146, - "grad_norm": 0.431640625, - "learning_rate": 4.36784045380223e-06, - "loss": 0.8549, - "step": 11090 - }, - { - "epoch": 13.722943722943723, - "grad_norm": 0.435546875, - "learning_rate": 4.325844844255023e-06, - "loss": 0.8541, - "step": 11095 - }, - { - "epoch": 13.729128014842301, - "grad_norm": 0.42578125, - "learning_rate": 4.2840476357989825e-06, - "loss": 0.8611, - "step": 11100 - }, - { - "epoch": 13.735312306740878, - "grad_norm": 0.412109375, - "learning_rate": 4.242448915109698e-06, - "loss": 0.8532, - "step": 11105 - }, - { - "epoch": 13.741496598639456, - "grad_norm": 0.423828125, - "learning_rate": 4.20104876845111e-06, - "loss": 0.8538, - "step": 11110 - }, - { - "epoch": 13.747680890538033, - "grad_norm": 0.412109375, - "learning_rate": 4.159847281675411e-06, - "loss": 0.8525, - "step": 11115 - }, - { - "epoch": 13.753865182436611, - "grad_norm": 0.412109375, - "learning_rate": 4.118844540222788e-06, - "loss": 0.8455, - "step": 11120 - }, - { - "epoch": 13.760049474335188, - "grad_norm": 0.412109375, - "learning_rate": 4.078040629121327e-06, - "loss": 0.8592, - "step": 11125 - }, - { - "epoch": 13.766233766233766, - "grad_norm": 0.439453125, - "learning_rate": 4.037435632986786e-06, - "loss": 0.86, - "step": 11130 - }, - { - "epoch": 13.772418058132343, - "grad_norm": 0.439453125, - "learning_rate": 3.997029636022387e-06, - "loss": 0.8488, - "step": 11135 - }, - { - "epoch": 13.778602350030921, - "grad_norm": 0.4453125, - "learning_rate": 3.95682272201876e-06, - "loss": 0.8485, - "step": 11140 - }, - { - "epoch": 13.784786641929498, - "grad_norm": 0.40625, - "learning_rate": 3.916814974353633e-06, - "loss": 0.8525, - "step": 11145 - }, - { - "epoch": 13.790970933828076, - "grad_norm": 0.4296875, - "learning_rate": 3.877006475991729e-06, - "loss": 0.8634, - "step": 11150 - }, - { - "epoch": 13.797155225726655, - "grad_norm": 0.427734375, - "learning_rate": 3.837397309484636e-06, - "loss": 0.8615, - "step": 11155 - }, - { - "epoch": 13.803339517625233, - "grad_norm": 0.44140625, - "learning_rate": 3.797987556970495e-06, - "loss": 0.8533, - "step": 11160 - }, - { - "epoch": 13.80952380952381, - "grad_norm": 0.416015625, - "learning_rate": 3.75877730017401e-06, - "loss": 0.8519, - "step": 11165 - }, - { - "epoch": 13.815708101422388, - "grad_norm": 0.443359375, - "learning_rate": 3.7197666204060955e-06, - "loss": 0.8673, - "step": 11170 - }, - { - "epoch": 13.821892393320965, - "grad_norm": 0.4140625, - "learning_rate": 3.6809555985639068e-06, - "loss": 0.8561, - "step": 11175 - }, - { - "epoch": 13.828076685219543, - "grad_norm": 0.447265625, - "learning_rate": 3.6423443151304526e-06, - "loss": 0.8519, - "step": 11180 - }, - { - "epoch": 13.83426097711812, - "grad_norm": 0.4375, - "learning_rate": 3.6039328501746293e-06, - "loss": 0.8641, - "step": 11185 - }, - { - "epoch": 13.840445269016698, - "grad_norm": 0.41796875, - "learning_rate": 3.565721283350931e-06, - "loss": 0.8552, - "step": 11190 - }, - { - "epoch": 13.846629560915275, - "grad_norm": 0.419921875, - "learning_rate": 3.527709693899306e-06, - "loss": 0.8747, - "step": 11195 - }, - { - "epoch": 13.852813852813853, - "grad_norm": 0.427734375, - "learning_rate": 3.4898981606450333e-06, - "loss": 0.8507, - "step": 11200 - }, - { - "epoch": 13.85899814471243, - "grad_norm": 0.44921875, - "learning_rate": 3.452286761998491e-06, - "loss": 0.8636, - "step": 11205 - }, - { - "epoch": 13.865182436611008, - "grad_norm": 0.431640625, - "learning_rate": 3.414875575955101e-06, - "loss": 0.8498, - "step": 11210 - }, - { - "epoch": 13.871366728509585, - "grad_norm": 0.416015625, - "learning_rate": 3.3776646800950605e-06, - "loss": 0.8534, - "step": 11215 - }, - { - "epoch": 13.877551020408163, - "grad_norm": 0.427734375, - "learning_rate": 3.3406541515832003e-06, - "loss": 0.8435, - "step": 11220 - }, - { - "epoch": 13.88373531230674, - "grad_norm": 0.4296875, - "learning_rate": 3.303844067168904e-06, - "loss": 0.8524, - "step": 11225 - }, - { - "epoch": 13.889919604205318, - "grad_norm": 0.4375, - "learning_rate": 3.267234503185823e-06, - "loss": 0.8505, - "step": 11230 - }, - { - "epoch": 13.896103896103895, - "grad_norm": 0.4296875, - "learning_rate": 3.2308255355518403e-06, - "loss": 0.8557, - "step": 11235 - }, - { - "epoch": 13.902288188002474, - "grad_norm": 0.439453125, - "learning_rate": 3.1946172397688267e-06, - "loss": 0.8562, - "step": 11240 - }, - { - "epoch": 13.908472479901052, - "grad_norm": 0.427734375, - "learning_rate": 3.158609690922554e-06, - "loss": 0.8555, - "step": 11245 - }, - { - "epoch": 13.91465677179963, - "grad_norm": 0.416015625, - "learning_rate": 3.1228029636824475e-06, - "loss": 0.8618, - "step": 11250 - }, - { - "epoch": 13.920841063698207, - "grad_norm": 0.416015625, - "learning_rate": 3.0871971323015336e-06, - "loss": 0.8604, - "step": 11255 - }, - { - "epoch": 13.927025355596784, - "grad_norm": 0.41015625, - "learning_rate": 3.051792270616216e-06, - "loss": 0.8598, - "step": 11260 - }, - { - "epoch": 13.933209647495362, - "grad_norm": 0.419921875, - "learning_rate": 3.0165884520461316e-06, - "loss": 0.8531, - "step": 11265 - }, - { - "epoch": 13.93939393939394, - "grad_norm": 0.43359375, - "learning_rate": 2.981585749594051e-06, - "loss": 0.8508, - "step": 11270 - }, - { - "epoch": 13.945578231292517, - "grad_norm": 0.439453125, - "learning_rate": 2.9467842358456345e-06, - "loss": 0.8466, - "step": 11275 - }, - { - "epoch": 13.951762523191094, - "grad_norm": 0.44140625, - "learning_rate": 2.912183982969385e-06, - "loss": 0.8548, - "step": 11280 - }, - { - "epoch": 13.957946815089672, - "grad_norm": 0.43359375, - "learning_rate": 2.8777850627164205e-06, - "loss": 0.8616, - "step": 11285 - }, - { - "epoch": 13.96413110698825, - "grad_norm": 0.41796875, - "learning_rate": 2.8435875464203343e-06, - "loss": 0.8578, - "step": 11290 - }, - { - "epoch": 13.970315398886827, - "grad_norm": 0.423828125, - "learning_rate": 2.809591504997111e-06, - "loss": 0.8674, - "step": 11295 - }, - { - "epoch": 13.976499690785404, - "grad_norm": 0.42578125, - "learning_rate": 2.7757970089449024e-06, - "loss": 0.8573, - "step": 11300 - }, - { - "epoch": 13.982683982683982, - "grad_norm": 0.44921875, - "learning_rate": 2.742204128343917e-06, - "loss": 0.8582, - "step": 11305 - }, - { - "epoch": 13.988868274582561, - "grad_norm": 0.423828125, - "learning_rate": 2.708812932856253e-06, - "loss": 0.8554, - "step": 11310 - }, - { - "epoch": 13.995052566481139, - "grad_norm": 0.416015625, - "learning_rate": 2.6756234917258205e-06, - "loss": 0.8586, - "step": 11315 - }, - { - "epoch": 14.0, - "eval_loss": 2.5210444927215576, - "eval_runtime": 0.5394, - "eval_samples_per_second": 18.54, - "eval_steps_per_second": 1.854, - "step": 11319 - }, - { - "epoch": 14.001236858379716, - "grad_norm": 0.43359375, - "learning_rate": 2.6426358737781098e-06, - "loss": 0.8497, - "step": 11320 - }, - { - "epoch": 14.007421150278294, - "grad_norm": 0.44921875, - "learning_rate": 2.6098501474200787e-06, - "loss": 0.8456, - "step": 11325 - }, - { - "epoch": 14.013605442176871, - "grad_norm": 0.4375, - "learning_rate": 2.577266380640053e-06, - "loss": 0.8513, - "step": 11330 - }, - { - "epoch": 14.019789734075449, - "grad_norm": 0.439453125, - "learning_rate": 2.5448846410075166e-06, - "loss": 0.8512, - "step": 11335 - }, - { - "epoch": 14.025974025974026, - "grad_norm": 0.423828125, - "learning_rate": 2.5127049956730207e-06, - "loss": 0.8564, - "step": 11340 - }, - { - "epoch": 14.032158317872604, - "grad_norm": 0.421875, - "learning_rate": 2.480727511368064e-06, - "loss": 0.8522, - "step": 11345 - }, - { - "epoch": 14.038342609771181, - "grad_norm": 0.431640625, - "learning_rate": 2.448952254404846e-06, - "loss": 0.8645, - "step": 11350 - }, - { - "epoch": 14.044526901669759, - "grad_norm": 0.4296875, - "learning_rate": 2.4173792906762804e-06, - "loss": 0.8509, - "step": 11355 - }, - { - "epoch": 14.050711193568336, - "grad_norm": 0.42578125, - "learning_rate": 2.3860086856557383e-06, - "loss": 0.857, - "step": 11360 - }, - { - "epoch": 14.056895485466914, - "grad_norm": 0.419921875, - "learning_rate": 2.35484050439696e-06, - "loss": 0.8457, - "step": 11365 - }, - { - "epoch": 14.063079777365491, - "grad_norm": 0.419921875, - "learning_rate": 2.3238748115339324e-06, - "loss": 0.8502, - "step": 11370 - }, - { - "epoch": 14.069264069264069, - "grad_norm": 0.421875, - "learning_rate": 2.293111671280712e-06, - "loss": 0.8608, - "step": 11375 - }, - { - "epoch": 14.075448361162646, - "grad_norm": 0.4375, - "learning_rate": 2.2625511474313685e-06, - "loss": 0.851, - "step": 11380 - }, - { - "epoch": 14.081632653061224, - "grad_norm": 0.439453125, - "learning_rate": 2.232193303359742e-06, - "loss": 0.8599, - "step": 11385 - }, - { - "epoch": 14.087816944959803, - "grad_norm": 0.431640625, - "learning_rate": 2.2020382020194074e-06, - "loss": 0.8584, - "step": 11390 - }, - { - "epoch": 14.09400123685838, - "grad_norm": 0.42578125, - "learning_rate": 2.1720859059434993e-06, - "loss": 0.8594, - "step": 11395 - }, - { - "epoch": 14.100185528756958, - "grad_norm": 0.447265625, - "learning_rate": 2.1423364772445887e-06, - "loss": 0.8604, - "step": 11400 - }, - { - "epoch": 14.106369820655535, - "grad_norm": 0.412109375, - "learning_rate": 2.112789977614582e-06, - "loss": 0.8407, - "step": 11405 - }, - { - "epoch": 14.112554112554113, - "grad_norm": 0.419921875, - "learning_rate": 2.0834464683245346e-06, - "loss": 0.8531, - "step": 11410 - }, - { - "epoch": 14.11873840445269, - "grad_norm": 0.419921875, - "learning_rate": 2.0543060102245717e-06, - "loss": 0.8524, - "step": 11415 - }, - { - "epoch": 14.124922696351268, - "grad_norm": 0.4296875, - "learning_rate": 2.025368663743743e-06, - "loss": 0.8522, - "step": 11420 - }, - { - "epoch": 14.131106988249845, - "grad_norm": 0.4140625, - "learning_rate": 1.9966344888899147e-06, - "loss": 0.8648, - "step": 11425 - }, - { - "epoch": 14.137291280148423, - "grad_norm": 0.447265625, - "learning_rate": 1.968103545249611e-06, - "loss": 0.8546, - "step": 11430 - }, - { - "epoch": 14.143475572047, - "grad_norm": 0.4296875, - "learning_rate": 1.9397758919879495e-06, - "loss": 0.8616, - "step": 11435 - }, - { - "epoch": 14.149659863945578, - "grad_norm": 0.408203125, - "learning_rate": 1.91165158784844e-06, - "loss": 0.8539, - "step": 11440 - }, - { - "epoch": 14.155844155844155, - "grad_norm": 0.421875, - "learning_rate": 1.8837306911529184e-06, - "loss": 0.8599, - "step": 11445 - }, - { - "epoch": 14.162028447742733, - "grad_norm": 0.423828125, - "learning_rate": 1.8560132598014368e-06, - "loss": 0.8608, - "step": 11450 - }, - { - "epoch": 14.16821273964131, - "grad_norm": 0.419921875, - "learning_rate": 1.8284993512720505e-06, - "loss": 0.8558, - "step": 11455 - }, - { - "epoch": 14.174397031539888, - "grad_norm": 0.4375, - "learning_rate": 1.8011890226208527e-06, - "loss": 0.8564, - "step": 11460 - }, - { - "epoch": 14.180581323438465, - "grad_norm": 0.42578125, - "learning_rate": 1.7740823304817188e-06, - "loss": 0.8581, - "step": 11465 - }, - { - "epoch": 14.186765615337045, - "grad_norm": 0.41796875, - "learning_rate": 1.7471793310662287e-06, - "loss": 0.866, - "step": 11470 - }, - { - "epoch": 14.192949907235622, - "grad_norm": 0.4296875, - "learning_rate": 1.7204800801636e-06, - "loss": 0.8628, - "step": 11475 - }, - { - "epoch": 14.1991341991342, - "grad_norm": 0.419921875, - "learning_rate": 1.6939846331405108e-06, - "loss": 0.8594, - "step": 11480 - }, - { - "epoch": 14.205318491032777, - "grad_norm": 0.41796875, - "learning_rate": 1.6676930449410099e-06, - "loss": 0.855, - "step": 11485 - }, - { - "epoch": 14.211502782931355, - "grad_norm": 0.41796875, - "learning_rate": 1.6416053700863964e-06, - "loss": 0.8688, - "step": 11490 - }, - { - "epoch": 14.217687074829932, - "grad_norm": 0.42578125, - "learning_rate": 1.6157216626751292e-06, - "loss": 0.8607, - "step": 11495 - }, - { - "epoch": 14.22387136672851, - "grad_norm": 0.41796875, - "learning_rate": 1.5900419763826614e-06, - "loss": 0.8577, - "step": 11500 - }, - { - "epoch": 14.230055658627087, - "grad_norm": 0.427734375, - "learning_rate": 1.5645663644614172e-06, - "loss": 0.8483, - "step": 11505 - }, - { - "epoch": 14.236239950525665, - "grad_norm": 0.435546875, - "learning_rate": 1.5392948797405827e-06, - "loss": 0.8501, - "step": 11510 - }, - { - "epoch": 14.242424242424242, - "grad_norm": 0.427734375, - "learning_rate": 1.5142275746260593e-06, - "loss": 0.8419, - "step": 11515 - }, - { - "epoch": 14.24860853432282, - "grad_norm": 0.431640625, - "learning_rate": 1.489364501100332e-06, - "loss": 0.8652, - "step": 11520 - }, - { - "epoch": 14.254792826221397, - "grad_norm": 0.41796875, - "learning_rate": 1.4647057107223583e-06, - "loss": 0.8626, - "step": 11525 - }, - { - "epoch": 14.260977118119975, - "grad_norm": 0.43359375, - "learning_rate": 1.4402512546275114e-06, - "loss": 0.8516, - "step": 11530 - }, - { - "epoch": 14.267161410018552, - "grad_norm": 0.423828125, - "learning_rate": 1.4160011835273934e-06, - "loss": 0.8496, - "step": 11535 - }, - { - "epoch": 14.27334570191713, - "grad_norm": 0.439453125, - "learning_rate": 1.3919555477097668e-06, - "loss": 0.8558, - "step": 11540 - }, - { - "epoch": 14.279529993815707, - "grad_norm": 0.421875, - "learning_rate": 1.3681143970385003e-06, - "loss": 0.8568, - "step": 11545 - }, - { - "epoch": 14.285714285714286, - "grad_norm": 0.455078125, - "learning_rate": 1.344477780953346e-06, - "loss": 0.8614, - "step": 11550 - }, - { - "epoch": 14.291898577612864, - "grad_norm": 0.4375, - "learning_rate": 1.3210457484699733e-06, - "loss": 0.8518, - "step": 11555 - }, - { - "epoch": 14.298082869511441, - "grad_norm": 0.4375, - "learning_rate": 1.2978183481797801e-06, - "loss": 0.8459, - "step": 11560 - }, - { - "epoch": 14.304267161410019, - "grad_norm": 0.451171875, - "learning_rate": 1.274795628249792e-06, - "loss": 0.8547, - "step": 11565 - }, - { - "epoch": 14.310451453308596, - "grad_norm": 0.4375, - "learning_rate": 1.251977636422641e-06, - "loss": 0.8592, - "step": 11570 - }, - { - "epoch": 14.316635745207174, - "grad_norm": 0.41796875, - "learning_rate": 1.2293644200163544e-06, - "loss": 0.8447, - "step": 11575 - }, - { - "epoch": 14.322820037105751, - "grad_norm": 0.416015625, - "learning_rate": 1.2069560259243328e-06, - "loss": 0.8548, - "step": 11580 - }, - { - "epoch": 14.329004329004329, - "grad_norm": 0.44140625, - "learning_rate": 1.1847525006152493e-06, - "loss": 0.8512, - "step": 11585 - }, - { - "epoch": 14.335188620902906, - "grad_norm": 0.421875, - "learning_rate": 1.1627538901329172e-06, - "loss": 0.856, - "step": 11590 - }, - { - "epoch": 14.341372912801484, - "grad_norm": 0.43359375, - "learning_rate": 1.1409602400962227e-06, - "loss": 0.8518, - "step": 11595 - }, - { - "epoch": 14.347557204700061, - "grad_norm": 0.41796875, - "learning_rate": 1.1193715956990258e-06, - "loss": 0.8417, - "step": 11600 - }, - { - "epoch": 14.353741496598639, - "grad_norm": 0.42578125, - "learning_rate": 1.0979880017100596e-06, - "loss": 0.861, - "step": 11605 - }, - { - "epoch": 14.359925788497216, - "grad_norm": 0.435546875, - "learning_rate": 1.076809502472831e-06, - "loss": 0.8562, - "step": 11610 - }, - { - "epoch": 14.366110080395794, - "grad_norm": 0.5078125, - "learning_rate": 1.055836141905553e-06, - "loss": 0.8512, - "step": 11615 - }, - { - "epoch": 14.372294372294371, - "grad_norm": 0.421875, - "learning_rate": 1.035067963501024e-06, - "loss": 0.873, - "step": 11620 - }, - { - "epoch": 14.37847866419295, - "grad_norm": 0.423828125, - "learning_rate": 1.014505010326583e-06, - "loss": 0.8564, - "step": 11625 - }, - { - "epoch": 14.384662956091528, - "grad_norm": 0.43359375, - "learning_rate": 9.94147325023953e-07, - "loss": 0.8537, - "step": 11630 - }, - { - "epoch": 14.390847247990106, - "grad_norm": 0.421875, - "learning_rate": 9.739949498091982e-07, - "loss": 0.8618, - "step": 11635 - }, - { - "epoch": 14.397031539888683, - "grad_norm": 0.423828125, - "learning_rate": 9.540479264726676e-07, - "loss": 0.8567, - "step": 11640 - }, - { - "epoch": 14.40321583178726, - "grad_norm": 0.439453125, - "learning_rate": 9.343062963787952e-07, - "loss": 0.8583, - "step": 11645 - }, - { - "epoch": 14.409400123685838, - "grad_norm": 0.427734375, - "learning_rate": 9.147701004661446e-07, - "loss": 0.8569, - "step": 11650 - }, - { - "epoch": 14.415584415584416, - "grad_norm": 0.427734375, - "learning_rate": 8.954393792472649e-07, - "loss": 0.8431, - "step": 11655 - }, - { - "epoch": 14.421768707482993, - "grad_norm": 0.419921875, - "learning_rate": 8.763141728085789e-07, - "loss": 0.856, - "step": 11660 - }, - { - "epoch": 14.42795299938157, - "grad_norm": 0.41796875, - "learning_rate": 8.573945208103618e-07, - "loss": 0.8554, - "step": 11665 - }, - { - "epoch": 14.434137291280148, - "grad_norm": 0.43359375, - "learning_rate": 8.386804624865851e-07, - "loss": 0.8636, - "step": 11670 - }, - { - "epoch": 14.440321583178726, - "grad_norm": 0.42578125, - "learning_rate": 8.201720366449283e-07, - "loss": 0.8605, - "step": 11675 - }, - { - "epoch": 14.446505875077303, - "grad_norm": 0.43359375, - "learning_rate": 8.018692816666118e-07, - "loss": 0.8529, - "step": 11680 - }, - { - "epoch": 14.45269016697588, - "grad_norm": 0.427734375, - "learning_rate": 7.837722355063637e-07, - "loss": 0.8594, - "step": 11685 - }, - { - "epoch": 14.458874458874458, - "grad_norm": 0.435546875, - "learning_rate": 7.658809356923424e-07, - "loss": 0.8562, - "step": 11690 - }, - { - "epoch": 14.465058750773036, - "grad_norm": 0.41796875, - "learning_rate": 7.481954193260143e-07, - "loss": 0.8563, - "step": 11695 - }, - { - "epoch": 14.471243042671613, - "grad_norm": 0.4140625, - "learning_rate": 7.307157230821426e-07, - "loss": 0.8515, - "step": 11700 - }, - { - "epoch": 14.477427334570192, - "grad_norm": 0.455078125, - "learning_rate": 7.134418832086653e-07, - "loss": 0.858, - "step": 11705 - }, - { - "epoch": 14.48361162646877, - "grad_norm": 0.431640625, - "learning_rate": 6.963739355266286e-07, - "loss": 0.8608, - "step": 11710 - }, - { - "epoch": 14.489795918367347, - "grad_norm": 0.423828125, - "learning_rate": 6.7951191543012e-07, - "loss": 0.8662, - "step": 11715 - }, - { - "epoch": 14.495980210265925, - "grad_norm": 0.455078125, - "learning_rate": 6.628558578862021e-07, - "loss": 0.86, - "step": 11720 - }, - { - "epoch": 14.502164502164502, - "grad_norm": 0.431640625, - "learning_rate": 6.464057974348014e-07, - "loss": 0.8587, - "step": 11725 - }, - { - "epoch": 14.50834879406308, - "grad_norm": 0.44921875, - "learning_rate": 6.301617681886863e-07, - "loss": 0.859, - "step": 11730 - }, - { - "epoch": 14.514533085961657, - "grad_norm": 0.408203125, - "learning_rate": 6.141238038333885e-07, - "loss": 0.8607, - "step": 11735 - }, - { - "epoch": 14.520717377860235, - "grad_norm": 0.4140625, - "learning_rate": 5.982919376270823e-07, - "loss": 0.865, - "step": 11740 - }, - { - "epoch": 14.526901669758812, - "grad_norm": 0.4296875, - "learning_rate": 5.826662024005835e-07, - "loss": 0.864, - "step": 11745 - }, - { - "epoch": 14.53308596165739, - "grad_norm": 0.41796875, - "learning_rate": 5.672466305572388e-07, - "loss": 0.8474, - "step": 11750 - }, - { - "epoch": 14.539270253555967, - "grad_norm": 0.421875, - "learning_rate": 5.52033254072859e-07, - "loss": 0.8571, - "step": 11755 - }, - { - "epoch": 14.545454545454545, - "grad_norm": 0.43359375, - "learning_rate": 5.370261044956971e-07, - "loss": 0.852, - "step": 11760 - }, - { - "epoch": 14.551638837353122, - "grad_norm": 0.419921875, - "learning_rate": 5.222252129463146e-07, - "loss": 0.8602, - "step": 11765 - }, - { - "epoch": 14.5578231292517, - "grad_norm": 0.43359375, - "learning_rate": 5.07630610117582e-07, - "loss": 0.8637, - "step": 11770 - }, - { - "epoch": 14.564007421150277, - "grad_norm": 0.419921875, - "learning_rate": 4.932423262745456e-07, - "loss": 0.8446, - "step": 11775 - }, - { - "epoch": 14.570191713048857, - "grad_norm": 0.4140625, - "learning_rate": 4.790603912544489e-07, - "loss": 0.8551, - "step": 11780 - }, - { - "epoch": 14.576376004947434, - "grad_norm": 0.4375, - "learning_rate": 4.6508483446661144e-07, - "loss": 0.8601, - "step": 11785 - }, - { - "epoch": 14.582560296846012, - "grad_norm": 0.4296875, - "learning_rate": 4.5131568489236166e-07, - "loss": 0.862, - "step": 11790 - }, - { - "epoch": 14.58874458874459, - "grad_norm": 0.421875, - "learning_rate": 4.377529710850259e-07, - "loss": 0.8592, - "step": 11795 - }, - { - "epoch": 14.594928880643167, - "grad_norm": 0.427734375, - "learning_rate": 4.2439672116982855e-07, - "loss": 0.8491, - "step": 11800 - }, - { - "epoch": 14.601113172541744, - "grad_norm": 0.4296875, - "learning_rate": 4.112469628438365e-07, - "loss": 0.8512, - "step": 11805 - }, - { - "epoch": 14.607297464440322, - "grad_norm": 0.427734375, - "learning_rate": 3.983037233759368e-07, - "loss": 0.8562, - "step": 11810 - }, - { - "epoch": 14.6134817563389, - "grad_norm": 0.421875, - "learning_rate": 3.8556702960673706e-07, - "loss": 0.8498, - "step": 11815 - }, - { - "epoch": 14.619666048237477, - "grad_norm": 0.400390625, - "learning_rate": 3.73036907948543e-07, - "loss": 0.8548, - "step": 11820 - }, - { - "epoch": 14.625850340136054, - "grad_norm": 0.421875, - "learning_rate": 3.6071338438524726e-07, - "loss": 0.8642, - "step": 11825 - }, - { - "epoch": 14.632034632034632, - "grad_norm": 0.44140625, - "learning_rate": 3.485964844723744e-07, - "loss": 0.8464, - "step": 11830 - }, - { - "epoch": 14.63821892393321, - "grad_norm": 0.419921875, - "learning_rate": 3.366862333369358e-07, - "loss": 0.8568, - "step": 11835 - }, - { - "epoch": 14.644403215831787, - "grad_norm": 0.421875, - "learning_rate": 3.2498265567739717e-07, - "loss": 0.8662, - "step": 11840 - }, - { - "epoch": 14.650587507730364, - "grad_norm": 0.431640625, - "learning_rate": 3.134857757636889e-07, - "loss": 0.8558, - "step": 11845 - }, - { - "epoch": 14.656771799628942, - "grad_norm": 0.404296875, - "learning_rate": 3.0219561743707326e-07, - "loss": 0.8455, - "step": 11850 - }, - { - "epoch": 14.66295609152752, - "grad_norm": 0.453125, - "learning_rate": 2.9111220411014437e-07, - "loss": 0.8558, - "step": 11855 - }, - { - "epoch": 14.669140383426098, - "grad_norm": 0.4609375, - "learning_rate": 2.8023555876673937e-07, - "loss": 0.85, - "step": 11860 - }, - { - "epoch": 14.675324675324676, - "grad_norm": 0.41796875, - "learning_rate": 2.6956570396197143e-07, - "loss": 0.848, - "step": 11865 - }, - { - "epoch": 14.681508967223253, - "grad_norm": 0.427734375, - "learning_rate": 2.5910266182207486e-07, - "loss": 0.8651, - "step": 11870 - }, - { - "epoch": 14.687693259121831, - "grad_norm": 0.4140625, - "learning_rate": 2.4884645404443795e-07, - "loss": 0.8467, - "step": 11875 - }, - { - "epoch": 14.693877551020408, - "grad_norm": 0.416015625, - "learning_rate": 2.3879710189753656e-07, - "loss": 0.8597, - "step": 11880 - }, - { - "epoch": 14.700061842918986, - "grad_norm": 0.41015625, - "learning_rate": 2.289546262208786e-07, - "loss": 0.8578, - "step": 11885 - }, - { - "epoch": 14.706246134817563, - "grad_norm": 0.4453125, - "learning_rate": 2.1931904742495957e-07, - "loss": 0.8528, - "step": 11890 - }, - { - "epoch": 14.712430426716141, - "grad_norm": 0.435546875, - "learning_rate": 2.098903854912515e-07, - "loss": 0.8549, - "step": 11895 - }, - { - "epoch": 14.718614718614718, - "grad_norm": 0.423828125, - "learning_rate": 2.0066865997212525e-07, - "loss": 0.8537, - "step": 11900 - }, - { - "epoch": 14.724799010513296, - "grad_norm": 0.408203125, - "learning_rate": 1.9165388999082822e-07, - "loss": 0.8617, - "step": 11905 - }, - { - "epoch": 14.730983302411873, - "grad_norm": 0.447265625, - "learning_rate": 1.8284609424142895e-07, - "loss": 0.858, - "step": 11910 - }, - { - "epoch": 14.737167594310451, - "grad_norm": 0.4296875, - "learning_rate": 1.7424529098881703e-07, - "loss": 0.8557, - "step": 11915 - }, - { - "epoch": 14.743351886209028, - "grad_norm": 0.4453125, - "learning_rate": 1.6585149806860324e-07, - "loss": 0.8588, - "step": 11920 - }, - { - "epoch": 14.749536178107606, - "grad_norm": 0.419921875, - "learning_rate": 1.5766473288715278e-07, - "loss": 0.8595, - "step": 11925 - }, - { - "epoch": 14.755720470006183, - "grad_norm": 0.41015625, - "learning_rate": 1.4968501242148547e-07, - "loss": 0.8498, - "step": 11930 - }, - { - "epoch": 14.761904761904763, - "grad_norm": 0.435546875, - "learning_rate": 1.4191235321928676e-07, - "loss": 0.8676, - "step": 11935 - }, - { - "epoch": 14.76808905380334, - "grad_norm": 0.43359375, - "learning_rate": 1.3434677139885222e-07, - "loss": 0.8537, - "step": 11940 - }, - { - "epoch": 14.774273345701918, - "grad_norm": 0.435546875, - "learning_rate": 1.2698828264904317e-07, - "loss": 0.8622, - "step": 11945 - }, - { - "epoch": 14.780457637600495, - "grad_norm": 0.4609375, - "learning_rate": 1.1983690222929778e-07, - "loss": 0.8581, - "step": 11950 - }, - { - "epoch": 14.786641929499073, - "grad_norm": 0.4296875, - "learning_rate": 1.1289264496953111e-07, - "loss": 0.8554, - "step": 11955 - }, - { - "epoch": 14.79282622139765, - "grad_norm": 0.4453125, - "learning_rate": 1.0615552527017958e-07, - "loss": 0.8502, - "step": 11960 - }, - { - "epoch": 14.799010513296228, - "grad_norm": 0.419921875, - "learning_rate": 9.962555710212318e-08, - "loss": 0.8599, - "step": 11965 - }, - { - "epoch": 14.805194805194805, - "grad_norm": 0.4453125, - "learning_rate": 9.330275400666332e-08, - "loss": 0.8524, - "step": 11970 - }, - { - "epoch": 14.811379097093383, - "grad_norm": 0.42578125, - "learning_rate": 8.718712909548953e-08, - "loss": 0.8665, - "step": 11975 - }, - { - "epoch": 14.81756338899196, - "grad_norm": 0.42578125, - "learning_rate": 8.127869505069053e-08, - "loss": 0.8573, - "step": 11980 - }, - { - "epoch": 14.823747680890538, - "grad_norm": 0.404296875, - "learning_rate": 7.557746412468758e-08, - "loss": 0.8544, - "step": 11985 - }, - { - "epoch": 14.829931972789115, - "grad_norm": 0.404296875, - "learning_rate": 7.00834481402013e-08, - "loss": 0.8494, - "step": 11990 - }, - { - "epoch": 14.836116264687693, - "grad_norm": 0.4296875, - "learning_rate": 6.479665849027372e-08, - "loss": 0.8539, - "step": 11995 - }, - { - "epoch": 14.84230055658627, - "grad_norm": 0.427734375, - "learning_rate": 5.971710613821291e-08, - "loss": 0.8534, - "step": 12000 - }, - { - "epoch": 14.848484848484848, - "grad_norm": 0.423828125, - "learning_rate": 5.484480161755956e-08, - "loss": 0.8428, - "step": 12005 - }, - { - "epoch": 14.854669140383425, - "grad_norm": 0.455078125, - "learning_rate": 5.0179755032109253e-08, - "loss": 0.8615, - "step": 12010 - }, - { - "epoch": 14.860853432282005, - "grad_norm": 0.43359375, - "learning_rate": 4.572197605583473e-08, - "loss": 0.8527, - "step": 12015 - }, - { - "epoch": 14.867037724180582, - "grad_norm": 0.4375, - "learning_rate": 4.147147393290807e-08, - "loss": 0.861, - "step": 12020 - }, - { - "epoch": 14.87322201607916, - "grad_norm": 0.42578125, - "learning_rate": 3.742825747766743e-08, - "loss": 0.8517, - "step": 12025 - }, - { - "epoch": 14.879406307977737, - "grad_norm": 0.408203125, - "learning_rate": 3.359233507459481e-08, - "loss": 0.8382, - "step": 12030 - }, - { - "epoch": 14.885590599876314, - "grad_norm": 0.447265625, - "learning_rate": 2.9963714678316045e-08, - "loss": 0.8497, - "step": 12035 - }, - { - "epoch": 14.891774891774892, - "grad_norm": 0.412109375, - "learning_rate": 2.6542403813545334e-08, - "loss": 0.8573, - "step": 12040 - }, - { - "epoch": 14.89795918367347, - "grad_norm": 0.416015625, - "learning_rate": 2.3328409575129608e-08, - "loss": 0.8541, - "step": 12045 - }, - { - "epoch": 14.904143475572047, - "grad_norm": 0.4375, - "learning_rate": 2.0321738627981923e-08, - "loss": 0.856, - "step": 12050 - }, - { - "epoch": 14.910327767470624, - "grad_norm": 0.43359375, - "learning_rate": 1.7522397207070383e-08, - "loss": 0.8598, - "step": 12055 - }, - { - "epoch": 14.916512059369202, - "grad_norm": 0.45703125, - "learning_rate": 1.4930391117451426e-08, - "loss": 0.8526, - "step": 12060 - }, - { - "epoch": 14.92269635126778, - "grad_norm": 0.419921875, - "learning_rate": 1.2545725734192103e-08, - "loss": 0.8637, - "step": 12065 - }, - { - "epoch": 14.928880643166357, - "grad_norm": 0.439453125, - "learning_rate": 1.0368406002436715e-08, - "loss": 0.8585, - "step": 12070 - }, - { - "epoch": 14.935064935064934, - "grad_norm": 0.412109375, - "learning_rate": 8.398436437317969e-09, - "loss": 0.8521, - "step": 12075 - }, - { - "epoch": 14.941249226963512, - "grad_norm": 0.416015625, - "learning_rate": 6.635821124001406e-09, - "loss": 0.8555, - "step": 12080 - }, - { - "epoch": 14.94743351886209, - "grad_norm": 0.423828125, - "learning_rate": 5.080563717629882e-09, - "loss": 0.8507, - "step": 12085 - }, - { - "epoch": 14.953617810760669, - "grad_norm": 0.431640625, - "learning_rate": 3.732667443390181e-09, - "loss": 0.8526, - "step": 12090 - }, - { - "epoch": 14.959802102659246, - "grad_norm": 0.431640625, - "learning_rate": 2.5921350964352997e-09, - "loss": 0.8538, - "step": 12095 - }, - { - "epoch": 14.965986394557824, - "grad_norm": 0.447265625, - "learning_rate": 1.6589690418955528e-09, - "loss": 0.8587, - "step": 12100 - }, - { - "epoch": 14.972170686456401, - "grad_norm": 0.427734375, - "learning_rate": 9.33171214889672e-10, - "loss": 0.8486, - "step": 12105 - }, - { - "epoch": 14.978354978354979, - "grad_norm": 0.421875, - "learning_rate": 4.147431205359098e-10, - "loss": 0.8622, - "step": 12110 - }, - { - "epoch": 14.984539270253556, - "grad_norm": 0.4453125, - "learning_rate": 1.0368583388542519e-10, - "loss": 0.8548, - "step": 12115 - }, - { - "epoch": 14.990723562152134, - "grad_norm": 0.431640625, - "learning_rate": 0.0, - "loss": 0.8485, - "step": 12120 - }, - { - "epoch": 14.990723562152134, - "eval_loss": 2.520606517791748, - "eval_runtime": 0.5387, - "eval_samples_per_second": 18.562, - "eval_steps_per_second": 1.856, - "step": 12120 - }, - { - "epoch": 14.990723562152134, - "step": 12120, - "total_flos": 7.118964864348848e+18, - "train_loss": 0.9255497357436139, - "train_runtime": 41659.6506, - "train_samples_per_second": 13.97, - "train_steps_per_second": 0.291 + "epoch": 9.993815708101423, + "step": 8080, + "total_flos": 4.816075145514844e+18, + "train_loss": 0.8524611343841741, + "train_runtime": 49320.3929, + "train_samples_per_second": 7.867, + "train_steps_per_second": 0.164 } ], "logging_steps": 5, - "max_steps": 12120, + "max_steps": 8080, "num_input_tokens_seen": 0, - "num_train_epochs": 15, + "num_train_epochs": 10, "save_steps": 100, - "total_flos": 7.118964864348848e+18, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.816075145514844e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null