diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10387 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 184, + "global_step": 1470, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013605442176870747, + "grad_norm": 0.24701461672354935, + "learning_rate": 1.36986301369863e-07, + "loss": 1.6736, + "step": 1 + }, + { + "epoch": 0.0013605442176870747, + "eval_loss": 1.7904456853866577, + "eval_runtime": 75.582, + "eval_samples_per_second": 53.888, + "eval_steps_per_second": 6.748, + "step": 1 + }, + { + "epoch": 0.0027210884353741495, + "grad_norm": 0.21437113339785932, + "learning_rate": 2.73972602739726e-07, + "loss": 1.5884, + "step": 2 + }, + { + "epoch": 0.004081632653061225, + "grad_norm": 0.3228668200940542, + "learning_rate": 4.1095890410958903e-07, + "loss": 1.6821, + "step": 3 + }, + { + "epoch": 0.005442176870748299, + "grad_norm": 0.19408831616689562, + "learning_rate": 5.47945205479452e-07, + "loss": 1.8146, + "step": 4 + }, + { + "epoch": 0.006802721088435374, + "grad_norm": 0.18446566294319683, + "learning_rate": 6.849315068493151e-07, + "loss": 1.6316, + "step": 5 + }, + { + "epoch": 0.00816326530612245, + "grad_norm": 0.26237580245842185, + "learning_rate": 8.219178082191781e-07, + "loss": 1.7544, + "step": 6 + }, + { + "epoch": 0.009523809523809525, + "grad_norm": 0.1659195721310037, + "learning_rate": 9.589041095890411e-07, + "loss": 1.8325, + "step": 7 + }, + { + "epoch": 0.010884353741496598, + "grad_norm": 0.14112003912821341, + "learning_rate": 1.095890410958904e-06, + "loss": 1.8533, + "step": 8 + }, + { + "epoch": 0.012244897959183673, + "grad_norm": 0.22295406766041573, + "learning_rate": 1.2328767123287673e-06, + "loss": 1.7309, + "step": 9 + }, + { + "epoch": 0.013605442176870748, + "grad_norm": 0.20855919407710727, + "learning_rate": 1.3698630136986302e-06, + "loss": 1.4983, + "step": 10 + }, + { + "epoch": 0.014965986394557823, + "grad_norm": 0.39634451341504184, + "learning_rate": 1.5068493150684932e-06, + "loss": 1.71, + "step": 11 + }, + { + "epoch": 0.0163265306122449, + "grad_norm": 0.2918296142957545, + "learning_rate": 1.6438356164383561e-06, + "loss": 1.6983, + "step": 12 + }, + { + "epoch": 0.017687074829931974, + "grad_norm": 0.3333249210865954, + "learning_rate": 1.7808219178082193e-06, + "loss": 1.6435, + "step": 13 + }, + { + "epoch": 0.01904761904761905, + "grad_norm": 0.3288930419026758, + "learning_rate": 1.9178082191780823e-06, + "loss": 1.9445, + "step": 14 + }, + { + "epoch": 0.02040816326530612, + "grad_norm": 0.3311742875918285, + "learning_rate": 2.0547945205479454e-06, + "loss": 1.8007, + "step": 15 + }, + { + "epoch": 0.021768707482993196, + "grad_norm": 0.24222843258421317, + "learning_rate": 2.191780821917808e-06, + "loss": 1.8698, + "step": 16 + }, + { + "epoch": 0.02312925170068027, + "grad_norm": 0.2863215351075517, + "learning_rate": 2.3287671232876713e-06, + "loss": 1.8295, + "step": 17 + }, + { + "epoch": 0.024489795918367346, + "grad_norm": 0.37000991286313667, + "learning_rate": 2.4657534246575345e-06, + "loss": 1.7748, + "step": 18 + }, + { + "epoch": 0.02585034013605442, + "grad_norm": 0.305345665951125, + "learning_rate": 2.6027397260273973e-06, + "loss": 1.7799, + "step": 19 + }, + { + "epoch": 0.027210884353741496, + "grad_norm": 0.276577651886119, + "learning_rate": 2.7397260273972604e-06, + "loss": 1.5892, + "step": 20 + }, + { + "epoch": 0.02857142857142857, + "grad_norm": 0.40747672861545675, + "learning_rate": 2.876712328767123e-06, + "loss": 1.797, + "step": 21 + }, + { + "epoch": 0.029931972789115645, + "grad_norm": 0.1663214297242309, + "learning_rate": 3.0136986301369864e-06, + "loss": 1.8254, + "step": 22 + }, + { + "epoch": 0.031292517006802724, + "grad_norm": 0.34875514252556655, + "learning_rate": 3.1506849315068495e-06, + "loss": 1.5967, + "step": 23 + }, + { + "epoch": 0.0326530612244898, + "grad_norm": 0.31488445168418, + "learning_rate": 3.2876712328767123e-06, + "loss": 1.8033, + "step": 24 + }, + { + "epoch": 0.034013605442176874, + "grad_norm": 0.9585107293220959, + "learning_rate": 3.4246575342465754e-06, + "loss": 1.9985, + "step": 25 + }, + { + "epoch": 0.03537414965986395, + "grad_norm": 0.4719659909416967, + "learning_rate": 3.5616438356164386e-06, + "loss": 1.6673, + "step": 26 + }, + { + "epoch": 0.036734693877551024, + "grad_norm": 0.5206398105101208, + "learning_rate": 3.6986301369863014e-06, + "loss": 1.7832, + "step": 27 + }, + { + "epoch": 0.0380952380952381, + "grad_norm": 0.5525391513084628, + "learning_rate": 3.8356164383561645e-06, + "loss": 1.8033, + "step": 28 + }, + { + "epoch": 0.03945578231292517, + "grad_norm": 0.5864368554335787, + "learning_rate": 3.972602739726027e-06, + "loss": 1.637, + "step": 29 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 0.18211390682659326, + "learning_rate": 4.109589041095891e-06, + "loss": 1.6996, + "step": 30 + }, + { + "epoch": 0.04217687074829932, + "grad_norm": 0.26324481615027445, + "learning_rate": 4.246575342465754e-06, + "loss": 1.7077, + "step": 31 + }, + { + "epoch": 0.04353741496598639, + "grad_norm": 0.487665052197852, + "learning_rate": 4.383561643835616e-06, + "loss": 1.5757, + "step": 32 + }, + { + "epoch": 0.044897959183673466, + "grad_norm": 0.5110956602957011, + "learning_rate": 4.52054794520548e-06, + "loss": 1.6525, + "step": 33 + }, + { + "epoch": 0.04625850340136054, + "grad_norm": 0.41488349790070234, + "learning_rate": 4.657534246575343e-06, + "loss": 1.7469, + "step": 34 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.3205549447320179, + "learning_rate": 4.7945205479452054e-06, + "loss": 1.6621, + "step": 35 + }, + { + "epoch": 0.04897959183673469, + "grad_norm": 0.2759804237950767, + "learning_rate": 4.931506849315069e-06, + "loss": 1.8364, + "step": 36 + }, + { + "epoch": 0.050340136054421766, + "grad_norm": 0.4070079284746193, + "learning_rate": 5.068493150684932e-06, + "loss": 1.7928, + "step": 37 + }, + { + "epoch": 0.05170068027210884, + "grad_norm": 0.3162452736080499, + "learning_rate": 5.2054794520547945e-06, + "loss": 1.8174, + "step": 38 + }, + { + "epoch": 0.053061224489795916, + "grad_norm": 0.339190852848117, + "learning_rate": 5.342465753424658e-06, + "loss": 1.8372, + "step": 39 + }, + { + "epoch": 0.05442176870748299, + "grad_norm": 0.31599130496764827, + "learning_rate": 5.479452054794521e-06, + "loss": 1.7265, + "step": 40 + }, + { + "epoch": 0.055782312925170066, + "grad_norm": 0.18290357316608127, + "learning_rate": 5.6164383561643845e-06, + "loss": 1.7055, + "step": 41 + }, + { + "epoch": 0.05714285714285714, + "grad_norm": 0.26450493295787797, + "learning_rate": 5.753424657534246e-06, + "loss": 1.859, + "step": 42 + }, + { + "epoch": 0.058503401360544216, + "grad_norm": 0.2105468885683211, + "learning_rate": 5.89041095890411e-06, + "loss": 1.7903, + "step": 43 + }, + { + "epoch": 0.05986394557823129, + "grad_norm": 0.21904274744659627, + "learning_rate": 6.027397260273973e-06, + "loss": 1.7112, + "step": 44 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.2766631664495227, + "learning_rate": 6.164383561643836e-06, + "loss": 1.626, + "step": 45 + }, + { + "epoch": 0.06258503401360545, + "grad_norm": 0.27137304801321466, + "learning_rate": 6.301369863013699e-06, + "loss": 1.8546, + "step": 46 + }, + { + "epoch": 0.06394557823129252, + "grad_norm": 0.17562873404669305, + "learning_rate": 6.438356164383563e-06, + "loss": 1.8687, + "step": 47 + }, + { + "epoch": 0.0653061224489796, + "grad_norm": 0.23608638226381062, + "learning_rate": 6.5753424657534245e-06, + "loss": 1.5768, + "step": 48 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.12395160133391969, + "learning_rate": 6.712328767123288e-06, + "loss": 1.8217, + "step": 49 + }, + { + "epoch": 0.06802721088435375, + "grad_norm": 0.21069127406909471, + "learning_rate": 6.849315068493151e-06, + "loss": 1.7057, + "step": 50 + }, + { + "epoch": 0.06938775510204082, + "grad_norm": 0.17153884217244356, + "learning_rate": 6.9863013698630145e-06, + "loss": 1.9143, + "step": 51 + }, + { + "epoch": 0.0707482993197279, + "grad_norm": 0.3084343242877715, + "learning_rate": 7.123287671232877e-06, + "loss": 1.8398, + "step": 52 + }, + { + "epoch": 0.07210884353741497, + "grad_norm": 0.14644662918576262, + "learning_rate": 7.260273972602741e-06, + "loss": 1.6646, + "step": 53 + }, + { + "epoch": 0.07346938775510205, + "grad_norm": 0.3001793602079481, + "learning_rate": 7.397260273972603e-06, + "loss": 1.689, + "step": 54 + }, + { + "epoch": 0.07482993197278912, + "grad_norm": 0.301851334470962, + "learning_rate": 7.534246575342466e-06, + "loss": 1.5179, + "step": 55 + }, + { + "epoch": 0.0761904761904762, + "grad_norm": 0.33200247196496224, + "learning_rate": 7.671232876712329e-06, + "loss": 1.8986, + "step": 56 + }, + { + "epoch": 0.07755102040816327, + "grad_norm": 0.18181195505623798, + "learning_rate": 7.808219178082192e-06, + "loss": 1.6426, + "step": 57 + }, + { + "epoch": 0.07891156462585033, + "grad_norm": 0.12250708549849011, + "learning_rate": 7.945205479452055e-06, + "loss": 1.6214, + "step": 58 + }, + { + "epoch": 0.08027210884353742, + "grad_norm": 0.09796847494385076, + "learning_rate": 8.082191780821919e-06, + "loss": 1.6547, + "step": 59 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.12998919923759888, + "learning_rate": 8.219178082191782e-06, + "loss": 1.7818, + "step": 60 + }, + { + "epoch": 0.08299319727891157, + "grad_norm": 0.2260386111575877, + "learning_rate": 8.356164383561644e-06, + "loss": 1.7807, + "step": 61 + }, + { + "epoch": 0.08435374149659863, + "grad_norm": 0.33754760373428094, + "learning_rate": 8.493150684931507e-06, + "loss": 1.617, + "step": 62 + }, + { + "epoch": 0.08571428571428572, + "grad_norm": 0.35962963555168737, + "learning_rate": 8.63013698630137e-06, + "loss": 1.6799, + "step": 63 + }, + { + "epoch": 0.08707482993197278, + "grad_norm": 0.32506967541048193, + "learning_rate": 8.767123287671233e-06, + "loss": 1.6454, + "step": 64 + }, + { + "epoch": 0.08843537414965986, + "grad_norm": 0.21523079823600388, + "learning_rate": 8.904109589041097e-06, + "loss": 1.8856, + "step": 65 + }, + { + "epoch": 0.08979591836734693, + "grad_norm": 0.5363358811064897, + "learning_rate": 9.04109589041096e-06, + "loss": 1.6952, + "step": 66 + }, + { + "epoch": 0.09115646258503401, + "grad_norm": 0.14306066721600327, + "learning_rate": 9.178082191780823e-06, + "loss": 1.8208, + "step": 67 + }, + { + "epoch": 0.09251700680272108, + "grad_norm": 0.18646957264381078, + "learning_rate": 9.315068493150685e-06, + "loss": 1.7517, + "step": 68 + }, + { + "epoch": 0.09387755102040816, + "grad_norm": 0.19137982075531637, + "learning_rate": 9.452054794520548e-06, + "loss": 1.6456, + "step": 69 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.15987203027468555, + "learning_rate": 9.589041095890411e-06, + "loss": 1.7148, + "step": 70 + }, + { + "epoch": 0.09659863945578231, + "grad_norm": 0.16311504243422864, + "learning_rate": 9.726027397260275e-06, + "loss": 1.6627, + "step": 71 + }, + { + "epoch": 0.09795918367346938, + "grad_norm": 0.10186314299964105, + "learning_rate": 9.863013698630138e-06, + "loss": 1.5856, + "step": 72 + }, + { + "epoch": 0.09931972789115646, + "grad_norm": 0.13469761876363148, + "learning_rate": 1e-05, + "loss": 1.6557, + "step": 73 + }, + { + "epoch": 0.10068027210884353, + "grad_norm": 0.11568418682806415, + "learning_rate": 9.999987357098372e-06, + "loss": 1.7807, + "step": 74 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.11288388506482096, + "learning_rate": 9.999949428457423e-06, + "loss": 1.8232, + "step": 75 + }, + { + "epoch": 0.10340136054421768, + "grad_norm": 0.16329859637421754, + "learning_rate": 9.999886214268967e-06, + "loss": 1.7462, + "step": 76 + }, + { + "epoch": 0.10476190476190476, + "grad_norm": 0.20231664635671653, + "learning_rate": 9.999797714852686e-06, + "loss": 1.5938, + "step": 77 + }, + { + "epoch": 0.10612244897959183, + "grad_norm": 0.34538065180937266, + "learning_rate": 9.999683930656135e-06, + "loss": 1.8806, + "step": 78 + }, + { + "epoch": 0.10748299319727891, + "grad_norm": 0.13354157904043504, + "learning_rate": 9.999544862254743e-06, + "loss": 1.801, + "step": 79 + }, + { + "epoch": 0.10884353741496598, + "grad_norm": 0.13220305876865404, + "learning_rate": 9.999380510351796e-06, + "loss": 1.6805, + "step": 80 + }, + { + "epoch": 0.11020408163265306, + "grad_norm": 0.13768110879863274, + "learning_rate": 9.999190875778452e-06, + "loss": 1.7481, + "step": 81 + }, + { + "epoch": 0.11156462585034013, + "grad_norm": 0.11222690770456831, + "learning_rate": 9.998975959493722e-06, + "loss": 1.7894, + "step": 82 + }, + { + "epoch": 0.11292517006802721, + "grad_norm": 0.11775170157819592, + "learning_rate": 9.998735762584471e-06, + "loss": 1.8592, + "step": 83 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.20855277686570553, + "learning_rate": 9.998470286265415e-06, + "loss": 1.7145, + "step": 84 + }, + { + "epoch": 0.11564625850340136, + "grad_norm": 0.10682809945125131, + "learning_rate": 9.998179531879112e-06, + "loss": 1.7563, + "step": 85 + }, + { + "epoch": 0.11700680272108843, + "grad_norm": 0.1332681057101403, + "learning_rate": 9.99786350089595e-06, + "loss": 1.6698, + "step": 86 + }, + { + "epoch": 0.11836734693877551, + "grad_norm": 0.1442352006249483, + "learning_rate": 9.99752219491415e-06, + "loss": 1.542, + "step": 87 + }, + { + "epoch": 0.11972789115646258, + "grad_norm": 0.09723976872539679, + "learning_rate": 9.997155615659753e-06, + "loss": 1.5545, + "step": 88 + }, + { + "epoch": 0.12108843537414966, + "grad_norm": 0.15078850009122496, + "learning_rate": 9.996763764986606e-06, + "loss": 1.6872, + "step": 89 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 0.09880013032692718, + "learning_rate": 9.996346644876363e-06, + "loss": 1.5761, + "step": 90 + }, + { + "epoch": 0.12380952380952381, + "grad_norm": 0.1797981570168221, + "learning_rate": 9.995904257438467e-06, + "loss": 1.5885, + "step": 91 + }, + { + "epoch": 0.1251700680272109, + "grad_norm": 0.14066405347976094, + "learning_rate": 9.995436604910142e-06, + "loss": 1.7558, + "step": 92 + }, + { + "epoch": 0.12653061224489795, + "grad_norm": 0.2804984380485241, + "learning_rate": 9.994943689656381e-06, + "loss": 1.5653, + "step": 93 + }, + { + "epoch": 0.12789115646258503, + "grad_norm": 0.09802426112688165, + "learning_rate": 9.994425514169938e-06, + "loss": 1.8666, + "step": 94 + }, + { + "epoch": 0.1292517006802721, + "grad_norm": 0.2640163991220947, + "learning_rate": 9.993882081071307e-06, + "loss": 1.8331, + "step": 95 + }, + { + "epoch": 0.1306122448979592, + "grad_norm": 0.12584718580988416, + "learning_rate": 9.99331339310872e-06, + "loss": 1.7264, + "step": 96 + }, + { + "epoch": 0.13197278911564625, + "grad_norm": 0.11723300893007116, + "learning_rate": 9.99271945315812e-06, + "loss": 1.774, + "step": 97 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.11104245778454394, + "learning_rate": 9.992100264223156e-06, + "loss": 1.7154, + "step": 98 + }, + { + "epoch": 0.1346938775510204, + "grad_norm": 0.0915644970371204, + "learning_rate": 9.99145582943517e-06, + "loss": 1.6768, + "step": 99 + }, + { + "epoch": 0.1360544217687075, + "grad_norm": 0.11971918094721708, + "learning_rate": 9.990786152053169e-06, + "loss": 1.895, + "step": 100 + }, + { + "epoch": 0.13741496598639455, + "grad_norm": 0.13849974347702929, + "learning_rate": 9.99009123546382e-06, + "loss": 1.9232, + "step": 101 + }, + { + "epoch": 0.13877551020408163, + "grad_norm": 0.0832290902024341, + "learning_rate": 9.98937108318143e-06, + "loss": 1.419, + "step": 102 + }, + { + "epoch": 0.1401360544217687, + "grad_norm": 0.09490309244168035, + "learning_rate": 9.988625698847921e-06, + "loss": 1.6096, + "step": 103 + }, + { + "epoch": 0.1414965986394558, + "grad_norm": 0.08634281151584555, + "learning_rate": 9.987855086232824e-06, + "loss": 1.6766, + "step": 104 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.12657846070776754, + "learning_rate": 9.98705924923325e-06, + "loss": 1.7755, + "step": 105 + }, + { + "epoch": 0.14421768707482993, + "grad_norm": 0.1730231080244019, + "learning_rate": 9.986238191873874e-06, + "loss": 1.671, + "step": 106 + }, + { + "epoch": 0.145578231292517, + "grad_norm": 0.11653855558191023, + "learning_rate": 9.985391918306915e-06, + "loss": 1.6012, + "step": 107 + }, + { + "epoch": 0.1469387755102041, + "grad_norm": 0.09868922955378823, + "learning_rate": 9.984520432812117e-06, + "loss": 1.8218, + "step": 108 + }, + { + "epoch": 0.14829931972789115, + "grad_norm": 0.08718149041105193, + "learning_rate": 9.983623739796718e-06, + "loss": 1.6361, + "step": 109 + }, + { + "epoch": 0.14965986394557823, + "grad_norm": 0.08536190731319725, + "learning_rate": 9.982701843795441e-06, + "loss": 1.8356, + "step": 110 + }, + { + "epoch": 0.1510204081632653, + "grad_norm": 0.1778419657439268, + "learning_rate": 9.981754749470463e-06, + "loss": 1.6968, + "step": 111 + }, + { + "epoch": 0.1523809523809524, + "grad_norm": 0.12982223254146993, + "learning_rate": 9.980782461611391e-06, + "loss": 1.8005, + "step": 112 + }, + { + "epoch": 0.15374149659863945, + "grad_norm": 0.08982117932691205, + "learning_rate": 9.979784985135239e-06, + "loss": 1.7645, + "step": 113 + }, + { + "epoch": 0.15510204081632653, + "grad_norm": 0.12460716696891104, + "learning_rate": 9.978762325086408e-06, + "loss": 1.6455, + "step": 114 + }, + { + "epoch": 0.1564625850340136, + "grad_norm": 0.09362932823935477, + "learning_rate": 9.977714486636657e-06, + "loss": 1.8083, + "step": 115 + }, + { + "epoch": 0.15782312925170067, + "grad_norm": 0.09099536634076917, + "learning_rate": 9.976641475085067e-06, + "loss": 1.7776, + "step": 116 + }, + { + "epoch": 0.15918367346938775, + "grad_norm": 0.08568595730791906, + "learning_rate": 9.975543295858035e-06, + "loss": 1.8846, + "step": 117 + }, + { + "epoch": 0.16054421768707483, + "grad_norm": 0.1310404323604523, + "learning_rate": 9.974419954509225e-06, + "loss": 1.5725, + "step": 118 + }, + { + "epoch": 0.1619047619047619, + "grad_norm": 0.11863021260862251, + "learning_rate": 9.97327145671956e-06, + "loss": 1.6409, + "step": 119 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.11864941995819639, + "learning_rate": 9.972097808297174e-06, + "loss": 1.7081, + "step": 120 + }, + { + "epoch": 0.16462585034013605, + "grad_norm": 0.08013610894171046, + "learning_rate": 9.970899015177398e-06, + "loss": 1.7804, + "step": 121 + }, + { + "epoch": 0.16598639455782313, + "grad_norm": 0.12399055095582327, + "learning_rate": 9.969675083422719e-06, + "loss": 1.6848, + "step": 122 + }, + { + "epoch": 0.1673469387755102, + "grad_norm": 0.1433779964353759, + "learning_rate": 9.96842601922276e-06, + "loss": 1.6888, + "step": 123 + }, + { + "epoch": 0.16870748299319727, + "grad_norm": 0.09915990901687576, + "learning_rate": 9.967151828894234e-06, + "loss": 1.7802, + "step": 124 + }, + { + "epoch": 0.17006802721088435, + "grad_norm": 0.10206449162778881, + "learning_rate": 9.965852518880931e-06, + "loss": 1.806, + "step": 125 + }, + { + "epoch": 0.17142857142857143, + "grad_norm": 0.09282143748721522, + "learning_rate": 9.964528095753669e-06, + "loss": 1.5987, + "step": 126 + }, + { + "epoch": 0.1727891156462585, + "grad_norm": 0.16907020113729054, + "learning_rate": 9.963178566210268e-06, + "loss": 1.7569, + "step": 127 + }, + { + "epoch": 0.17414965986394557, + "grad_norm": 0.08207848199751772, + "learning_rate": 9.961803937075516e-06, + "loss": 1.6724, + "step": 128 + }, + { + "epoch": 0.17551020408163265, + "grad_norm": 0.07319670048822571, + "learning_rate": 9.960404215301133e-06, + "loss": 1.7498, + "step": 129 + }, + { + "epoch": 0.17687074829931973, + "grad_norm": 0.08159880339274488, + "learning_rate": 9.958979407965738e-06, + "loss": 1.65, + "step": 130 + }, + { + "epoch": 0.1782312925170068, + "grad_norm": 0.09730054828361595, + "learning_rate": 9.95752952227481e-06, + "loss": 1.7796, + "step": 131 + }, + { + "epoch": 0.17959183673469387, + "grad_norm": 0.2629732363287427, + "learning_rate": 9.956054565560653e-06, + "loss": 1.6904, + "step": 132 + }, + { + "epoch": 0.18095238095238095, + "grad_norm": 0.07651749890098045, + "learning_rate": 9.954554545282363e-06, + "loss": 1.7809, + "step": 133 + }, + { + "epoch": 0.18231292517006803, + "grad_norm": 0.09628395581138101, + "learning_rate": 9.953029469025777e-06, + "loss": 1.8135, + "step": 134 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 0.09612376832963275, + "learning_rate": 9.951479344503459e-06, + "loss": 1.6617, + "step": 135 + }, + { + "epoch": 0.18503401360544217, + "grad_norm": 0.08107993061371403, + "learning_rate": 9.949904179554632e-06, + "loss": 1.6634, + "step": 136 + }, + { + "epoch": 0.18639455782312925, + "grad_norm": 0.07754512965459885, + "learning_rate": 9.94830398214516e-06, + "loss": 1.7732, + "step": 137 + }, + { + "epoch": 0.18775510204081633, + "grad_norm": 0.07265030754659244, + "learning_rate": 9.946678760367498e-06, + "loss": 1.7905, + "step": 138 + }, + { + "epoch": 0.1891156462585034, + "grad_norm": 0.09088517967487394, + "learning_rate": 9.945028522440654e-06, + "loss": 1.49, + "step": 139 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.21999926224724559, + "learning_rate": 9.943353276710146e-06, + "loss": 2.0726, + "step": 140 + }, + { + "epoch": 0.19183673469387755, + "grad_norm": 0.07397509235485085, + "learning_rate": 9.941653031647963e-06, + "loss": 1.6069, + "step": 141 + }, + { + "epoch": 0.19319727891156463, + "grad_norm": 0.17678430730401373, + "learning_rate": 9.939927795852513e-06, + "loss": 1.8128, + "step": 142 + }, + { + "epoch": 0.1945578231292517, + "grad_norm": 0.09311447920236875, + "learning_rate": 9.938177578048593e-06, + "loss": 1.682, + "step": 143 + }, + { + "epoch": 0.19591836734693877, + "grad_norm": 0.08923483853542422, + "learning_rate": 9.936402387087339e-06, + "loss": 1.7808, + "step": 144 + }, + { + "epoch": 0.19727891156462585, + "grad_norm": 0.3457260004062318, + "learning_rate": 9.93460223194617e-06, + "loss": 1.921, + "step": 145 + }, + { + "epoch": 0.19863945578231293, + "grad_norm": 0.11272086420065035, + "learning_rate": 9.932777121728765e-06, + "loss": 1.627, + "step": 146 + }, + { + "epoch": 0.2, + "grad_norm": 0.0828154138118513, + "learning_rate": 9.930927065664997e-06, + "loss": 1.85, + "step": 147 + }, + { + "epoch": 0.20136054421768707, + "grad_norm": 0.10871781486388528, + "learning_rate": 9.929052073110897e-06, + "loss": 1.8526, + "step": 148 + }, + { + "epoch": 0.20272108843537415, + "grad_norm": 0.08372164326475892, + "learning_rate": 9.927152153548605e-06, + "loss": 1.6184, + "step": 149 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.2301749352348319, + "learning_rate": 9.925227316586316e-06, + "loss": 1.6416, + "step": 150 + }, + { + "epoch": 0.2054421768707483, + "grad_norm": 0.11534866479323268, + "learning_rate": 9.923277571958245e-06, + "loss": 1.6587, + "step": 151 + }, + { + "epoch": 0.20680272108843537, + "grad_norm": 0.1411655046905855, + "learning_rate": 9.921302929524561e-06, + "loss": 1.671, + "step": 152 + }, + { + "epoch": 0.20816326530612245, + "grad_norm": 0.07211757999248616, + "learning_rate": 9.919303399271348e-06, + "loss": 1.7163, + "step": 153 + }, + { + "epoch": 0.20952380952380953, + "grad_norm": 0.0873746156242924, + "learning_rate": 9.917278991310553e-06, + "loss": 1.6367, + "step": 154 + }, + { + "epoch": 0.2108843537414966, + "grad_norm": 0.0819591281688772, + "learning_rate": 9.915229715879928e-06, + "loss": 1.6989, + "step": 155 + }, + { + "epoch": 0.21224489795918366, + "grad_norm": 0.08552981032847369, + "learning_rate": 9.913155583342994e-06, + "loss": 1.5244, + "step": 156 + }, + { + "epoch": 0.21360544217687075, + "grad_norm": 0.13550974122069206, + "learning_rate": 9.91105660418897e-06, + "loss": 1.7495, + "step": 157 + }, + { + "epoch": 0.21496598639455783, + "grad_norm": 0.07091163304804983, + "learning_rate": 9.908932789032729e-06, + "loss": 1.7387, + "step": 158 + }, + { + "epoch": 0.2163265306122449, + "grad_norm": 0.0838103140533003, + "learning_rate": 9.906784148614745e-06, + "loss": 1.7076, + "step": 159 + }, + { + "epoch": 0.21768707482993196, + "grad_norm": 0.11349611508198672, + "learning_rate": 9.904610693801042e-06, + "loss": 1.6596, + "step": 160 + }, + { + "epoch": 0.21904761904761905, + "grad_norm": 0.07733122749252737, + "learning_rate": 9.902412435583127e-06, + "loss": 1.6503, + "step": 161 + }, + { + "epoch": 0.22040816326530613, + "grad_norm": 0.14625572340923682, + "learning_rate": 9.900189385077948e-06, + "loss": 1.564, + "step": 162 + }, + { + "epoch": 0.2217687074829932, + "grad_norm": 0.09712144690644532, + "learning_rate": 9.897941553527823e-06, + "loss": 1.7217, + "step": 163 + }, + { + "epoch": 0.22312925170068026, + "grad_norm": 0.0712274015908157, + "learning_rate": 9.895668952300403e-06, + "loss": 1.6412, + "step": 164 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 0.08811945291100708, + "learning_rate": 9.893371592888594e-06, + "loss": 1.6192, + "step": 165 + }, + { + "epoch": 0.22585034013605443, + "grad_norm": 0.07563751954927482, + "learning_rate": 9.891049486910513e-06, + "loss": 1.6283, + "step": 166 + }, + { + "epoch": 0.2272108843537415, + "grad_norm": 0.07473029887668768, + "learning_rate": 9.888702646109423e-06, + "loss": 1.6979, + "step": 167 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.07966661835478112, + "learning_rate": 9.886331082353673e-06, + "loss": 1.6951, + "step": 168 + }, + { + "epoch": 0.22993197278911565, + "grad_norm": 0.08625904148958655, + "learning_rate": 9.883934807636645e-06, + "loss": 1.6239, + "step": 169 + }, + { + "epoch": 0.23129251700680273, + "grad_norm": 0.06615144618602906, + "learning_rate": 9.881513834076683e-06, + "loss": 1.7456, + "step": 170 + }, + { + "epoch": 0.23265306122448978, + "grad_norm": 0.14491038831608893, + "learning_rate": 9.87906817391704e-06, + "loss": 1.7035, + "step": 171 + }, + { + "epoch": 0.23401360544217686, + "grad_norm": 0.0832300629302243, + "learning_rate": 9.876597839525814e-06, + "loss": 1.6672, + "step": 172 + }, + { + "epoch": 0.23537414965986395, + "grad_norm": 0.0917489076009908, + "learning_rate": 9.87410284339588e-06, + "loss": 1.6075, + "step": 173 + }, + { + "epoch": 0.23673469387755103, + "grad_norm": 0.06866742872418008, + "learning_rate": 9.871583198144836e-06, + "loss": 1.7646, + "step": 174 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.08465371920438951, + "learning_rate": 9.869038916514932e-06, + "loss": 1.6692, + "step": 175 + }, + { + "epoch": 0.23945578231292516, + "grad_norm": 0.09375415555940526, + "learning_rate": 9.866470011373009e-06, + "loss": 1.778, + "step": 176 + }, + { + "epoch": 0.24081632653061225, + "grad_norm": 0.07220432655814331, + "learning_rate": 9.863876495710433e-06, + "loss": 1.6857, + "step": 177 + }, + { + "epoch": 0.24217687074829933, + "grad_norm": 0.0797192184377915, + "learning_rate": 9.86125838264303e-06, + "loss": 1.7893, + "step": 178 + }, + { + "epoch": 0.24353741496598638, + "grad_norm": 0.07597718520214916, + "learning_rate": 9.858615685411018e-06, + "loss": 1.8848, + "step": 179 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.08003681020814803, + "learning_rate": 9.85594841737894e-06, + "loss": 1.8, + "step": 180 + }, + { + "epoch": 0.24625850340136055, + "grad_norm": 0.09696384289585193, + "learning_rate": 9.853256592035602e-06, + "loss": 1.7965, + "step": 181 + }, + { + "epoch": 0.24761904761904763, + "grad_norm": 0.12333580747104468, + "learning_rate": 9.850540222993994e-06, + "loss": 1.6365, + "step": 182 + }, + { + "epoch": 0.24897959183673468, + "grad_norm": 0.07310272321033273, + "learning_rate": 9.847799323991234e-06, + "loss": 1.5765, + "step": 183 + }, + { + "epoch": 0.2503401360544218, + "grad_norm": 0.12923131777808997, + "learning_rate": 9.845033908888485e-06, + "loss": 1.8017, + "step": 184 + }, + { + "epoch": 0.2503401360544218, + "eval_loss": 1.7241544723510742, + "eval_runtime": 76.6185, + "eval_samples_per_second": 53.159, + "eval_steps_per_second": 6.656, + "step": 184 + }, + { + "epoch": 0.25170068027210885, + "grad_norm": 0.06809715986288661, + "learning_rate": 9.842243991670899e-06, + "loss": 1.79, + "step": 185 + }, + { + "epoch": 0.2530612244897959, + "grad_norm": 0.08842474286261379, + "learning_rate": 9.839429586447534e-06, + "loss": 1.6168, + "step": 186 + }, + { + "epoch": 0.254421768707483, + "grad_norm": 0.10935159810036835, + "learning_rate": 9.836590707451287e-06, + "loss": 1.8505, + "step": 187 + }, + { + "epoch": 0.25578231292517006, + "grad_norm": 0.12270237138661655, + "learning_rate": 9.833727369038827e-06, + "loss": 1.635, + "step": 188 + }, + { + "epoch": 0.2571428571428571, + "grad_norm": 0.10719376109260877, + "learning_rate": 9.830839585690519e-06, + "loss": 1.8374, + "step": 189 + }, + { + "epoch": 0.2585034013605442, + "grad_norm": 0.07990417660478581, + "learning_rate": 9.827927372010343e-06, + "loss": 1.5681, + "step": 190 + }, + { + "epoch": 0.2598639455782313, + "grad_norm": 0.09668140137221073, + "learning_rate": 9.824990742725835e-06, + "loss": 1.6568, + "step": 191 + }, + { + "epoch": 0.2612244897959184, + "grad_norm": 0.07968515377548961, + "learning_rate": 9.822029712687999e-06, + "loss": 1.6007, + "step": 192 + }, + { + "epoch": 0.26258503401360545, + "grad_norm": 0.08837508173810749, + "learning_rate": 9.81904429687124e-06, + "loss": 1.6621, + "step": 193 + }, + { + "epoch": 0.2639455782312925, + "grad_norm": 0.08728025707185973, + "learning_rate": 9.816034510373287e-06, + "loss": 1.8335, + "step": 194 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 0.07961309839255727, + "learning_rate": 9.81300036841511e-06, + "loss": 1.7037, + "step": 195 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.09980119114286523, + "learning_rate": 9.809941886340854e-06, + "loss": 1.5664, + "step": 196 + }, + { + "epoch": 0.2680272108843537, + "grad_norm": 0.07147953268098678, + "learning_rate": 9.806859079617757e-06, + "loss": 1.7601, + "step": 197 + }, + { + "epoch": 0.2693877551020408, + "grad_norm": 0.08653388534305975, + "learning_rate": 9.803751963836065e-06, + "loss": 1.54, + "step": 198 + }, + { + "epoch": 0.2707482993197279, + "grad_norm": 0.0776685121413518, + "learning_rate": 9.800620554708962e-06, + "loss": 1.5557, + "step": 199 + }, + { + "epoch": 0.272108843537415, + "grad_norm": 0.07711567735740478, + "learning_rate": 9.797464868072489e-06, + "loss": 1.7101, + "step": 200 + }, + { + "epoch": 0.27346938775510204, + "grad_norm": 0.09355853262847387, + "learning_rate": 9.794284919885456e-06, + "loss": 1.7454, + "step": 201 + }, + { + "epoch": 0.2748299319727891, + "grad_norm": 0.0975587232648776, + "learning_rate": 9.791080726229376e-06, + "loss": 1.7479, + "step": 202 + }, + { + "epoch": 0.2761904761904762, + "grad_norm": 0.07709180794261607, + "learning_rate": 9.78785230330837e-06, + "loss": 1.8086, + "step": 203 + }, + { + "epoch": 0.27755102040816326, + "grad_norm": 0.09748041740615765, + "learning_rate": 9.784599667449088e-06, + "loss": 1.683, + "step": 204 + }, + { + "epoch": 0.2789115646258503, + "grad_norm": 0.09608384188874226, + "learning_rate": 9.781322835100639e-06, + "loss": 1.7985, + "step": 205 + }, + { + "epoch": 0.2802721088435374, + "grad_norm": 0.24417626356607502, + "learning_rate": 9.778021822834484e-06, + "loss": 1.721, + "step": 206 + }, + { + "epoch": 0.2816326530612245, + "grad_norm": 0.08000490559142356, + "learning_rate": 9.774696647344376e-06, + "loss": 1.5646, + "step": 207 + }, + { + "epoch": 0.2829931972789116, + "grad_norm": 0.08448656773098678, + "learning_rate": 9.771347325446261e-06, + "loss": 1.7897, + "step": 208 + }, + { + "epoch": 0.28435374149659864, + "grad_norm": 0.09777185072102372, + "learning_rate": 9.767973874078196e-06, + "loss": 1.8829, + "step": 209 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.07490737419370372, + "learning_rate": 9.764576310300268e-06, + "loss": 1.8031, + "step": 210 + }, + { + "epoch": 0.2870748299319728, + "grad_norm": 0.07567900709987288, + "learning_rate": 9.761154651294505e-06, + "loss": 1.752, + "step": 211 + }, + { + "epoch": 0.28843537414965986, + "grad_norm": 0.08322325294858353, + "learning_rate": 9.757708914364784e-06, + "loss": 1.6328, + "step": 212 + }, + { + "epoch": 0.2897959183673469, + "grad_norm": 0.19949313188011264, + "learning_rate": 9.75423911693675e-06, + "loss": 1.8577, + "step": 213 + }, + { + "epoch": 0.291156462585034, + "grad_norm": 0.07497191387563905, + "learning_rate": 9.750745276557725e-06, + "loss": 1.4911, + "step": 214 + }, + { + "epoch": 0.2925170068027211, + "grad_norm": 0.09096003185892962, + "learning_rate": 9.747227410896624e-06, + "loss": 1.5857, + "step": 215 + }, + { + "epoch": 0.2938775510204082, + "grad_norm": 0.06790968657114778, + "learning_rate": 9.743685537743856e-06, + "loss": 1.6452, + "step": 216 + }, + { + "epoch": 0.29523809523809524, + "grad_norm": 0.08487437350333037, + "learning_rate": 9.740119675011246e-06, + "loss": 1.674, + "step": 217 + }, + { + "epoch": 0.2965986394557823, + "grad_norm": 0.07405858389177783, + "learning_rate": 9.73652984073193e-06, + "loss": 1.7461, + "step": 218 + }, + { + "epoch": 0.2979591836734694, + "grad_norm": 0.07067520018576251, + "learning_rate": 9.73291605306028e-06, + "loss": 1.7163, + "step": 219 + }, + { + "epoch": 0.29931972789115646, + "grad_norm": 0.0791420166635673, + "learning_rate": 9.7292783302718e-06, + "loss": 1.6668, + "step": 220 + }, + { + "epoch": 0.3006802721088435, + "grad_norm": 0.07323313348575836, + "learning_rate": 9.72561669076304e-06, + "loss": 1.6398, + "step": 221 + }, + { + "epoch": 0.3020408163265306, + "grad_norm": 0.07634813609350796, + "learning_rate": 9.721931153051497e-06, + "loss": 1.6447, + "step": 222 + }, + { + "epoch": 0.3034013605442177, + "grad_norm": 0.07950888254230533, + "learning_rate": 9.718221735775527e-06, + "loss": 1.7845, + "step": 223 + }, + { + "epoch": 0.3047619047619048, + "grad_norm": 0.1408011781580588, + "learning_rate": 9.714488457694252e-06, + "loss": 1.7427, + "step": 224 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 0.20352696620915875, + "learning_rate": 9.710731337687457e-06, + "loss": 1.7789, + "step": 225 + }, + { + "epoch": 0.3074829931972789, + "grad_norm": 0.3632349628769343, + "learning_rate": 9.7069503947555e-06, + "loss": 1.7108, + "step": 226 + }, + { + "epoch": 0.308843537414966, + "grad_norm": 0.08426345577870951, + "learning_rate": 9.70314564801922e-06, + "loss": 1.5991, + "step": 227 + }, + { + "epoch": 0.31020408163265306, + "grad_norm": 0.07487391878859809, + "learning_rate": 9.699317116719831e-06, + "loss": 1.6637, + "step": 228 + }, + { + "epoch": 0.3115646258503401, + "grad_norm": 0.11408922719163124, + "learning_rate": 9.695464820218829e-06, + "loss": 1.734, + "step": 229 + }, + { + "epoch": 0.3129251700680272, + "grad_norm": 0.08379918487601977, + "learning_rate": 9.6915887779979e-06, + "loss": 1.5813, + "step": 230 + }, + { + "epoch": 0.3142857142857143, + "grad_norm": 0.08052280305893883, + "learning_rate": 9.68768900965881e-06, + "loss": 1.7164, + "step": 231 + }, + { + "epoch": 0.31564625850340133, + "grad_norm": 0.09154693865406958, + "learning_rate": 9.683765534923315e-06, + "loss": 1.5906, + "step": 232 + }, + { + "epoch": 0.31700680272108844, + "grad_norm": 0.1156976576631727, + "learning_rate": 9.679818373633054e-06, + "loss": 1.5045, + "step": 233 + }, + { + "epoch": 0.3183673469387755, + "grad_norm": 0.08364003073959134, + "learning_rate": 9.67584754574946e-06, + "loss": 1.6152, + "step": 234 + }, + { + "epoch": 0.3197278911564626, + "grad_norm": 0.15089067639580192, + "learning_rate": 9.671853071353645e-06, + "loss": 1.7127, + "step": 235 + }, + { + "epoch": 0.32108843537414966, + "grad_norm": 0.08835383872875176, + "learning_rate": 9.667834970646309e-06, + "loss": 1.609, + "step": 236 + }, + { + "epoch": 0.3224489795918367, + "grad_norm": 0.08056944247122935, + "learning_rate": 9.663793263947631e-06, + "loss": 1.6126, + "step": 237 + }, + { + "epoch": 0.3238095238095238, + "grad_norm": 0.09989875040942321, + "learning_rate": 9.659727971697173e-06, + "loss": 1.8035, + "step": 238 + }, + { + "epoch": 0.3251700680272109, + "grad_norm": 0.07493814768096231, + "learning_rate": 9.655639114453771e-06, + "loss": 1.813, + "step": 239 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.0879835131172389, + "learning_rate": 9.651526712895431e-06, + "loss": 1.6926, + "step": 240 + }, + { + "epoch": 0.32789115646258504, + "grad_norm": 0.1315730713741491, + "learning_rate": 9.647390787819232e-06, + "loss": 1.6993, + "step": 241 + }, + { + "epoch": 0.3292517006802721, + "grad_norm": 0.08144482513159658, + "learning_rate": 9.643231360141205e-06, + "loss": 1.5821, + "step": 242 + }, + { + "epoch": 0.3306122448979592, + "grad_norm": 0.11213233561578728, + "learning_rate": 9.639048450896251e-06, + "loss": 1.6491, + "step": 243 + }, + { + "epoch": 0.33197278911564626, + "grad_norm": 0.10094097099195502, + "learning_rate": 9.63484208123801e-06, + "loss": 1.5492, + "step": 244 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.06549445299823263, + "learning_rate": 9.630612272438771e-06, + "loss": 1.6378, + "step": 245 + }, + { + "epoch": 0.3346938775510204, + "grad_norm": 0.09866738594204459, + "learning_rate": 9.626359045889356e-06, + "loss": 1.6712, + "step": 246 + }, + { + "epoch": 0.3360544217687075, + "grad_norm": 0.08348510716160104, + "learning_rate": 9.622082423099013e-06, + "loss": 1.6177, + "step": 247 + }, + { + "epoch": 0.33741496598639453, + "grad_norm": 0.0912979636888496, + "learning_rate": 9.617782425695314e-06, + "loss": 1.7233, + "step": 248 + }, + { + "epoch": 0.33877551020408164, + "grad_norm": 0.0730054447204009, + "learning_rate": 9.613459075424033e-06, + "loss": 1.8876, + "step": 249 + }, + { + "epoch": 0.3401360544217687, + "grad_norm": 0.07708564338244163, + "learning_rate": 9.609112394149052e-06, + "loss": 1.5562, + "step": 250 + }, + { + "epoch": 0.3414965986394558, + "grad_norm": 0.09288554194206548, + "learning_rate": 9.604742403852232e-06, + "loss": 1.7512, + "step": 251 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.09029592521774728, + "learning_rate": 9.600349126633317e-06, + "loss": 1.4964, + "step": 252 + }, + { + "epoch": 0.3442176870748299, + "grad_norm": 0.09268667220104135, + "learning_rate": 9.595932584709815e-06, + "loss": 1.5166, + "step": 253 + }, + { + "epoch": 0.345578231292517, + "grad_norm": 0.09895745636981777, + "learning_rate": 9.59149280041689e-06, + "loss": 1.5862, + "step": 254 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 0.0918028367563306, + "learning_rate": 9.587029796207246e-06, + "loss": 1.6704, + "step": 255 + }, + { + "epoch": 0.34829931972789113, + "grad_norm": 0.08184214411162406, + "learning_rate": 9.582543594651006e-06, + "loss": 1.5767, + "step": 256 + }, + { + "epoch": 0.34965986394557824, + "grad_norm": 0.10380891219881462, + "learning_rate": 9.578034218435618e-06, + "loss": 1.7974, + "step": 257 + }, + { + "epoch": 0.3510204081632653, + "grad_norm": 0.07763979596015386, + "learning_rate": 9.573501690365718e-06, + "loss": 1.6754, + "step": 258 + }, + { + "epoch": 0.3523809523809524, + "grad_norm": 0.0974676538104556, + "learning_rate": 9.568946033363032e-06, + "loss": 1.7312, + "step": 259 + }, + { + "epoch": 0.35374149659863946, + "grad_norm": 0.08571433404959614, + "learning_rate": 9.564367270466247e-06, + "loss": 1.5805, + "step": 260 + }, + { + "epoch": 0.3551020408163265, + "grad_norm": 0.0818876254257716, + "learning_rate": 9.559765424830903e-06, + "loss": 1.7883, + "step": 261 + }, + { + "epoch": 0.3564625850340136, + "grad_norm": 0.0803950448066755, + "learning_rate": 9.555140519729273e-06, + "loss": 1.7474, + "step": 262 + }, + { + "epoch": 0.3578231292517007, + "grad_norm": 0.0843722550488927, + "learning_rate": 9.550492578550246e-06, + "loss": 1.5564, + "step": 263 + }, + { + "epoch": 0.35918367346938773, + "grad_norm": 0.07803324012301414, + "learning_rate": 9.545821624799205e-06, + "loss": 1.4879, + "step": 264 + }, + { + "epoch": 0.36054421768707484, + "grad_norm": 0.08256837435665124, + "learning_rate": 9.541127682097916e-06, + "loss": 1.7395, + "step": 265 + }, + { + "epoch": 0.3619047619047619, + "grad_norm": 0.08789940175813588, + "learning_rate": 9.536410774184397e-06, + "loss": 1.6602, + "step": 266 + }, + { + "epoch": 0.363265306122449, + "grad_norm": 0.10680679462820718, + "learning_rate": 9.531670924912814e-06, + "loss": 1.4675, + "step": 267 + }, + { + "epoch": 0.36462585034013606, + "grad_norm": 0.07750163404143413, + "learning_rate": 9.526908158253345e-06, + "loss": 1.7119, + "step": 268 + }, + { + "epoch": 0.3659863945578231, + "grad_norm": 0.07081517755227287, + "learning_rate": 9.522122498292066e-06, + "loss": 1.6457, + "step": 269 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 0.10501830739522407, + "learning_rate": 9.517313969230826e-06, + "loss": 1.6398, + "step": 270 + }, + { + "epoch": 0.3687074829931973, + "grad_norm": 0.08143325631533527, + "learning_rate": 9.512482595387131e-06, + "loss": 1.6122, + "step": 271 + }, + { + "epoch": 0.37006802721088433, + "grad_norm": 0.08749817297460229, + "learning_rate": 9.507628401194015e-06, + "loss": 1.7328, + "step": 272 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 0.10516442293102286, + "learning_rate": 9.50275141119992e-06, + "loss": 1.5773, + "step": 273 + }, + { + "epoch": 0.3727891156462585, + "grad_norm": 0.07808789214492048, + "learning_rate": 9.497851650068561e-06, + "loss": 1.6635, + "step": 274 + }, + { + "epoch": 0.3741496598639456, + "grad_norm": 0.17789525917278, + "learning_rate": 9.492929142578823e-06, + "loss": 1.9121, + "step": 275 + }, + { + "epoch": 0.37551020408163266, + "grad_norm": 0.11269638220817121, + "learning_rate": 9.487983913624616e-06, + "loss": 1.7355, + "step": 276 + }, + { + "epoch": 0.3768707482993197, + "grad_norm": 0.08409991854502387, + "learning_rate": 9.483015988214757e-06, + "loss": 1.7628, + "step": 277 + }, + { + "epoch": 0.3782312925170068, + "grad_norm": 0.07365655913118094, + "learning_rate": 9.478025391472841e-06, + "loss": 1.8144, + "step": 278 + }, + { + "epoch": 0.3795918367346939, + "grad_norm": 0.12778690022382183, + "learning_rate": 9.473012148637121e-06, + "loss": 1.4851, + "step": 279 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.0809734685514322, + "learning_rate": 9.467976285060369e-06, + "loss": 1.7698, + "step": 280 + }, + { + "epoch": 0.38231292517006804, + "grad_norm": 0.08513218534341072, + "learning_rate": 9.462917826209757e-06, + "loss": 1.6411, + "step": 281 + }, + { + "epoch": 0.3836734693877551, + "grad_norm": 0.09008342709764051, + "learning_rate": 9.457836797666722e-06, + "loss": 1.694, + "step": 282 + }, + { + "epoch": 0.38503401360544215, + "grad_norm": 0.10502121760541401, + "learning_rate": 9.452733225126845e-06, + "loss": 1.6999, + "step": 283 + }, + { + "epoch": 0.38639455782312926, + "grad_norm": 0.07343666145039363, + "learning_rate": 9.44760713439971e-06, + "loss": 1.8164, + "step": 284 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 0.08974960620233877, + "learning_rate": 9.442458551408784e-06, + "loss": 1.8539, + "step": 285 + }, + { + "epoch": 0.3891156462585034, + "grad_norm": 0.07202421778040075, + "learning_rate": 9.437287502191275e-06, + "loss": 1.5453, + "step": 286 + }, + { + "epoch": 0.3904761904761905, + "grad_norm": 0.09076340883522513, + "learning_rate": 9.43209401289801e-06, + "loss": 1.7088, + "step": 287 + }, + { + "epoch": 0.39183673469387753, + "grad_norm": 0.08425026537963505, + "learning_rate": 9.426878109793301e-06, + "loss": 1.4451, + "step": 288 + }, + { + "epoch": 0.39319727891156464, + "grad_norm": 0.09510405982528822, + "learning_rate": 9.421639819254806e-06, + "loss": 1.7913, + "step": 289 + }, + { + "epoch": 0.3945578231292517, + "grad_norm": 0.09616833843974483, + "learning_rate": 9.416379167773403e-06, + "loss": 1.649, + "step": 290 + }, + { + "epoch": 0.39591836734693875, + "grad_norm": 0.08286660774561835, + "learning_rate": 9.41109618195305e-06, + "loss": 1.9277, + "step": 291 + }, + { + "epoch": 0.39727891156462586, + "grad_norm": 0.07769697723139755, + "learning_rate": 9.405790888510655e-06, + "loss": 1.7279, + "step": 292 + }, + { + "epoch": 0.3986394557823129, + "grad_norm": 0.09006259560100753, + "learning_rate": 9.400463314275942e-06, + "loss": 1.6039, + "step": 293 + }, + { + "epoch": 0.4, + "grad_norm": 0.09012228611455256, + "learning_rate": 9.39511348619131e-06, + "loss": 1.7865, + "step": 294 + }, + { + "epoch": 0.4013605442176871, + "grad_norm": 0.09086893554478499, + "learning_rate": 9.389741431311694e-06, + "loss": 1.6225, + "step": 295 + }, + { + "epoch": 0.40272108843537413, + "grad_norm": 0.1067223732758171, + "learning_rate": 9.384347176804441e-06, + "loss": 1.8657, + "step": 296 + }, + { + "epoch": 0.40408163265306124, + "grad_norm": 0.09585514004003831, + "learning_rate": 9.378930749949166e-06, + "loss": 1.6826, + "step": 297 + }, + { + "epoch": 0.4054421768707483, + "grad_norm": 0.08230460323355868, + "learning_rate": 9.373492178137606e-06, + "loss": 1.8107, + "step": 298 + }, + { + "epoch": 0.40680272108843535, + "grad_norm": 0.08574613920900533, + "learning_rate": 9.368031488873492e-06, + "loss": 1.5687, + "step": 299 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.07911826770354155, + "learning_rate": 9.36254870977241e-06, + "loss": 1.8716, + "step": 300 + }, + { + "epoch": 0.4095238095238095, + "grad_norm": 0.08540782320726534, + "learning_rate": 9.357043868561653e-06, + "loss": 1.7997, + "step": 301 + }, + { + "epoch": 0.4108843537414966, + "grad_norm": 0.08850925770828139, + "learning_rate": 9.351516993080088e-06, + "loss": 1.6299, + "step": 302 + }, + { + "epoch": 0.4122448979591837, + "grad_norm": 0.10611011164886108, + "learning_rate": 9.34596811127801e-06, + "loss": 1.5621, + "step": 303 + }, + { + "epoch": 0.41360544217687073, + "grad_norm": 0.08382183266579103, + "learning_rate": 9.340397251217009e-06, + "loss": 1.4407, + "step": 304 + }, + { + "epoch": 0.41496598639455784, + "grad_norm": 0.09119095993947843, + "learning_rate": 9.334804441069819e-06, + "loss": 1.7161, + "step": 305 + }, + { + "epoch": 0.4163265306122449, + "grad_norm": 0.11113995697266435, + "learning_rate": 9.329189709120175e-06, + "loss": 1.4126, + "step": 306 + }, + { + "epoch": 0.41768707482993195, + "grad_norm": 0.08080147586730375, + "learning_rate": 9.323553083762681e-06, + "loss": 1.7303, + "step": 307 + }, + { + "epoch": 0.41904761904761906, + "grad_norm": 0.10411476203370171, + "learning_rate": 9.31789459350266e-06, + "loss": 1.6408, + "step": 308 + }, + { + "epoch": 0.4204081632653061, + "grad_norm": 0.0859822277343485, + "learning_rate": 9.312214266956003e-06, + "loss": 1.6534, + "step": 309 + }, + { + "epoch": 0.4217687074829932, + "grad_norm": 0.08604098070322257, + "learning_rate": 9.306512132849035e-06, + "loss": 1.6252, + "step": 310 + }, + { + "epoch": 0.4231292517006803, + "grad_norm": 0.0876425921649466, + "learning_rate": 9.300788220018363e-06, + "loss": 1.5096, + "step": 311 + }, + { + "epoch": 0.42448979591836733, + "grad_norm": 0.08371274563563173, + "learning_rate": 9.295042557410736e-06, + "loss": 1.7352, + "step": 312 + }, + { + "epoch": 0.42585034013605444, + "grad_norm": 0.13500827487489858, + "learning_rate": 9.28927517408289e-06, + "loss": 1.7902, + "step": 313 + }, + { + "epoch": 0.4272108843537415, + "grad_norm": 0.08754620765852711, + "learning_rate": 9.28348609920141e-06, + "loss": 1.6862, + "step": 314 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.1028946128162606, + "learning_rate": 9.27767536204258e-06, + "loss": 1.6907, + "step": 315 + }, + { + "epoch": 0.42993197278911566, + "grad_norm": 0.08692259223714764, + "learning_rate": 9.271842991992231e-06, + "loss": 1.638, + "step": 316 + }, + { + "epoch": 0.4312925170068027, + "grad_norm": 0.09542502295403, + "learning_rate": 9.26598901854559e-06, + "loss": 1.7157, + "step": 317 + }, + { + "epoch": 0.4326530612244898, + "grad_norm": 0.09080935991935338, + "learning_rate": 9.260113471307148e-06, + "loss": 1.5851, + "step": 318 + }, + { + "epoch": 0.4340136054421769, + "grad_norm": 0.09865728458566181, + "learning_rate": 9.254216379990487e-06, + "loss": 1.8897, + "step": 319 + }, + { + "epoch": 0.43537414965986393, + "grad_norm": 0.10060290482375228, + "learning_rate": 9.248297774418147e-06, + "loss": 1.5605, + "step": 320 + }, + { + "epoch": 0.43673469387755104, + "grad_norm": 0.09013488977054508, + "learning_rate": 9.242357684521467e-06, + "loss": 1.5582, + "step": 321 + }, + { + "epoch": 0.4380952380952381, + "grad_norm": 0.12978917401977683, + "learning_rate": 9.236396140340435e-06, + "loss": 1.5953, + "step": 322 + }, + { + "epoch": 0.43945578231292515, + "grad_norm": 0.09012186897343348, + "learning_rate": 9.230413172023538e-06, + "loss": 1.6678, + "step": 323 + }, + { + "epoch": 0.44081632653061226, + "grad_norm": 0.07610784588472097, + "learning_rate": 9.224408809827609e-06, + "loss": 1.6697, + "step": 324 + }, + { + "epoch": 0.4421768707482993, + "grad_norm": 0.08134414493064245, + "learning_rate": 9.218383084117671e-06, + "loss": 1.6543, + "step": 325 + }, + { + "epoch": 0.4435374149659864, + "grad_norm": 0.07969199857220131, + "learning_rate": 9.212336025366789e-06, + "loss": 1.7372, + "step": 326 + }, + { + "epoch": 0.4448979591836735, + "grad_norm": 0.08833397568245774, + "learning_rate": 9.206267664155906e-06, + "loss": 1.5033, + "step": 327 + }, + { + "epoch": 0.44625850340136053, + "grad_norm": 0.09013321784578471, + "learning_rate": 9.200178031173706e-06, + "loss": 1.7467, + "step": 328 + }, + { + "epoch": 0.44761904761904764, + "grad_norm": 0.08492129873211993, + "learning_rate": 9.194067157216436e-06, + "loss": 1.6346, + "step": 329 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.08924441822794496, + "learning_rate": 9.187935073187768e-06, + "loss": 1.5647, + "step": 330 + }, + { + "epoch": 0.45034013605442175, + "grad_norm": 0.2238269401441882, + "learning_rate": 9.181781810098638e-06, + "loss": 1.9641, + "step": 331 + }, + { + "epoch": 0.45170068027210886, + "grad_norm": 0.08505567098719835, + "learning_rate": 9.175607399067086e-06, + "loss": 1.723, + "step": 332 + }, + { + "epoch": 0.4530612244897959, + "grad_norm": 0.09479499636885363, + "learning_rate": 9.1694118713181e-06, + "loss": 1.7358, + "step": 333 + }, + { + "epoch": 0.454421768707483, + "grad_norm": 0.09532298638421347, + "learning_rate": 9.163195258183457e-06, + "loss": 1.652, + "step": 334 + }, + { + "epoch": 0.4557823129251701, + "grad_norm": 0.1435078453546247, + "learning_rate": 9.156957591101573e-06, + "loss": 1.8876, + "step": 335 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.08623460392050768, + "learning_rate": 9.150698901617326e-06, + "loss": 1.6408, + "step": 336 + }, + { + "epoch": 0.45850340136054424, + "grad_norm": 0.08103596406817255, + "learning_rate": 9.144419221381919e-06, + "loss": 1.582, + "step": 337 + }, + { + "epoch": 0.4598639455782313, + "grad_norm": 0.09624169989634061, + "learning_rate": 9.138118582152704e-06, + "loss": 1.7272, + "step": 338 + }, + { + "epoch": 0.46122448979591835, + "grad_norm": 0.06914577882609961, + "learning_rate": 9.131797015793026e-06, + "loss": 1.6864, + "step": 339 + }, + { + "epoch": 0.46258503401360546, + "grad_norm": 0.09682023357095138, + "learning_rate": 9.125454554272057e-06, + "loss": 1.5849, + "step": 340 + }, + { + "epoch": 0.4639455782312925, + "grad_norm": 0.08865054891775723, + "learning_rate": 9.119091229664648e-06, + "loss": 1.4716, + "step": 341 + }, + { + "epoch": 0.46530612244897956, + "grad_norm": 0.09112991799939003, + "learning_rate": 9.112707074151152e-06, + "loss": 1.6393, + "step": 342 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.09319883560181061, + "learning_rate": 9.106302120017272e-06, + "loss": 1.7619, + "step": 343 + }, + { + "epoch": 0.46802721088435373, + "grad_norm": 0.101061665585339, + "learning_rate": 9.099876399653885e-06, + "loss": 1.6286, + "step": 344 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 0.09755047037445551, + "learning_rate": 9.093429945556895e-06, + "loss": 1.6591, + "step": 345 + }, + { + "epoch": 0.4707482993197279, + "grad_norm": 0.0831755062746902, + "learning_rate": 9.086962790327057e-06, + "loss": 1.7728, + "step": 346 + }, + { + "epoch": 0.47210884353741495, + "grad_norm": 0.0981445280966388, + "learning_rate": 9.08047496666981e-06, + "loss": 1.6489, + "step": 347 + }, + { + "epoch": 0.47346938775510206, + "grad_norm": 0.0965447984049988, + "learning_rate": 9.073966507395123e-06, + "loss": 1.7807, + "step": 348 + }, + { + "epoch": 0.4748299319727891, + "grad_norm": 0.08805192634428297, + "learning_rate": 9.06743744541732e-06, + "loss": 1.6932, + "step": 349 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.07486129688956443, + "learning_rate": 9.060887813754914e-06, + "loss": 1.6828, + "step": 350 + }, + { + "epoch": 0.4775510204081633, + "grad_norm": 0.08844318223023526, + "learning_rate": 9.054317645530449e-06, + "loss": 1.5791, + "step": 351 + }, + { + "epoch": 0.47891156462585033, + "grad_norm": 0.08590901507611927, + "learning_rate": 9.047726973970317e-06, + "loss": 1.8916, + "step": 352 + }, + { + "epoch": 0.48027210884353744, + "grad_norm": 0.08275711528371654, + "learning_rate": 9.041115832404605e-06, + "loss": 1.6376, + "step": 353 + }, + { + "epoch": 0.4816326530612245, + "grad_norm": 0.13882142863507996, + "learning_rate": 9.03448425426692e-06, + "loss": 1.5496, + "step": 354 + }, + { + "epoch": 0.48299319727891155, + "grad_norm": 0.09033930791413211, + "learning_rate": 9.027832273094213e-06, + "loss": 1.8207, + "step": 355 + }, + { + "epoch": 0.48435374149659866, + "grad_norm": 0.08776380923196873, + "learning_rate": 9.021159922526623e-06, + "loss": 1.734, + "step": 356 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 0.0871797042611384, + "learning_rate": 9.014467236307303e-06, + "loss": 1.7255, + "step": 357 + }, + { + "epoch": 0.48707482993197276, + "grad_norm": 0.0832915611192237, + "learning_rate": 9.007754248282236e-06, + "loss": 1.6354, + "step": 358 + }, + { + "epoch": 0.4884353741496599, + "grad_norm": 0.11961492483949776, + "learning_rate": 9.001020992400086e-06, + "loss": 1.4193, + "step": 359 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.0794404046382169, + "learning_rate": 8.994267502712007e-06, + "loss": 1.728, + "step": 360 + }, + { + "epoch": 0.49115646258503404, + "grad_norm": 0.13335980644840648, + "learning_rate": 8.987493813371481e-06, + "loss": 1.6729, + "step": 361 + }, + { + "epoch": 0.4925170068027211, + "grad_norm": 0.08449457187355984, + "learning_rate": 8.980699958634147e-06, + "loss": 1.6142, + "step": 362 + }, + { + "epoch": 0.49387755102040815, + "grad_norm": 0.09270436024065673, + "learning_rate": 8.973885972857616e-06, + "loss": 1.6753, + "step": 363 + }, + { + "epoch": 0.49523809523809526, + "grad_norm": 0.08033630058062084, + "learning_rate": 8.96705189050131e-06, + "loss": 1.6269, + "step": 364 + }, + { + "epoch": 0.4965986394557823, + "grad_norm": 0.4381615422407444, + "learning_rate": 8.96019774612628e-06, + "loss": 1.6131, + "step": 365 + }, + { + "epoch": 0.49795918367346936, + "grad_norm": 0.09462169653835782, + "learning_rate": 8.953323574395038e-06, + "loss": 1.5629, + "step": 366 + }, + { + "epoch": 0.4993197278911565, + "grad_norm": 0.13595506352030012, + "learning_rate": 8.946429410071373e-06, + "loss": 1.5593, + "step": 367 + }, + { + "epoch": 0.5006802721088436, + "grad_norm": 0.10253342646979322, + "learning_rate": 8.939515288020182e-06, + "loss": 1.6281, + "step": 368 + }, + { + "epoch": 0.5006802721088436, + "eval_loss": 1.7046507596969604, + "eval_runtime": 76.5686, + "eval_samples_per_second": 53.194, + "eval_steps_per_second": 6.661, + "step": 368 + }, + { + "epoch": 0.5020408163265306, + "grad_norm": 0.08764026597495381, + "learning_rate": 8.932581243207289e-06, + "loss": 1.5909, + "step": 369 + }, + { + "epoch": 0.5034013605442177, + "grad_norm": 0.07898445718193145, + "learning_rate": 8.925627310699275e-06, + "loss": 1.761, + "step": 370 + }, + { + "epoch": 0.5047619047619047, + "grad_norm": 0.10577594540101636, + "learning_rate": 8.918653525663295e-06, + "loss": 1.695, + "step": 371 + }, + { + "epoch": 0.5061224489795918, + "grad_norm": 0.110620128076388, + "learning_rate": 8.911659923366897e-06, + "loss": 1.7043, + "step": 372 + }, + { + "epoch": 0.507482993197279, + "grad_norm": 0.08992699882848652, + "learning_rate": 8.904646539177852e-06, + "loss": 1.674, + "step": 373 + }, + { + "epoch": 0.508843537414966, + "grad_norm": 0.0971954506200858, + "learning_rate": 8.897613408563972e-06, + "loss": 1.6565, + "step": 374 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.14199688300843188, + "learning_rate": 8.89056056709293e-06, + "loss": 1.4402, + "step": 375 + }, + { + "epoch": 0.5115646258503401, + "grad_norm": 0.1034458287611268, + "learning_rate": 8.883488050432073e-06, + "loss": 1.6606, + "step": 376 + }, + { + "epoch": 0.5129251700680272, + "grad_norm": 0.09270052668783225, + "learning_rate": 8.87639589434826e-06, + "loss": 1.7067, + "step": 377 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 0.08063860406008341, + "learning_rate": 8.869284134707659e-06, + "loss": 1.7683, + "step": 378 + }, + { + "epoch": 0.5156462585034014, + "grad_norm": 0.08700140267251555, + "learning_rate": 8.862152807475584e-06, + "loss": 1.7135, + "step": 379 + }, + { + "epoch": 0.5170068027210885, + "grad_norm": 0.09374652236807894, + "learning_rate": 8.8550019487163e-06, + "loss": 1.5927, + "step": 380 + }, + { + "epoch": 0.5183673469387755, + "grad_norm": 0.09628602949564133, + "learning_rate": 8.847831594592851e-06, + "loss": 1.6169, + "step": 381 + }, + { + "epoch": 0.5197278911564626, + "grad_norm": 0.09133344115171436, + "learning_rate": 8.840641781366867e-06, + "loss": 1.6077, + "step": 382 + }, + { + "epoch": 0.5210884353741496, + "grad_norm": 0.0898264889709745, + "learning_rate": 8.83343254539839e-06, + "loss": 1.7476, + "step": 383 + }, + { + "epoch": 0.5224489795918368, + "grad_norm": 0.09463509154870942, + "learning_rate": 8.826203923145687e-06, + "loss": 1.6178, + "step": 384 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.086933970419064, + "learning_rate": 8.818955951165059e-06, + "loss": 1.6544, + "step": 385 + }, + { + "epoch": 0.5251700680272109, + "grad_norm": 0.08686966519278672, + "learning_rate": 8.811688666110663e-06, + "loss": 1.7268, + "step": 386 + }, + { + "epoch": 0.5265306122448979, + "grad_norm": 0.10219001251324068, + "learning_rate": 8.80440210473433e-06, + "loss": 1.6538, + "step": 387 + }, + { + "epoch": 0.527891156462585, + "grad_norm": 0.08199431219832622, + "learning_rate": 8.797096303885374e-06, + "loss": 1.6524, + "step": 388 + }, + { + "epoch": 0.5292517006802722, + "grad_norm": 0.09053864014656055, + "learning_rate": 8.789771300510397e-06, + "loss": 1.5971, + "step": 389 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 0.08221661934582344, + "learning_rate": 8.782427131653121e-06, + "loss": 1.6643, + "step": 390 + }, + { + "epoch": 0.5319727891156463, + "grad_norm": 0.08888879198967926, + "learning_rate": 8.77506383445419e-06, + "loss": 1.7855, + "step": 391 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.08638631079426494, + "learning_rate": 8.767681446150977e-06, + "loss": 1.8028, + "step": 392 + }, + { + "epoch": 0.5346938775510204, + "grad_norm": 0.07669664542559732, + "learning_rate": 8.76028000407741e-06, + "loss": 1.5725, + "step": 393 + }, + { + "epoch": 0.5360544217687074, + "grad_norm": 0.098633841165777, + "learning_rate": 8.752859545663766e-06, + "loss": 1.6692, + "step": 394 + }, + { + "epoch": 0.5374149659863946, + "grad_norm": 0.09002401371176637, + "learning_rate": 8.745420108436498e-06, + "loss": 1.7636, + "step": 395 + }, + { + "epoch": 0.5387755102040817, + "grad_norm": 0.0952379865665774, + "learning_rate": 8.737961730018034e-06, + "loss": 1.5664, + "step": 396 + }, + { + "epoch": 0.5401360544217687, + "grad_norm": 0.09623958141543322, + "learning_rate": 8.730484448126594e-06, + "loss": 1.5345, + "step": 397 + }, + { + "epoch": 0.5414965986394558, + "grad_norm": 0.1502927654981511, + "learning_rate": 8.722988300575992e-06, + "loss": 1.7841, + "step": 398 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 0.09222431588896472, + "learning_rate": 8.71547332527545e-06, + "loss": 1.9559, + "step": 399 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 0.0937414921537921, + "learning_rate": 8.707939560229406e-06, + "loss": 1.7022, + "step": 400 + }, + { + "epoch": 0.545578231292517, + "grad_norm": 0.11025083403359445, + "learning_rate": 8.700387043537319e-06, + "loss": 1.4365, + "step": 401 + }, + { + "epoch": 0.5469387755102041, + "grad_norm": 0.08441282686563067, + "learning_rate": 8.692815813393483e-06, + "loss": 1.6488, + "step": 402 + }, + { + "epoch": 0.5482993197278911, + "grad_norm": 0.3399602286969607, + "learning_rate": 8.68522590808682e-06, + "loss": 1.6829, + "step": 403 + }, + { + "epoch": 0.5496598639455782, + "grad_norm": 0.0924431892667502, + "learning_rate": 8.677617366000705e-06, + "loss": 1.7404, + "step": 404 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 0.09778253267340173, + "learning_rate": 8.669990225612754e-06, + "loss": 1.7674, + "step": 405 + }, + { + "epoch": 0.5523809523809524, + "grad_norm": 0.09471837896307483, + "learning_rate": 8.662344525494643e-06, + "loss": 1.6406, + "step": 406 + }, + { + "epoch": 0.5537414965986395, + "grad_norm": 0.08997298756417596, + "learning_rate": 8.654680304311908e-06, + "loss": 1.7875, + "step": 407 + }, + { + "epoch": 0.5551020408163265, + "grad_norm": 0.08849454584462031, + "learning_rate": 8.646997600823743e-06, + "loss": 1.5942, + "step": 408 + }, + { + "epoch": 0.5564625850340136, + "grad_norm": 0.10974758415682516, + "learning_rate": 8.639296453882816e-06, + "loss": 1.5229, + "step": 409 + }, + { + "epoch": 0.5578231292517006, + "grad_norm": 0.09669599431293863, + "learning_rate": 8.631576902435063e-06, + "loss": 1.7031, + "step": 410 + }, + { + "epoch": 0.5591836734693878, + "grad_norm": 0.10337335565704198, + "learning_rate": 8.623838985519498e-06, + "loss": 1.6138, + "step": 411 + }, + { + "epoch": 0.5605442176870749, + "grad_norm": 0.08963980024729686, + "learning_rate": 8.616082742268005e-06, + "loss": 1.6527, + "step": 412 + }, + { + "epoch": 0.5619047619047619, + "grad_norm": 0.13730489095006465, + "learning_rate": 8.608308211905159e-06, + "loss": 1.5823, + "step": 413 + }, + { + "epoch": 0.563265306122449, + "grad_norm": 0.08745105418294691, + "learning_rate": 8.600515433748003e-06, + "loss": 1.6647, + "step": 414 + }, + { + "epoch": 0.564625850340136, + "grad_norm": 0.19321095274209943, + "learning_rate": 8.592704447205872e-06, + "loss": 1.5218, + "step": 415 + }, + { + "epoch": 0.5659863945578232, + "grad_norm": 0.10518406439497321, + "learning_rate": 8.584875291780178e-06, + "loss": 1.5199, + "step": 416 + }, + { + "epoch": 0.5673469387755102, + "grad_norm": 0.09495102640071197, + "learning_rate": 8.577028007064218e-06, + "loss": 1.6623, + "step": 417 + }, + { + "epoch": 0.5687074829931973, + "grad_norm": 0.13232264409241218, + "learning_rate": 8.569162632742973e-06, + "loss": 1.606, + "step": 418 + }, + { + "epoch": 0.5700680272108843, + "grad_norm": 0.09382708523657708, + "learning_rate": 8.561279208592902e-06, + "loss": 1.6563, + "step": 419 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.10154097299186025, + "learning_rate": 8.553377774481748e-06, + "loss": 1.5177, + "step": 420 + }, + { + "epoch": 0.5727891156462585, + "grad_norm": 0.09008372077918206, + "learning_rate": 8.545458370368336e-06, + "loss": 1.5358, + "step": 421 + }, + { + "epoch": 0.5741496598639456, + "grad_norm": 0.11141318964471479, + "learning_rate": 8.53752103630236e-06, + "loss": 1.5975, + "step": 422 + }, + { + "epoch": 0.5755102040816327, + "grad_norm": 0.11594947645844504, + "learning_rate": 8.529565812424195e-06, + "loss": 1.5417, + "step": 423 + }, + { + "epoch": 0.5768707482993197, + "grad_norm": 0.1115243557715104, + "learning_rate": 8.521592738964689e-06, + "loss": 1.6912, + "step": 424 + }, + { + "epoch": 0.5782312925170068, + "grad_norm": 0.10294673717555994, + "learning_rate": 8.513601856244951e-06, + "loss": 1.6883, + "step": 425 + }, + { + "epoch": 0.5795918367346938, + "grad_norm": 0.12234481799540493, + "learning_rate": 8.505593204676162e-06, + "loss": 1.6903, + "step": 426 + }, + { + "epoch": 0.580952380952381, + "grad_norm": 0.1164390253206002, + "learning_rate": 8.497566824759359e-06, + "loss": 1.6433, + "step": 427 + }, + { + "epoch": 0.582312925170068, + "grad_norm": 0.10041564805248225, + "learning_rate": 8.489522757085234e-06, + "loss": 1.5482, + "step": 428 + }, + { + "epoch": 0.5836734693877551, + "grad_norm": 0.10770970938141446, + "learning_rate": 8.481461042333929e-06, + "loss": 1.6092, + "step": 429 + }, + { + "epoch": 0.5850340136054422, + "grad_norm": 0.08821548122605594, + "learning_rate": 8.473381721274832e-06, + "loss": 1.5793, + "step": 430 + }, + { + "epoch": 0.5863945578231292, + "grad_norm": 0.089232378501521, + "learning_rate": 8.465284834766365e-06, + "loss": 1.6233, + "step": 431 + }, + { + "epoch": 0.5877551020408164, + "grad_norm": 0.10250742081774146, + "learning_rate": 8.457170423755786e-06, + "loss": 1.5625, + "step": 432 + }, + { + "epoch": 0.5891156462585034, + "grad_norm": 0.1202037568410257, + "learning_rate": 8.449038529278976e-06, + "loss": 1.5843, + "step": 433 + }, + { + "epoch": 0.5904761904761905, + "grad_norm": 0.1323622148062186, + "learning_rate": 8.440889192460232e-06, + "loss": 1.808, + "step": 434 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 0.12495144537173114, + "learning_rate": 8.432722454512057e-06, + "loss": 1.9389, + "step": 435 + }, + { + "epoch": 0.5931972789115646, + "grad_norm": 0.09699121967438322, + "learning_rate": 8.424538356734957e-06, + "loss": 1.7367, + "step": 436 + }, + { + "epoch": 0.5945578231292517, + "grad_norm": 0.09801443283628167, + "learning_rate": 8.416336940517229e-06, + "loss": 1.6276, + "step": 437 + }, + { + "epoch": 0.5959183673469388, + "grad_norm": 0.10487807929341668, + "learning_rate": 8.408118247334755e-06, + "loss": 1.5578, + "step": 438 + }, + { + "epoch": 0.5972789115646259, + "grad_norm": 0.09364043239294455, + "learning_rate": 8.399882318750785e-06, + "loss": 1.5889, + "step": 439 + }, + { + "epoch": 0.5986394557823129, + "grad_norm": 0.10020651855110954, + "learning_rate": 8.391629196415733e-06, + "loss": 1.607, + "step": 440 + }, + { + "epoch": 0.6, + "grad_norm": 0.0899206896336364, + "learning_rate": 8.383358922066965e-06, + "loss": 1.5508, + "step": 441 + }, + { + "epoch": 0.601360544217687, + "grad_norm": 0.09009526459055692, + "learning_rate": 8.375071537528587e-06, + "loss": 1.6629, + "step": 442 + }, + { + "epoch": 0.6027210884353742, + "grad_norm": 0.09413295136140958, + "learning_rate": 8.366767084711232e-06, + "loss": 1.6568, + "step": 443 + }, + { + "epoch": 0.6040816326530613, + "grad_norm": 0.22548893033360673, + "learning_rate": 8.358445605611856e-06, + "loss": 1.7594, + "step": 444 + }, + { + "epoch": 0.6054421768707483, + "grad_norm": 0.12757276275079013, + "learning_rate": 8.350107142313513e-06, + "loss": 1.4311, + "step": 445 + }, + { + "epoch": 0.6068027210884354, + "grad_norm": 0.09965051028904835, + "learning_rate": 8.34175173698515e-06, + "loss": 1.6618, + "step": 446 + }, + { + "epoch": 0.6081632653061224, + "grad_norm": 0.09666753861412604, + "learning_rate": 8.333379431881398e-06, + "loss": 1.6729, + "step": 447 + }, + { + "epoch": 0.6095238095238096, + "grad_norm": 0.09235423529974252, + "learning_rate": 8.324990269342345e-06, + "loss": 1.7872, + "step": 448 + }, + { + "epoch": 0.6108843537414966, + "grad_norm": 0.0969813250335363, + "learning_rate": 8.316584291793337e-06, + "loss": 1.5299, + "step": 449 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.08468781546097646, + "learning_rate": 8.30816154174475e-06, + "loss": 1.7958, + "step": 450 + }, + { + "epoch": 0.6136054421768707, + "grad_norm": 0.08812390382328499, + "learning_rate": 8.299722061791788e-06, + "loss": 1.7292, + "step": 451 + }, + { + "epoch": 0.6149659863945578, + "grad_norm": 0.08809745700239427, + "learning_rate": 8.291265894614253e-06, + "loss": 1.758, + "step": 452 + }, + { + "epoch": 0.6163265306122448, + "grad_norm": 0.09490957390276747, + "learning_rate": 8.282793082976343e-06, + "loss": 1.5475, + "step": 453 + }, + { + "epoch": 0.617687074829932, + "grad_norm": 0.11060404799058053, + "learning_rate": 8.274303669726427e-06, + "loss": 1.6792, + "step": 454 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.09540512150005957, + "learning_rate": 8.265797697796831e-06, + "loss": 1.6685, + "step": 455 + }, + { + "epoch": 0.6204081632653061, + "grad_norm": 0.12873788369707956, + "learning_rate": 8.257275210203621e-06, + "loss": 1.5043, + "step": 456 + }, + { + "epoch": 0.6217687074829932, + "grad_norm": 0.6499840705042578, + "learning_rate": 8.248736250046389e-06, + "loss": 1.7548, + "step": 457 + }, + { + "epoch": 0.6231292517006802, + "grad_norm": 0.08484061508587025, + "learning_rate": 8.240180860508027e-06, + "loss": 1.8159, + "step": 458 + }, + { + "epoch": 0.6244897959183674, + "grad_norm": 0.09906794908825851, + "learning_rate": 8.231609084854513e-06, + "loss": 1.7116, + "step": 459 + }, + { + "epoch": 0.6258503401360545, + "grad_norm": 0.0939998149984381, + "learning_rate": 8.223020966434695e-06, + "loss": 1.7448, + "step": 460 + }, + { + "epoch": 0.6272108843537415, + "grad_norm": 0.09983102806108725, + "learning_rate": 8.214416548680065e-06, + "loss": 1.7284, + "step": 461 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.1276929064149538, + "learning_rate": 8.205795875104549e-06, + "loss": 1.5541, + "step": 462 + }, + { + "epoch": 0.6299319727891156, + "grad_norm": 0.0971007724405045, + "learning_rate": 8.197158989304277e-06, + "loss": 1.749, + "step": 463 + }, + { + "epoch": 0.6312925170068027, + "grad_norm": 0.09931797410533018, + "learning_rate": 8.188505934957368e-06, + "loss": 1.7908, + "step": 464 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 0.10420469743883767, + "learning_rate": 8.179836755823707e-06, + "loss": 1.7156, + "step": 465 + }, + { + "epoch": 0.6340136054421769, + "grad_norm": 0.09471973183090822, + "learning_rate": 8.171151495744726e-06, + "loss": 1.6598, + "step": 466 + }, + { + "epoch": 0.6353741496598639, + "grad_norm": 0.08008463828970452, + "learning_rate": 8.162450198643184e-06, + "loss": 1.8476, + "step": 467 + }, + { + "epoch": 0.636734693877551, + "grad_norm": 0.09336869405905367, + "learning_rate": 8.153732908522933e-06, + "loss": 1.677, + "step": 468 + }, + { + "epoch": 0.638095238095238, + "grad_norm": 0.10567938348565759, + "learning_rate": 8.144999669468714e-06, + "loss": 1.6987, + "step": 469 + }, + { + "epoch": 0.6394557823129252, + "grad_norm": 0.08460548198303047, + "learning_rate": 8.136250525645916e-06, + "loss": 1.8206, + "step": 470 + }, + { + "epoch": 0.6408163265306123, + "grad_norm": 0.0993816872028852, + "learning_rate": 8.127485521300366e-06, + "loss": 1.6618, + "step": 471 + }, + { + "epoch": 0.6421768707482993, + "grad_norm": 0.1043492135258874, + "learning_rate": 8.118704700758103e-06, + "loss": 1.641, + "step": 472 + }, + { + "epoch": 0.6435374149659864, + "grad_norm": 0.0880711093010387, + "learning_rate": 8.109908108425142e-06, + "loss": 1.8376, + "step": 473 + }, + { + "epoch": 0.6448979591836734, + "grad_norm": 0.09271039929379037, + "learning_rate": 8.101095788787266e-06, + "loss": 1.6914, + "step": 474 + }, + { + "epoch": 0.6462585034013606, + "grad_norm": 0.10199949996500506, + "learning_rate": 8.092267786409788e-06, + "loss": 1.6264, + "step": 475 + }, + { + "epoch": 0.6476190476190476, + "grad_norm": 0.09927764715965239, + "learning_rate": 8.08342414593734e-06, + "loss": 1.7495, + "step": 476 + }, + { + "epoch": 0.6489795918367347, + "grad_norm": 0.11011814904081185, + "learning_rate": 8.07456491209363e-06, + "loss": 1.8044, + "step": 477 + }, + { + "epoch": 0.6503401360544218, + "grad_norm": 0.10592895331787452, + "learning_rate": 8.065690129681224e-06, + "loss": 1.5279, + "step": 478 + }, + { + "epoch": 0.6517006802721088, + "grad_norm": 0.09546781848020944, + "learning_rate": 8.056799843581326e-06, + "loss": 1.5599, + "step": 479 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.09305244725756194, + "learning_rate": 8.04789409875354e-06, + "loss": 1.7328, + "step": 480 + }, + { + "epoch": 0.654421768707483, + "grad_norm": 0.10356673246511783, + "learning_rate": 8.038972940235647e-06, + "loss": 1.6317, + "step": 481 + }, + { + "epoch": 0.6557823129251701, + "grad_norm": 0.08355158958046802, + "learning_rate": 8.030036413143382e-06, + "loss": 1.823, + "step": 482 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 0.13429081910577273, + "learning_rate": 8.021084562670193e-06, + "loss": 1.765, + "step": 483 + }, + { + "epoch": 0.6585034013605442, + "grad_norm": 0.10977848927929611, + "learning_rate": 8.012117434087032e-06, + "loss": 1.7983, + "step": 484 + }, + { + "epoch": 0.6598639455782312, + "grad_norm": 0.10127247770991282, + "learning_rate": 8.003135072742106e-06, + "loss": 1.7146, + "step": 485 + }, + { + "epoch": 0.6612244897959184, + "grad_norm": 0.08420649737520919, + "learning_rate": 7.994137524060656e-06, + "loss": 1.7273, + "step": 486 + }, + { + "epoch": 0.6625850340136055, + "grad_norm": 0.12998243453204658, + "learning_rate": 7.985124833544737e-06, + "loss": 1.7116, + "step": 487 + }, + { + "epoch": 0.6639455782312925, + "grad_norm": 0.12572177154996375, + "learning_rate": 7.976097046772971e-06, + "loss": 1.5875, + "step": 488 + }, + { + "epoch": 0.6653061224489796, + "grad_norm": 0.09257333856558611, + "learning_rate": 7.967054209400325e-06, + "loss": 1.6259, + "step": 489 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.15599333098996487, + "learning_rate": 7.95799636715788e-06, + "loss": 1.6602, + "step": 490 + }, + { + "epoch": 0.6680272108843538, + "grad_norm": 0.09770606447991517, + "learning_rate": 7.948923565852597e-06, + "loss": 1.6867, + "step": 491 + }, + { + "epoch": 0.6693877551020408, + "grad_norm": 0.1303849167483497, + "learning_rate": 7.939835851367097e-06, + "loss": 1.6583, + "step": 492 + }, + { + "epoch": 0.6707482993197279, + "grad_norm": 0.10864319676746689, + "learning_rate": 7.930733269659405e-06, + "loss": 1.6832, + "step": 493 + }, + { + "epoch": 0.672108843537415, + "grad_norm": 0.11083699474291044, + "learning_rate": 7.921615866762743e-06, + "loss": 1.7117, + "step": 494 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 0.13148294366762545, + "learning_rate": 7.912483688785281e-06, + "loss": 1.5234, + "step": 495 + }, + { + "epoch": 0.6748299319727891, + "grad_norm": 0.11365843655270903, + "learning_rate": 7.903336781909911e-06, + "loss": 1.7783, + "step": 496 + }, + { + "epoch": 0.6761904761904762, + "grad_norm": 0.12277794049039219, + "learning_rate": 7.89417519239401e-06, + "loss": 1.6908, + "step": 497 + }, + { + "epoch": 0.6775510204081633, + "grad_norm": 0.11413949336950495, + "learning_rate": 7.884998966569206e-06, + "loss": 1.5654, + "step": 498 + }, + { + "epoch": 0.6789115646258503, + "grad_norm": 0.10494845825770326, + "learning_rate": 7.87580815084115e-06, + "loss": 1.5383, + "step": 499 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 0.09020097927859841, + "learning_rate": 7.866602791689272e-06, + "loss": 1.596, + "step": 500 + }, + { + "epoch": 0.6816326530612244, + "grad_norm": 0.08585696598298209, + "learning_rate": 7.857382935666554e-06, + "loss": 1.7307, + "step": 501 + }, + { + "epoch": 0.6829931972789116, + "grad_norm": 0.1010505486947337, + "learning_rate": 7.848148629399287e-06, + "loss": 1.6699, + "step": 502 + }, + { + "epoch": 0.6843537414965987, + "grad_norm": 0.11201229130009283, + "learning_rate": 7.838899919586841e-06, + "loss": 1.6521, + "step": 503 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.10009251853964633, + "learning_rate": 7.82963685300143e-06, + "loss": 1.6464, + "step": 504 + }, + { + "epoch": 0.6870748299319728, + "grad_norm": 0.10676770701676057, + "learning_rate": 7.820359476487866e-06, + "loss": 1.4472, + "step": 505 + }, + { + "epoch": 0.6884353741496598, + "grad_norm": 0.10987902455707375, + "learning_rate": 7.811067836963337e-06, + "loss": 1.6637, + "step": 506 + }, + { + "epoch": 0.689795918367347, + "grad_norm": 0.10593286251613945, + "learning_rate": 7.801761981417152e-06, + "loss": 1.714, + "step": 507 + }, + { + "epoch": 0.691156462585034, + "grad_norm": 0.11171983777047709, + "learning_rate": 7.792441956910523e-06, + "loss": 1.5948, + "step": 508 + }, + { + "epoch": 0.6925170068027211, + "grad_norm": 0.10208550757531555, + "learning_rate": 7.783107810576306e-06, + "loss": 1.7267, + "step": 509 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.11315560578865806, + "learning_rate": 7.773759589618782e-06, + "loss": 1.5995, + "step": 510 + }, + { + "epoch": 0.6952380952380952, + "grad_norm": 0.10501036944532048, + "learning_rate": 7.764397341313403e-06, + "loss": 1.4624, + "step": 511 + }, + { + "epoch": 0.6965986394557823, + "grad_norm": 0.08619282794989483, + "learning_rate": 7.755021113006567e-06, + "loss": 1.7983, + "step": 512 + }, + { + "epoch": 0.6979591836734694, + "grad_norm": 0.09109224074659802, + "learning_rate": 7.745630952115365e-06, + "loss": 1.6753, + "step": 513 + }, + { + "epoch": 0.6993197278911565, + "grad_norm": 0.11332860880884088, + "learning_rate": 7.736226906127344e-06, + "loss": 1.7472, + "step": 514 + }, + { + "epoch": 0.7006802721088435, + "grad_norm": 0.11046061973302813, + "learning_rate": 7.726809022600284e-06, + "loss": 1.6219, + "step": 515 + }, + { + "epoch": 0.7020408163265306, + "grad_norm": 0.09720494400811149, + "learning_rate": 7.71737734916193e-06, + "loss": 1.7941, + "step": 516 + }, + { + "epoch": 0.7034013605442176, + "grad_norm": 0.13391377607878455, + "learning_rate": 7.70793193350977e-06, + "loss": 1.7904, + "step": 517 + }, + { + "epoch": 0.7047619047619048, + "grad_norm": 0.12625091217599146, + "learning_rate": 7.69847282341079e-06, + "loss": 1.663, + "step": 518 + }, + { + "epoch": 0.7061224489795919, + "grad_norm": 0.12406607142823292, + "learning_rate": 7.68900006670123e-06, + "loss": 1.6766, + "step": 519 + }, + { + "epoch": 0.7074829931972789, + "grad_norm": 0.09475581974666519, + "learning_rate": 7.679513711286338e-06, + "loss": 1.7449, + "step": 520 + }, + { + "epoch": 0.708843537414966, + "grad_norm": 0.12489019591501085, + "learning_rate": 7.670013805140143e-06, + "loss": 1.7526, + "step": 521 + }, + { + "epoch": 0.710204081632653, + "grad_norm": 0.09091443476585943, + "learning_rate": 7.660500396305194e-06, + "loss": 1.66, + "step": 522 + }, + { + "epoch": 0.7115646258503401, + "grad_norm": 0.0843896278465122, + "learning_rate": 7.650973532892325e-06, + "loss": 1.5741, + "step": 523 + }, + { + "epoch": 0.7129251700680272, + "grad_norm": 0.12842839378581072, + "learning_rate": 7.641433263080418e-06, + "loss": 1.5639, + "step": 524 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.12573336181400996, + "learning_rate": 7.631879635116152e-06, + "loss": 1.5261, + "step": 525 + }, + { + "epoch": 0.7156462585034014, + "grad_norm": 0.08357817960907306, + "learning_rate": 7.622312697313754e-06, + "loss": 1.624, + "step": 526 + }, + { + "epoch": 0.7170068027210884, + "grad_norm": 0.08826301097458839, + "learning_rate": 7.612732498054769e-06, + "loss": 1.7131, + "step": 527 + }, + { + "epoch": 0.7183673469387755, + "grad_norm": 0.1030553177752236, + "learning_rate": 7.603139085787801e-06, + "loss": 1.76, + "step": 528 + }, + { + "epoch": 0.7197278911564626, + "grad_norm": 0.10248942600565085, + "learning_rate": 7.5935325090282785e-06, + "loss": 1.6537, + "step": 529 + }, + { + "epoch": 0.7210884353741497, + "grad_norm": 0.10935378440777248, + "learning_rate": 7.583912816358203e-06, + "loss": 1.7441, + "step": 530 + }, + { + "epoch": 0.7224489795918367, + "grad_norm": 0.10474278332332895, + "learning_rate": 7.574280056425907e-06, + "loss": 1.5672, + "step": 531 + }, + { + "epoch": 0.7238095238095238, + "grad_norm": 0.10337823669221702, + "learning_rate": 7.564634277945803e-06, + "loss": 1.7301, + "step": 532 + }, + { + "epoch": 0.7251700680272108, + "grad_norm": 0.1021232974428954, + "learning_rate": 7.554975529698143e-06, + "loss": 1.8401, + "step": 533 + }, + { + "epoch": 0.726530612244898, + "grad_norm": 0.10873875060914909, + "learning_rate": 7.54530386052877e-06, + "loss": 1.6832, + "step": 534 + }, + { + "epoch": 0.7278911564625851, + "grad_norm": 0.13142427274605614, + "learning_rate": 7.5356193193488655e-06, + "loss": 1.6824, + "step": 535 + }, + { + "epoch": 0.7292517006802721, + "grad_norm": 0.10578967924155344, + "learning_rate": 7.525921955134714e-06, + "loss": 1.6128, + "step": 536 + }, + { + "epoch": 0.7306122448979592, + "grad_norm": 0.1428684712147148, + "learning_rate": 7.5162118169274424e-06, + "loss": 1.6909, + "step": 537 + }, + { + "epoch": 0.7319727891156462, + "grad_norm": 0.12364317671003774, + "learning_rate": 7.506488953832779e-06, + "loss": 1.5894, + "step": 538 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.12070521447402052, + "learning_rate": 7.4967534150208066e-06, + "loss": 1.6316, + "step": 539 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.11805699210709567, + "learning_rate": 7.487005249725705e-06, + "loss": 1.773, + "step": 540 + }, + { + "epoch": 0.7360544217687075, + "grad_norm": 0.10518788665559951, + "learning_rate": 7.477244507245517e-06, + "loss": 1.5496, + "step": 541 + }, + { + "epoch": 0.7374149659863946, + "grad_norm": 0.10115679657141723, + "learning_rate": 7.4674712369418815e-06, + "loss": 1.6332, + "step": 542 + }, + { + "epoch": 0.7387755102040816, + "grad_norm": 0.13249620123305414, + "learning_rate": 7.457685488239799e-06, + "loss": 1.5464, + "step": 543 + }, + { + "epoch": 0.7401360544217687, + "grad_norm": 0.10912963076663591, + "learning_rate": 7.44788731062737e-06, + "loss": 1.6537, + "step": 544 + }, + { + "epoch": 0.7414965986394558, + "grad_norm": 0.0903784233977396, + "learning_rate": 7.438076753655557e-06, + "loss": 1.7509, + "step": 545 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.11534137833637581, + "learning_rate": 7.4282538669379186e-06, + "loss": 1.8423, + "step": 546 + }, + { + "epoch": 0.7442176870748299, + "grad_norm": 0.10673090155703877, + "learning_rate": 7.418418700150373e-06, + "loss": 1.5147, + "step": 547 + }, + { + "epoch": 0.745578231292517, + "grad_norm": 0.11078677610408977, + "learning_rate": 7.408571303030939e-06, + "loss": 1.598, + "step": 548 + }, + { + "epoch": 0.746938775510204, + "grad_norm": 0.20089268434188828, + "learning_rate": 7.398711725379486e-06, + "loss": 1.6854, + "step": 549 + }, + { + "epoch": 0.7482993197278912, + "grad_norm": 0.11465528276050316, + "learning_rate": 7.388840017057479e-06, + "loss": 1.7166, + "step": 550 + }, + { + "epoch": 0.7496598639455783, + "grad_norm": 0.17554597081264026, + "learning_rate": 7.378956227987738e-06, + "loss": 1.7621, + "step": 551 + }, + { + "epoch": 0.7510204081632653, + "grad_norm": 0.12001150745168689, + "learning_rate": 7.369060408154166e-06, + "loss": 1.6292, + "step": 552 + }, + { + "epoch": 0.7510204081632653, + "eval_loss": 1.695604681968689, + "eval_runtime": 76.6065, + "eval_samples_per_second": 53.168, + "eval_steps_per_second": 6.657, + "step": 552 + }, + { + "epoch": 0.7523809523809524, + "grad_norm": 0.08810105735536967, + "learning_rate": 7.35915260760152e-06, + "loss": 1.7169, + "step": 553 + }, + { + "epoch": 0.7537414965986394, + "grad_norm": 0.1670593915051014, + "learning_rate": 7.349232876435135e-06, + "loss": 1.5579, + "step": 554 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 0.11454927724613997, + "learning_rate": 7.3393012648206865e-06, + "loss": 1.7283, + "step": 555 + }, + { + "epoch": 0.7564625850340136, + "grad_norm": 0.969049897753043, + "learning_rate": 7.329357822983929e-06, + "loss": 1.7205, + "step": 556 + }, + { + "epoch": 0.7578231292517007, + "grad_norm": 0.14156929561935405, + "learning_rate": 7.319402601210448e-06, + "loss": 1.6642, + "step": 557 + }, + { + "epoch": 0.7591836734693878, + "grad_norm": 0.0984076585901956, + "learning_rate": 7.3094356498453955e-06, + "loss": 1.5543, + "step": 558 + }, + { + "epoch": 0.7605442176870748, + "grad_norm": 0.1065633653807634, + "learning_rate": 7.299457019293248e-06, + "loss": 1.6024, + "step": 559 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.10385154170413329, + "learning_rate": 7.289466760017543e-06, + "loss": 1.6121, + "step": 560 + }, + { + "epoch": 0.763265306122449, + "grad_norm": 0.09706098665043214, + "learning_rate": 7.279464922540626e-06, + "loss": 1.6291, + "step": 561 + }, + { + "epoch": 0.7646258503401361, + "grad_norm": 0.1005953850437904, + "learning_rate": 7.269451557443396e-06, + "loss": 1.5871, + "step": 562 + }, + { + "epoch": 0.7659863945578231, + "grad_norm": 0.12409919864299088, + "learning_rate": 7.2594267153650525e-06, + "loss": 1.8507, + "step": 563 + }, + { + "epoch": 0.7673469387755102, + "grad_norm": 0.17577412545797552, + "learning_rate": 7.249390447002827e-06, + "loss": 1.6741, + "step": 564 + }, + { + "epoch": 0.7687074829931972, + "grad_norm": 0.10625351066460387, + "learning_rate": 7.239342803111744e-06, + "loss": 1.6995, + "step": 565 + }, + { + "epoch": 0.7700680272108843, + "grad_norm": 0.10141675080832888, + "learning_rate": 7.229283834504351e-06, + "loss": 1.7018, + "step": 566 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 0.10351854356175742, + "learning_rate": 7.21921359205047e-06, + "loss": 1.6347, + "step": 567 + }, + { + "epoch": 0.7727891156462585, + "grad_norm": 0.097783522808633, + "learning_rate": 7.209132126676934e-06, + "loss": 1.63, + "step": 568 + }, + { + "epoch": 0.7741496598639456, + "grad_norm": 0.1032937811881391, + "learning_rate": 7.199039489367334e-06, + "loss": 1.7088, + "step": 569 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.12703981258728564, + "learning_rate": 7.188935731161756e-06, + "loss": 1.5488, + "step": 570 + }, + { + "epoch": 0.7768707482993197, + "grad_norm": 0.09603678350259663, + "learning_rate": 7.178820903156532e-06, + "loss": 1.7006, + "step": 571 + }, + { + "epoch": 0.7782312925170068, + "grad_norm": 0.10137848978882107, + "learning_rate": 7.168695056503967e-06, + "loss": 1.5343, + "step": 572 + }, + { + "epoch": 0.7795918367346939, + "grad_norm": 0.09168277145084121, + "learning_rate": 7.1585582424121005e-06, + "loss": 1.7654, + "step": 573 + }, + { + "epoch": 0.780952380952381, + "grad_norm": 0.10323000544095898, + "learning_rate": 7.148410512144425e-06, + "loss": 1.6613, + "step": 574 + }, + { + "epoch": 0.782312925170068, + "grad_norm": 0.09038228211904249, + "learning_rate": 7.138251917019645e-06, + "loss": 1.7182, + "step": 575 + }, + { + "epoch": 0.7836734693877551, + "grad_norm": 0.12949706977298758, + "learning_rate": 7.1280825084114065e-06, + "loss": 1.5075, + "step": 576 + }, + { + "epoch": 0.7850340136054422, + "grad_norm": 0.10182930523451113, + "learning_rate": 7.117902337748045e-06, + "loss": 1.5249, + "step": 577 + }, + { + "epoch": 0.7863945578231293, + "grad_norm": 0.1014784602992118, + "learning_rate": 7.107711456512316e-06, + "loss": 1.5699, + "step": 578 + }, + { + "epoch": 0.7877551020408163, + "grad_norm": 0.08920528739055578, + "learning_rate": 7.097509916241145e-06, + "loss": 1.7604, + "step": 579 + }, + { + "epoch": 0.7891156462585034, + "grad_norm": 0.14771070265391437, + "learning_rate": 7.08729776852536e-06, + "loss": 1.8294, + "step": 580 + }, + { + "epoch": 0.7904761904761904, + "grad_norm": 0.10437668016493229, + "learning_rate": 7.0770750650094335e-06, + "loss": 1.5263, + "step": 581 + }, + { + "epoch": 0.7918367346938775, + "grad_norm": 0.09453693869246346, + "learning_rate": 7.066841857391215e-06, + "loss": 1.7625, + "step": 582 + }, + { + "epoch": 0.7931972789115647, + "grad_norm": 0.1024749155019567, + "learning_rate": 7.056598197421686e-06, + "loss": 1.6953, + "step": 583 + }, + { + "epoch": 0.7945578231292517, + "grad_norm": 0.09967417247994723, + "learning_rate": 7.046344136904675e-06, + "loss": 1.5067, + "step": 584 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 0.10667510272444436, + "learning_rate": 7.036079727696618e-06, + "loss": 1.6966, + "step": 585 + }, + { + "epoch": 0.7972789115646258, + "grad_norm": 0.13727868268941767, + "learning_rate": 7.025805021706276e-06, + "loss": 1.6554, + "step": 586 + }, + { + "epoch": 0.7986394557823129, + "grad_norm": 0.09241703194497365, + "learning_rate": 7.0155200708944915e-06, + "loss": 1.7987, + "step": 587 + }, + { + "epoch": 0.8, + "grad_norm": 0.11984006315877656, + "learning_rate": 7.005224927273913e-06, + "loss": 1.7059, + "step": 588 + }, + { + "epoch": 0.8013605442176871, + "grad_norm": 0.12763094516083248, + "learning_rate": 6.9949196429087355e-06, + "loss": 1.8147, + "step": 589 + }, + { + "epoch": 0.8027210884353742, + "grad_norm": 0.13522443685391508, + "learning_rate": 6.984604269914437e-06, + "loss": 1.63, + "step": 590 + }, + { + "epoch": 0.8040816326530612, + "grad_norm": 0.10116021604335325, + "learning_rate": 6.974278860457515e-06, + "loss": 1.5963, + "step": 591 + }, + { + "epoch": 0.8054421768707483, + "grad_norm": 0.10234671686214797, + "learning_rate": 6.963943466755225e-06, + "loss": 1.7491, + "step": 592 + }, + { + "epoch": 0.8068027210884354, + "grad_norm": 0.09993833464748478, + "learning_rate": 6.953598141075315e-06, + "loss": 1.8742, + "step": 593 + }, + { + "epoch": 0.8081632653061225, + "grad_norm": 0.10194114128873782, + "learning_rate": 6.943242935735757e-06, + "loss": 1.8295, + "step": 594 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.11091877191061177, + "learning_rate": 6.932877903104487e-06, + "loss": 1.7282, + "step": 595 + }, + { + "epoch": 0.8108843537414966, + "grad_norm": 0.11885618706334389, + "learning_rate": 6.922503095599142e-06, + "loss": 1.7013, + "step": 596 + }, + { + "epoch": 0.8122448979591836, + "grad_norm": 0.128832076499377, + "learning_rate": 6.912118565686789e-06, + "loss": 1.6604, + "step": 597 + }, + { + "epoch": 0.8136054421768707, + "grad_norm": 0.149361446054299, + "learning_rate": 6.901724365883665e-06, + "loss": 1.5922, + "step": 598 + }, + { + "epoch": 0.8149659863945579, + "grad_norm": 0.10349101338251292, + "learning_rate": 6.89132054875491e-06, + "loss": 1.7742, + "step": 599 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.10814162257394357, + "learning_rate": 6.8809071669142946e-06, + "loss": 1.6099, + "step": 600 + }, + { + "epoch": 0.817687074829932, + "grad_norm": 0.09463224320256956, + "learning_rate": 6.870484273023967e-06, + "loss": 1.5986, + "step": 601 + }, + { + "epoch": 0.819047619047619, + "grad_norm": 0.10081423766781461, + "learning_rate": 6.8600519197941725e-06, + "loss": 1.7488, + "step": 602 + }, + { + "epoch": 0.8204081632653061, + "grad_norm": 0.10845349706729722, + "learning_rate": 6.849610159983003e-06, + "loss": 1.6419, + "step": 603 + }, + { + "epoch": 0.8217687074829932, + "grad_norm": 0.1220236426902432, + "learning_rate": 6.839159046396109e-06, + "loss": 1.6193, + "step": 604 + }, + { + "epoch": 0.8231292517006803, + "grad_norm": 0.11987974076038253, + "learning_rate": 6.828698631886455e-06, + "loss": 1.6836, + "step": 605 + }, + { + "epoch": 0.8244897959183674, + "grad_norm": 0.12488732229202766, + "learning_rate": 6.8182289693540375e-06, + "loss": 1.6057, + "step": 606 + }, + { + "epoch": 0.8258503401360544, + "grad_norm": 0.10061110967809365, + "learning_rate": 6.807750111745619e-06, + "loss": 1.6481, + "step": 607 + }, + { + "epoch": 0.8272108843537415, + "grad_norm": 0.10585066880846151, + "learning_rate": 6.797262112054469e-06, + "loss": 1.6665, + "step": 608 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 0.10134141638102989, + "learning_rate": 6.786765023320085e-06, + "loss": 1.5092, + "step": 609 + }, + { + "epoch": 0.8299319727891157, + "grad_norm": 0.11227140982751639, + "learning_rate": 6.776258898627932e-06, + "loss": 1.7522, + "step": 610 + }, + { + "epoch": 0.8312925170068027, + "grad_norm": 0.08967360374579285, + "learning_rate": 6.765743791109172e-06, + "loss": 1.7738, + "step": 611 + }, + { + "epoch": 0.8326530612244898, + "grad_norm": 0.12577528209737293, + "learning_rate": 6.755219753940389e-06, + "loss": 1.6958, + "step": 612 + }, + { + "epoch": 0.8340136054421768, + "grad_norm": 0.1026220373525454, + "learning_rate": 6.744686840343333e-06, + "loss": 1.7081, + "step": 613 + }, + { + "epoch": 0.8353741496598639, + "grad_norm": 0.1616576311932069, + "learning_rate": 6.734145103584638e-06, + "loss": 1.7878, + "step": 614 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 0.11606031099035814, + "learning_rate": 6.72359459697556e-06, + "loss": 1.704, + "step": 615 + }, + { + "epoch": 0.8380952380952381, + "grad_norm": 0.11405134687997359, + "learning_rate": 6.713035373871711e-06, + "loss": 1.6157, + "step": 616 + }, + { + "epoch": 0.8394557823129252, + "grad_norm": 0.11855263544564604, + "learning_rate": 6.702467487672771e-06, + "loss": 1.7325, + "step": 617 + }, + { + "epoch": 0.8408163265306122, + "grad_norm": 0.10438275658978799, + "learning_rate": 6.691890991822243e-06, + "loss": 1.6522, + "step": 618 + }, + { + "epoch": 0.8421768707482993, + "grad_norm": 0.10374577213787556, + "learning_rate": 6.681305939807165e-06, + "loss": 1.6307, + "step": 619 + }, + { + "epoch": 0.8435374149659864, + "grad_norm": 0.09336351222681238, + "learning_rate": 6.670712385157846e-06, + "loss": 1.5821, + "step": 620 + }, + { + "epoch": 0.8448979591836735, + "grad_norm": 0.12997638800995778, + "learning_rate": 6.660110381447593e-06, + "loss": 1.672, + "step": 621 + }, + { + "epoch": 0.8462585034013606, + "grad_norm": 0.09931623670142671, + "learning_rate": 6.649499982292441e-06, + "loss": 1.6305, + "step": 622 + }, + { + "epoch": 0.8476190476190476, + "grad_norm": 0.1164641533977229, + "learning_rate": 6.638881241350884e-06, + "loss": 1.6891, + "step": 623 + }, + { + "epoch": 0.8489795918367347, + "grad_norm": 0.09080847058543648, + "learning_rate": 6.628254212323601e-06, + "loss": 1.5685, + "step": 624 + }, + { + "epoch": 0.8503401360544217, + "grad_norm": 0.09554345130938086, + "learning_rate": 6.617618948953186e-06, + "loss": 1.8238, + "step": 625 + }, + { + "epoch": 0.8517006802721089, + "grad_norm": 0.11850979230452785, + "learning_rate": 6.606975505023874e-06, + "loss": 1.5686, + "step": 626 + }, + { + "epoch": 0.8530612244897959, + "grad_norm": 0.11391698206139707, + "learning_rate": 6.596323934361268e-06, + "loss": 1.6122, + "step": 627 + }, + { + "epoch": 0.854421768707483, + "grad_norm": 0.1259510837772197, + "learning_rate": 6.5856642908320745e-06, + "loss": 1.5638, + "step": 628 + }, + { + "epoch": 0.85578231292517, + "grad_norm": 0.3467765021073813, + "learning_rate": 6.574996628343824e-06, + "loss": 1.7503, + "step": 629 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.11342921408966115, + "learning_rate": 6.564321000844598e-06, + "loss": 1.6653, + "step": 630 + }, + { + "epoch": 0.8585034013605443, + "grad_norm": 0.10000357784567925, + "learning_rate": 6.553637462322759e-06, + "loss": 1.5783, + "step": 631 + }, + { + "epoch": 0.8598639455782313, + "grad_norm": 0.1067459993698176, + "learning_rate": 6.5429460668066825e-06, + "loss": 1.7222, + "step": 632 + }, + { + "epoch": 0.8612244897959184, + "grad_norm": 0.1669410376966875, + "learning_rate": 6.5322468683644665e-06, + "loss": 1.6325, + "step": 633 + }, + { + "epoch": 0.8625850340136054, + "grad_norm": 0.107409115579869, + "learning_rate": 6.5215399211036815e-06, + "loss": 1.5369, + "step": 634 + }, + { + "epoch": 0.8639455782312925, + "grad_norm": 0.1076970404147157, + "learning_rate": 6.510825279171077e-06, + "loss": 1.7722, + "step": 635 + }, + { + "epoch": 0.8653061224489796, + "grad_norm": 0.15949100738404212, + "learning_rate": 6.5001029967523195e-06, + "loss": 1.5295, + "step": 636 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.13488187893377007, + "learning_rate": 6.489373128071714e-06, + "loss": 1.6053, + "step": 637 + }, + { + "epoch": 0.8680272108843538, + "grad_norm": 0.1117979139478637, + "learning_rate": 6.4786357273919296e-06, + "loss": 1.6219, + "step": 638 + }, + { + "epoch": 0.8693877551020408, + "grad_norm": 0.11161396243370707, + "learning_rate": 6.467890849013728e-06, + "loss": 1.6193, + "step": 639 + }, + { + "epoch": 0.8707482993197279, + "grad_norm": 0.10619453607937132, + "learning_rate": 6.4571385472756835e-06, + "loss": 1.6587, + "step": 640 + }, + { + "epoch": 0.8721088435374149, + "grad_norm": 0.1092897734952719, + "learning_rate": 6.446378876553914e-06, + "loss": 1.5463, + "step": 641 + }, + { + "epoch": 0.8734693877551021, + "grad_norm": 0.10030800864141241, + "learning_rate": 6.4356118912618025e-06, + "loss": 1.6678, + "step": 642 + }, + { + "epoch": 0.8748299319727891, + "grad_norm": 0.10968908293389731, + "learning_rate": 6.424837645849724e-06, + "loss": 1.6558, + "step": 643 + }, + { + "epoch": 0.8761904761904762, + "grad_norm": 0.10586492564834864, + "learning_rate": 6.41405619480477e-06, + "loss": 1.6116, + "step": 644 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 0.10414471406609932, + "learning_rate": 6.403267592650466e-06, + "loss": 1.5987, + "step": 645 + }, + { + "epoch": 0.8789115646258503, + "grad_norm": 0.11102655800428667, + "learning_rate": 6.39247189394651e-06, + "loss": 1.5676, + "step": 646 + }, + { + "epoch": 0.8802721088435375, + "grad_norm": 0.1414944802203078, + "learning_rate": 6.381669153288485e-06, + "loss": 1.5632, + "step": 647 + }, + { + "epoch": 0.8816326530612245, + "grad_norm": 0.11327036672893112, + "learning_rate": 6.370859425307583e-06, + "loss": 1.6175, + "step": 648 + }, + { + "epoch": 0.8829931972789116, + "grad_norm": 0.10646525745122841, + "learning_rate": 6.360042764670337e-06, + "loss": 1.6644, + "step": 649 + }, + { + "epoch": 0.8843537414965986, + "grad_norm": 0.13888676328597274, + "learning_rate": 6.349219226078338e-06, + "loss": 1.707, + "step": 650 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 0.12373888508470542, + "learning_rate": 6.3383888642679585e-06, + "loss": 1.562, + "step": 651 + }, + { + "epoch": 0.8870748299319728, + "grad_norm": 0.1150370310262787, + "learning_rate": 6.327551734010079e-06, + "loss": 1.5981, + "step": 652 + }, + { + "epoch": 0.8884353741496599, + "grad_norm": 0.1333929036077325, + "learning_rate": 6.3167078901098064e-06, + "loss": 1.6216, + "step": 653 + }, + { + "epoch": 0.889795918367347, + "grad_norm": 0.09937346566987916, + "learning_rate": 6.305857387406204e-06, + "loss": 1.7385, + "step": 654 + }, + { + "epoch": 0.891156462585034, + "grad_norm": 0.10542763073830541, + "learning_rate": 6.295000280772004e-06, + "loss": 1.5687, + "step": 655 + }, + { + "epoch": 0.8925170068027211, + "grad_norm": 0.11666549211549145, + "learning_rate": 6.2841366251133405e-06, + "loss": 1.674, + "step": 656 + }, + { + "epoch": 0.8938775510204081, + "grad_norm": 0.1010611811906902, + "learning_rate": 6.273266475369466e-06, + "loss": 1.8506, + "step": 657 + }, + { + "epoch": 0.8952380952380953, + "grad_norm": 0.12229341089698531, + "learning_rate": 6.262389886512475e-06, + "loss": 1.6744, + "step": 658 + }, + { + "epoch": 0.8965986394557823, + "grad_norm": 0.1127681300190243, + "learning_rate": 6.251506913547021e-06, + "loss": 1.5399, + "step": 659 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.12397997048973994, + "learning_rate": 6.240617611510049e-06, + "loss": 1.5651, + "step": 660 + }, + { + "epoch": 0.8993197278911564, + "grad_norm": 0.11295244013894742, + "learning_rate": 6.229722035470509e-06, + "loss": 1.6198, + "step": 661 + }, + { + "epoch": 0.9006802721088435, + "grad_norm": 0.1075660765121038, + "learning_rate": 6.21882024052908e-06, + "loss": 1.6066, + "step": 662 + }, + { + "epoch": 0.9020408163265307, + "grad_norm": 0.1202834632226026, + "learning_rate": 6.2079122818178885e-06, + "loss": 1.7857, + "step": 663 + }, + { + "epoch": 0.9034013605442177, + "grad_norm": 0.10694066024862514, + "learning_rate": 6.196998214500236e-06, + "loss": 1.7661, + "step": 664 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.10432718122763324, + "learning_rate": 6.186078093770312e-06, + "loss": 1.5971, + "step": 665 + }, + { + "epoch": 0.9061224489795918, + "grad_norm": 0.11861992306540457, + "learning_rate": 6.1751519748529235e-06, + "loss": 1.5868, + "step": 666 + }, + { + "epoch": 0.9074829931972789, + "grad_norm": 0.10675270335750815, + "learning_rate": 6.164219913003208e-06, + "loss": 1.7003, + "step": 667 + }, + { + "epoch": 0.908843537414966, + "grad_norm": 0.11325117184223481, + "learning_rate": 6.153281963506359e-06, + "loss": 1.5944, + "step": 668 + }, + { + "epoch": 0.9102040816326531, + "grad_norm": 0.10366546297175218, + "learning_rate": 6.142338181677344e-06, + "loss": 1.8128, + "step": 669 + }, + { + "epoch": 0.9115646258503401, + "grad_norm": 0.10444277193505327, + "learning_rate": 6.131388622860627e-06, + "loss": 1.8767, + "step": 670 + }, + { + "epoch": 0.9129251700680272, + "grad_norm": 0.10972076945530858, + "learning_rate": 6.1204333424298835e-06, + "loss": 1.7049, + "step": 671 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.10751143314986424, + "learning_rate": 6.10947239578773e-06, + "loss": 1.7047, + "step": 672 + }, + { + "epoch": 0.9156462585034013, + "grad_norm": 0.10664892548031508, + "learning_rate": 6.098505838365431e-06, + "loss": 1.7452, + "step": 673 + }, + { + "epoch": 0.9170068027210885, + "grad_norm": 0.10573801367962304, + "learning_rate": 6.087533725622631e-06, + "loss": 1.6404, + "step": 674 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 0.11119140472745474, + "learning_rate": 6.076556113047066e-06, + "loss": 1.7246, + "step": 675 + }, + { + "epoch": 0.9197278911564626, + "grad_norm": 0.09707326009557425, + "learning_rate": 6.065573056154289e-06, + "loss": 1.6736, + "step": 676 + }, + { + "epoch": 0.9210884353741496, + "grad_norm": 0.14286264459704873, + "learning_rate": 6.05458461048738e-06, + "loss": 1.7604, + "step": 677 + }, + { + "epoch": 0.9224489795918367, + "grad_norm": 0.10186789751886853, + "learning_rate": 6.043590831616677e-06, + "loss": 1.52, + "step": 678 + }, + { + "epoch": 0.9238095238095239, + "grad_norm": 0.08610978660077238, + "learning_rate": 6.032591775139483e-06, + "loss": 1.6948, + "step": 679 + }, + { + "epoch": 0.9251700680272109, + "grad_norm": 0.14039977263434622, + "learning_rate": 6.0215874966797935e-06, + "loss": 1.7652, + "step": 680 + }, + { + "epoch": 0.926530612244898, + "grad_norm": 0.09777556431038441, + "learning_rate": 6.0105780518880156e-06, + "loss": 1.5695, + "step": 681 + }, + { + "epoch": 0.927891156462585, + "grad_norm": 0.10076506193217513, + "learning_rate": 5.999563496440678e-06, + "loss": 1.5797, + "step": 682 + }, + { + "epoch": 0.9292517006802721, + "grad_norm": 0.13645736281296353, + "learning_rate": 5.988543886040157e-06, + "loss": 1.6124, + "step": 683 + }, + { + "epoch": 0.9306122448979591, + "grad_norm": 0.18938296093195647, + "learning_rate": 5.977519276414393e-06, + "loss": 1.7377, + "step": 684 + }, + { + "epoch": 0.9319727891156463, + "grad_norm": 0.11671931887431021, + "learning_rate": 5.966489723316609e-06, + "loss": 1.57, + "step": 685 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.13192750753599222, + "learning_rate": 5.955455282525027e-06, + "loss": 1.5089, + "step": 686 + }, + { + "epoch": 0.9346938775510204, + "grad_norm": 0.1279138939941876, + "learning_rate": 5.944416009842585e-06, + "loss": 1.4862, + "step": 687 + }, + { + "epoch": 0.9360544217687075, + "grad_norm": 0.1368328421716675, + "learning_rate": 5.933371961096661e-06, + "loss": 1.7591, + "step": 688 + }, + { + "epoch": 0.9374149659863945, + "grad_norm": 0.09459226556926907, + "learning_rate": 5.92232319213878e-06, + "loss": 1.7315, + "step": 689 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.0944762256396259, + "learning_rate": 5.9112697588443456e-06, + "loss": 1.6664, + "step": 690 + }, + { + "epoch": 0.9401360544217687, + "grad_norm": 0.11165381365887841, + "learning_rate": 5.900211717112343e-06, + "loss": 1.512, + "step": 691 + }, + { + "epoch": 0.9414965986394558, + "grad_norm": 0.10026925859750246, + "learning_rate": 5.889149122865067e-06, + "loss": 1.8164, + "step": 692 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 0.14168969817408006, + "learning_rate": 5.8780820320478325e-06, + "loss": 1.7176, + "step": 693 + }, + { + "epoch": 0.9442176870748299, + "grad_norm": 0.11535232171729691, + "learning_rate": 5.867010500628698e-06, + "loss": 1.6684, + "step": 694 + }, + { + "epoch": 0.9455782312925171, + "grad_norm": 0.10755021118232244, + "learning_rate": 5.855934584598175e-06, + "loss": 1.7584, + "step": 695 + }, + { + "epoch": 0.9469387755102041, + "grad_norm": 0.11779529743978591, + "learning_rate": 5.844854339968952e-06, + "loss": 1.6906, + "step": 696 + }, + { + "epoch": 0.9482993197278912, + "grad_norm": 0.11394418134579978, + "learning_rate": 5.8337698227756035e-06, + "loss": 1.6403, + "step": 697 + }, + { + "epoch": 0.9496598639455782, + "grad_norm": 0.1136124987259348, + "learning_rate": 5.822681089074315e-06, + "loss": 1.5563, + "step": 698 + }, + { + "epoch": 0.9510204081632653, + "grad_norm": 0.10296990279179531, + "learning_rate": 5.811588194942593e-06, + "loss": 1.7407, + "step": 699 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.11976200431163776, + "learning_rate": 5.800491196478989e-06, + "loss": 1.4828, + "step": 700 + }, + { + "epoch": 0.9537414965986395, + "grad_norm": 0.11272704179233839, + "learning_rate": 5.789390149802802e-06, + "loss": 1.602, + "step": 701 + }, + { + "epoch": 0.9551020408163265, + "grad_norm": 0.23740154863087562, + "learning_rate": 5.778285111053812e-06, + "loss": 1.6265, + "step": 702 + }, + { + "epoch": 0.9564625850340136, + "grad_norm": 0.11973657664057448, + "learning_rate": 5.767176136391982e-06, + "loss": 1.5886, + "step": 703 + }, + { + "epoch": 0.9578231292517007, + "grad_norm": 0.11604087204409168, + "learning_rate": 5.756063281997183e-06, + "loss": 1.6891, + "step": 704 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 0.10628214364332488, + "learning_rate": 5.744946604068904e-06, + "loss": 1.6309, + "step": 705 + }, + { + "epoch": 0.9605442176870749, + "grad_norm": 0.09908640691690514, + "learning_rate": 5.733826158825973e-06, + "loss": 1.6741, + "step": 706 + }, + { + "epoch": 0.9619047619047619, + "grad_norm": 0.10347727809388245, + "learning_rate": 5.722702002506264e-06, + "loss": 1.6104, + "step": 707 + }, + { + "epoch": 0.963265306122449, + "grad_norm": 0.10424856238627535, + "learning_rate": 5.711574191366427e-06, + "loss": 1.6592, + "step": 708 + }, + { + "epoch": 0.964625850340136, + "grad_norm": 0.09391458584535348, + "learning_rate": 5.700442781681588e-06, + "loss": 1.7451, + "step": 709 + }, + { + "epoch": 0.9659863945578231, + "grad_norm": 0.1114698322409562, + "learning_rate": 5.689307829745074e-06, + "loss": 1.5695, + "step": 710 + }, + { + "epoch": 0.9673469387755103, + "grad_norm": 0.1263756308873154, + "learning_rate": 5.678169391868128e-06, + "loss": 1.7918, + "step": 711 + }, + { + "epoch": 0.9687074829931973, + "grad_norm": 0.1068286657604504, + "learning_rate": 5.6670275243796194e-06, + "loss": 1.6695, + "step": 712 + }, + { + "epoch": 0.9700680272108844, + "grad_norm": 0.08766062542171628, + "learning_rate": 5.65588228362576e-06, + "loss": 1.8529, + "step": 713 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 0.10061266375776706, + "learning_rate": 5.6447337259698245e-06, + "loss": 1.8285, + "step": 714 + }, + { + "epoch": 0.9727891156462585, + "grad_norm": 0.12075281384282141, + "learning_rate": 5.633581907791858e-06, + "loss": 1.7784, + "step": 715 + }, + { + "epoch": 0.9741496598639455, + "grad_norm": 0.1332470465992645, + "learning_rate": 5.6224268854884e-06, + "loss": 1.675, + "step": 716 + }, + { + "epoch": 0.9755102040816327, + "grad_norm": 0.1283354676957953, + "learning_rate": 5.611268715472187e-06, + "loss": 1.4725, + "step": 717 + }, + { + "epoch": 0.9768707482993197, + "grad_norm": 0.16904870188773163, + "learning_rate": 5.600107454171879e-06, + "loss": 1.6237, + "step": 718 + }, + { + "epoch": 0.9782312925170068, + "grad_norm": 0.1259321663471572, + "learning_rate": 5.5889431580317655e-06, + "loss": 1.663, + "step": 719 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.10967294130093679, + "learning_rate": 5.577775883511489e-06, + "loss": 1.6294, + "step": 720 + }, + { + "epoch": 0.9809523809523809, + "grad_norm": 0.17546450235130967, + "learning_rate": 5.566605687085749e-06, + "loss": 1.5841, + "step": 721 + }, + { + "epoch": 0.9823129251700681, + "grad_norm": 0.12265102828756684, + "learning_rate": 5.555432625244024e-06, + "loss": 1.4919, + "step": 722 + }, + { + "epoch": 0.9836734693877551, + "grad_norm": 0.10886604673890071, + "learning_rate": 5.5442567544902805e-06, + "loss": 1.6385, + "step": 723 + }, + { + "epoch": 0.9850340136054422, + "grad_norm": 0.11421831431853076, + "learning_rate": 5.533078131342695e-06, + "loss": 1.6341, + "step": 724 + }, + { + "epoch": 0.9863945578231292, + "grad_norm": 0.09054819655341287, + "learning_rate": 5.5218968123333594e-06, + "loss": 1.624, + "step": 725 + }, + { + "epoch": 0.9877551020408163, + "grad_norm": 0.10971827184516575, + "learning_rate": 5.510712854008001e-06, + "loss": 1.5447, + "step": 726 + }, + { + "epoch": 0.9891156462585035, + "grad_norm": 0.12112012725777507, + "learning_rate": 5.499526312925693e-06, + "loss": 1.7353, + "step": 727 + }, + { + "epoch": 0.9904761904761905, + "grad_norm": 0.10541432689931088, + "learning_rate": 5.488337245658569e-06, + "loss": 1.6583, + "step": 728 + }, + { + "epoch": 0.9918367346938776, + "grad_norm": 0.14736400599380917, + "learning_rate": 5.477145708791543e-06, + "loss": 1.6641, + "step": 729 + }, + { + "epoch": 0.9931972789115646, + "grad_norm": 0.10683105881710352, + "learning_rate": 5.4659517589220135e-06, + "loss": 1.4082, + "step": 730 + }, + { + "epoch": 0.9945578231292517, + "grad_norm": 0.1212803888010618, + "learning_rate": 5.454755452659583e-06, + "loss": 1.7298, + "step": 731 + }, + { + "epoch": 0.9959183673469387, + "grad_norm": 0.114898199928806, + "learning_rate": 5.443556846625773e-06, + "loss": 1.6922, + "step": 732 + }, + { + "epoch": 0.9972789115646259, + "grad_norm": 0.17977169651896993, + "learning_rate": 5.432355997453729e-06, + "loss": 1.6933, + "step": 733 + }, + { + "epoch": 0.998639455782313, + "grad_norm": 0.13478171843239625, + "learning_rate": 5.42115296178795e-06, + "loss": 1.758, + "step": 734 + }, + { + "epoch": 1.0, + "grad_norm": 0.10983192009638694, + "learning_rate": 5.409947796283982e-06, + "loss": 1.6745, + "step": 735 + }, + { + "epoch": 1.0013605442176872, + "grad_norm": 0.15728756550496747, + "learning_rate": 5.398740557608151e-06, + "loss": 1.5976, + "step": 736 + }, + { + "epoch": 1.0013605442176872, + "eval_loss": 1.6908553838729858, + "eval_runtime": 76.8223, + "eval_samples_per_second": 53.018, + "eval_steps_per_second": 6.639, + "step": 736 + }, + { + "epoch": 1.002721088435374, + "grad_norm": 0.10538368349304568, + "learning_rate": 5.38753130243726e-06, + "loss": 1.6615, + "step": 737 + }, + { + "epoch": 1.0040816326530613, + "grad_norm": 0.11186773529092461, + "learning_rate": 5.376320087458316e-06, + "loss": 1.686, + "step": 738 + }, + { + "epoch": 1.0054421768707482, + "grad_norm": 0.09819212487389627, + "learning_rate": 5.365106969368235e-06, + "loss": 1.6144, + "step": 739 + }, + { + "epoch": 1.0068027210884354, + "grad_norm": 0.10858676538086111, + "learning_rate": 5.353892004873554e-06, + "loss": 1.7423, + "step": 740 + }, + { + "epoch": 1.0081632653061225, + "grad_norm": 0.11968981856525805, + "learning_rate": 5.34267525069015e-06, + "loss": 1.6532, + "step": 741 + }, + { + "epoch": 1.0095238095238095, + "grad_norm": 0.09936611311145167, + "learning_rate": 5.331456763542954e-06, + "loss": 1.8078, + "step": 742 + }, + { + "epoch": 1.0108843537414967, + "grad_norm": 0.10022311903878614, + "learning_rate": 5.3202366001656535e-06, + "loss": 1.5739, + "step": 743 + }, + { + "epoch": 1.0122448979591836, + "grad_norm": 0.4103772585061572, + "learning_rate": 5.309014817300422e-06, + "loss": 1.6617, + "step": 744 + }, + { + "epoch": 1.0136054421768708, + "grad_norm": 0.10960071410597617, + "learning_rate": 5.297791471697614e-06, + "loss": 1.5742, + "step": 745 + }, + { + "epoch": 1.014965986394558, + "grad_norm": 0.1310811762519516, + "learning_rate": 5.286566620115493e-06, + "loss": 1.7022, + "step": 746 + }, + { + "epoch": 1.0163265306122449, + "grad_norm": 0.15568202046790616, + "learning_rate": 5.2753403193199374e-06, + "loss": 1.592, + "step": 747 + }, + { + "epoch": 1.017687074829932, + "grad_norm": 0.11383339823280172, + "learning_rate": 5.264112626084153e-06, + "loss": 1.6331, + "step": 748 + }, + { + "epoch": 1.019047619047619, + "grad_norm": 0.11927282820236593, + "learning_rate": 5.2528835971883876e-06, + "loss": 1.7091, + "step": 749 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.11606987091212463, + "learning_rate": 5.241653289419647e-06, + "loss": 1.8403, + "step": 750 + }, + { + "epoch": 1.021768707482993, + "grad_norm": 0.16512841788961088, + "learning_rate": 5.230421759571398e-06, + "loss": 1.785, + "step": 751 + }, + { + "epoch": 1.0231292517006803, + "grad_norm": 0.12135979152340173, + "learning_rate": 5.219189064443296e-06, + "loss": 1.5237, + "step": 752 + }, + { + "epoch": 1.0244897959183674, + "grad_norm": 0.12001388963231435, + "learning_rate": 5.207955260840879e-06, + "loss": 1.6265, + "step": 753 + }, + { + "epoch": 1.0258503401360544, + "grad_norm": 0.14241973320076035, + "learning_rate": 5.1967204055753e-06, + "loss": 1.6843, + "step": 754 + }, + { + "epoch": 1.0272108843537415, + "grad_norm": 0.3503178069734055, + "learning_rate": 5.185484555463026e-06, + "loss": 1.8022, + "step": 755 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 0.10584406401171531, + "learning_rate": 5.17424776732556e-06, + "loss": 1.6713, + "step": 756 + }, + { + "epoch": 1.0299319727891156, + "grad_norm": 0.11525477562466327, + "learning_rate": 5.163010097989138e-06, + "loss": 1.73, + "step": 757 + }, + { + "epoch": 1.0312925170068028, + "grad_norm": 0.13500765537475423, + "learning_rate": 5.151771604284465e-06, + "loss": 1.405, + "step": 758 + }, + { + "epoch": 1.0326530612244897, + "grad_norm": 0.18732245961092447, + "learning_rate": 5.140532343046406e-06, + "loss": 1.5587, + "step": 759 + }, + { + "epoch": 1.034013605442177, + "grad_norm": 0.10827851940146976, + "learning_rate": 5.129292371113712e-06, + "loss": 1.7328, + "step": 760 + }, + { + "epoch": 1.0353741496598639, + "grad_norm": 0.11027741963001209, + "learning_rate": 5.118051745328725e-06, + "loss": 1.6382, + "step": 761 + }, + { + "epoch": 1.036734693877551, + "grad_norm": 0.09919441836217119, + "learning_rate": 5.1068105225370975e-06, + "loss": 1.6855, + "step": 762 + }, + { + "epoch": 1.0380952380952382, + "grad_norm": 0.13830797432010328, + "learning_rate": 5.095568759587497e-06, + "loss": 1.7411, + "step": 763 + }, + { + "epoch": 1.0394557823129251, + "grad_norm": 0.0972282280148536, + "learning_rate": 5.084326513331328e-06, + "loss": 1.621, + "step": 764 + }, + { + "epoch": 1.0408163265306123, + "grad_norm": 0.10578341535327669, + "learning_rate": 5.0730838406224324e-06, + "loss": 1.6273, + "step": 765 + }, + { + "epoch": 1.0421768707482992, + "grad_norm": 0.10463652066026084, + "learning_rate": 5.061840798316815e-06, + "loss": 1.725, + "step": 766 + }, + { + "epoch": 1.0435374149659864, + "grad_norm": 0.11942338757679548, + "learning_rate": 5.0505974432723445e-06, + "loss": 1.5898, + "step": 767 + }, + { + "epoch": 1.0448979591836736, + "grad_norm": 0.13689962498964267, + "learning_rate": 5.039353832348477e-06, + "loss": 1.5068, + "step": 768 + }, + { + "epoch": 1.0462585034013605, + "grad_norm": 0.13678139029155267, + "learning_rate": 5.028110022405955e-06, + "loss": 1.7158, + "step": 769 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.1000363922457913, + "learning_rate": 5.0168660703065354e-06, + "loss": 1.741, + "step": 770 + }, + { + "epoch": 1.0489795918367346, + "grad_norm": 0.1188815368304704, + "learning_rate": 5.005622032912687e-06, + "loss": 1.6623, + "step": 771 + }, + { + "epoch": 1.0503401360544218, + "grad_norm": 0.12019243813821356, + "learning_rate": 4.994377967087316e-06, + "loss": 1.5774, + "step": 772 + }, + { + "epoch": 1.051700680272109, + "grad_norm": 0.12450888223025358, + "learning_rate": 4.983133929693467e-06, + "loss": 1.5663, + "step": 773 + }, + { + "epoch": 1.0530612244897959, + "grad_norm": 0.10816898093198718, + "learning_rate": 4.971889977594048e-06, + "loss": 1.6911, + "step": 774 + }, + { + "epoch": 1.054421768707483, + "grad_norm": 1.729544315460825, + "learning_rate": 4.960646167651524e-06, + "loss": 1.7524, + "step": 775 + }, + { + "epoch": 1.05578231292517, + "grad_norm": 0.1133964092151596, + "learning_rate": 4.949402556727655e-06, + "loss": 1.7612, + "step": 776 + }, + { + "epoch": 1.0571428571428572, + "grad_norm": 0.12041549603751739, + "learning_rate": 4.9381592016831856e-06, + "loss": 1.5116, + "step": 777 + }, + { + "epoch": 1.0585034013605443, + "grad_norm": 0.11576436512487191, + "learning_rate": 4.9269161593775675e-06, + "loss": 1.5329, + "step": 778 + }, + { + "epoch": 1.0598639455782313, + "grad_norm": 0.12679277289105279, + "learning_rate": 4.915673486668673e-06, + "loss": 1.5506, + "step": 779 + }, + { + "epoch": 1.0612244897959184, + "grad_norm": 0.11679477613827106, + "learning_rate": 4.904431240412503e-06, + "loss": 1.5008, + "step": 780 + }, + { + "epoch": 1.0625850340136054, + "grad_norm": 0.10065169200286987, + "learning_rate": 4.893189477462905e-06, + "loss": 1.7685, + "step": 781 + }, + { + "epoch": 1.0639455782312925, + "grad_norm": 0.10287181892423168, + "learning_rate": 4.881948254671277e-06, + "loss": 1.6379, + "step": 782 + }, + { + "epoch": 1.0653061224489795, + "grad_norm": 0.10896624508987876, + "learning_rate": 4.870707628886291e-06, + "loss": 1.5234, + "step": 783 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.10915491188092791, + "learning_rate": 4.859467656953596e-06, + "loss": 1.5865, + "step": 784 + }, + { + "epoch": 1.0680272108843538, + "grad_norm": 0.1275517896298314, + "learning_rate": 4.8482283957155355e-06, + "loss": 1.7069, + "step": 785 + }, + { + "epoch": 1.0693877551020408, + "grad_norm": 0.10862109015230759, + "learning_rate": 4.836989902010863e-06, + "loss": 1.7682, + "step": 786 + }, + { + "epoch": 1.070748299319728, + "grad_norm": 0.09805361474836685, + "learning_rate": 4.825752232674441e-06, + "loss": 1.6228, + "step": 787 + }, + { + "epoch": 1.0721088435374149, + "grad_norm": 0.12481176224206536, + "learning_rate": 4.814515444536975e-06, + "loss": 1.5027, + "step": 788 + }, + { + "epoch": 1.073469387755102, + "grad_norm": 0.11390040747547601, + "learning_rate": 4.8032795944247e-06, + "loss": 1.5168, + "step": 789 + }, + { + "epoch": 1.0748299319727892, + "grad_norm": 0.13458744184978433, + "learning_rate": 4.792044739159124e-06, + "loss": 1.5188, + "step": 790 + }, + { + "epoch": 1.0761904761904761, + "grad_norm": 0.12294480568667183, + "learning_rate": 4.780810935556707e-06, + "loss": 1.3946, + "step": 791 + }, + { + "epoch": 1.0775510204081633, + "grad_norm": 0.13732274998725952, + "learning_rate": 4.7695782404286045e-06, + "loss": 1.6201, + "step": 792 + }, + { + "epoch": 1.0789115646258503, + "grad_norm": 0.09919081899157949, + "learning_rate": 4.758346710580355e-06, + "loss": 1.7815, + "step": 793 + }, + { + "epoch": 1.0802721088435374, + "grad_norm": 0.10000541950613748, + "learning_rate": 4.747116402811612e-06, + "loss": 1.8098, + "step": 794 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 0.11420683474567415, + "learning_rate": 4.735887373915848e-06, + "loss": 1.4835, + "step": 795 + }, + { + "epoch": 1.0829931972789115, + "grad_norm": 0.15140465279516127, + "learning_rate": 4.724659680680063e-06, + "loss": 1.5029, + "step": 796 + }, + { + "epoch": 1.0843537414965987, + "grad_norm": 0.11208117898027617, + "learning_rate": 4.713433379884508e-06, + "loss": 1.7194, + "step": 797 + }, + { + "epoch": 1.0857142857142856, + "grad_norm": 0.16855436651508154, + "learning_rate": 4.7022085283023875e-06, + "loss": 1.7491, + "step": 798 + }, + { + "epoch": 1.0870748299319728, + "grad_norm": 0.11595366549169164, + "learning_rate": 4.690985182699581e-06, + "loss": 1.6313, + "step": 799 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 0.09503431401550515, + "learning_rate": 4.679763399834347e-06, + "loss": 1.5984, + "step": 800 + }, + { + "epoch": 1.089795918367347, + "grad_norm": 0.10896290135710643, + "learning_rate": 4.668543236457049e-06, + "loss": 1.7379, + "step": 801 + }, + { + "epoch": 1.091156462585034, + "grad_norm": 0.11312340154871599, + "learning_rate": 4.657324749309851e-06, + "loss": 1.7817, + "step": 802 + }, + { + "epoch": 1.092517006802721, + "grad_norm": 0.11398862476288807, + "learning_rate": 4.646107995126447e-06, + "loss": 1.6113, + "step": 803 + }, + { + "epoch": 1.0938775510204082, + "grad_norm": 0.11528891846651022, + "learning_rate": 4.634893030631767e-06, + "loss": 1.6745, + "step": 804 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.11540329784321814, + "learning_rate": 4.623679912541683e-06, + "loss": 1.6443, + "step": 805 + }, + { + "epoch": 1.0965986394557823, + "grad_norm": 0.11457602895195093, + "learning_rate": 4.612468697562741e-06, + "loss": 1.5109, + "step": 806 + }, + { + "epoch": 1.0979591836734695, + "grad_norm": 0.108466968677861, + "learning_rate": 4.6012594423918505e-06, + "loss": 1.6285, + "step": 807 + }, + { + "epoch": 1.0993197278911564, + "grad_norm": 0.11392368141416775, + "learning_rate": 4.5900522037160205e-06, + "loss": 1.524, + "step": 808 + }, + { + "epoch": 1.1006802721088436, + "grad_norm": 0.10657614384924384, + "learning_rate": 4.578847038212052e-06, + "loss": 1.7741, + "step": 809 + }, + { + "epoch": 1.1020408163265305, + "grad_norm": 0.11158460621214396, + "learning_rate": 4.567644002546273e-06, + "loss": 1.6648, + "step": 810 + }, + { + "epoch": 1.1034013605442177, + "grad_norm": 0.11249423664710494, + "learning_rate": 4.556443153374229e-06, + "loss": 1.5484, + "step": 811 + }, + { + "epoch": 1.1047619047619048, + "grad_norm": 0.18597431248705018, + "learning_rate": 4.5452445473404175e-06, + "loss": 1.4591, + "step": 812 + }, + { + "epoch": 1.1061224489795918, + "grad_norm": 0.1146701110937394, + "learning_rate": 4.534048241077987e-06, + "loss": 1.7267, + "step": 813 + }, + { + "epoch": 1.107482993197279, + "grad_norm": 0.10820118015434177, + "learning_rate": 4.522854291208458e-06, + "loss": 1.7739, + "step": 814 + }, + { + "epoch": 1.1088435374149659, + "grad_norm": 0.09878497379061389, + "learning_rate": 4.511662754341433e-06, + "loss": 1.6488, + "step": 815 + }, + { + "epoch": 1.110204081632653, + "grad_norm": 0.12970442140488536, + "learning_rate": 4.50047368707431e-06, + "loss": 1.4903, + "step": 816 + }, + { + "epoch": 1.1115646258503402, + "grad_norm": 0.10382954289821261, + "learning_rate": 4.489287145992002e-06, + "loss": 1.8014, + "step": 817 + }, + { + "epoch": 1.1129251700680272, + "grad_norm": 0.12667728251125698, + "learning_rate": 4.478103187666642e-06, + "loss": 1.6478, + "step": 818 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.13500564587776376, + "learning_rate": 4.4669218686573065e-06, + "loss": 1.6239, + "step": 819 + }, + { + "epoch": 1.1156462585034013, + "grad_norm": 0.1244506846499162, + "learning_rate": 4.45574324550972e-06, + "loss": 1.718, + "step": 820 + }, + { + "epoch": 1.1170068027210884, + "grad_norm": 0.11873853458664813, + "learning_rate": 4.444567374755978e-06, + "loss": 1.7092, + "step": 821 + }, + { + "epoch": 1.1183673469387756, + "grad_norm": 0.12208418986043022, + "learning_rate": 4.433394312914253e-06, + "loss": 1.7201, + "step": 822 + }, + { + "epoch": 1.1197278911564625, + "grad_norm": 0.13523682008047516, + "learning_rate": 4.4222241164885114e-06, + "loss": 1.6248, + "step": 823 + }, + { + "epoch": 1.1210884353741497, + "grad_norm": 0.11246329856395026, + "learning_rate": 4.411056841968236e-06, + "loss": 1.5008, + "step": 824 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 0.12873507179822666, + "learning_rate": 4.3998925458281225e-06, + "loss": 1.6518, + "step": 825 + }, + { + "epoch": 1.1238095238095238, + "grad_norm": 0.13059910987895082, + "learning_rate": 4.388731284527816e-06, + "loss": 1.6547, + "step": 826 + }, + { + "epoch": 1.125170068027211, + "grad_norm": 0.13485735751834116, + "learning_rate": 4.377573114511602e-06, + "loss": 1.5989, + "step": 827 + }, + { + "epoch": 1.126530612244898, + "grad_norm": 0.12101663226880303, + "learning_rate": 4.366418092208144e-06, + "loss": 1.6142, + "step": 828 + }, + { + "epoch": 1.127891156462585, + "grad_norm": 0.11607507412451191, + "learning_rate": 4.355266274030177e-06, + "loss": 1.6316, + "step": 829 + }, + { + "epoch": 1.129251700680272, + "grad_norm": 0.10911266136424244, + "learning_rate": 4.344117716374241e-06, + "loss": 1.5342, + "step": 830 + }, + { + "epoch": 1.1306122448979592, + "grad_norm": 0.12731437170330562, + "learning_rate": 4.332972475620381e-06, + "loss": 1.6973, + "step": 831 + }, + { + "epoch": 1.1319727891156464, + "grad_norm": 0.09762084699859862, + "learning_rate": 4.321830608131872e-06, + "loss": 1.6633, + "step": 832 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.12978891884133562, + "learning_rate": 4.310692170254927e-06, + "loss": 1.6098, + "step": 833 + }, + { + "epoch": 1.1346938775510205, + "grad_norm": 0.12415908074295716, + "learning_rate": 4.299557218318413e-06, + "loss": 1.6307, + "step": 834 + }, + { + "epoch": 1.1360544217687074, + "grad_norm": 0.2173948925361782, + "learning_rate": 4.2884258086335755e-06, + "loss": 1.5907, + "step": 835 + }, + { + "epoch": 1.1374149659863946, + "grad_norm": 0.11912082507711948, + "learning_rate": 4.277297997493737e-06, + "loss": 1.5837, + "step": 836 + }, + { + "epoch": 1.1387755102040815, + "grad_norm": 0.19450908692643373, + "learning_rate": 4.266173841174031e-06, + "loss": 1.7324, + "step": 837 + }, + { + "epoch": 1.1401360544217687, + "grad_norm": 0.11032655579441553, + "learning_rate": 4.255053395931097e-06, + "loss": 1.7134, + "step": 838 + }, + { + "epoch": 1.1414965986394559, + "grad_norm": 0.10178083734022528, + "learning_rate": 4.243936718002818e-06, + "loss": 1.6472, + "step": 839 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.10102775840788815, + "learning_rate": 4.23282386360802e-06, + "loss": 1.6893, + "step": 840 + }, + { + "epoch": 1.14421768707483, + "grad_norm": 0.13340734842779356, + "learning_rate": 4.22171488894619e-06, + "loss": 1.582, + "step": 841 + }, + { + "epoch": 1.1455782312925171, + "grad_norm": 0.39827971211004304, + "learning_rate": 4.2106098501972e-06, + "loss": 1.5918, + "step": 842 + }, + { + "epoch": 1.146938775510204, + "grad_norm": 0.12809647826224843, + "learning_rate": 4.1995088035210126e-06, + "loss": 1.6786, + "step": 843 + }, + { + "epoch": 1.1482993197278912, + "grad_norm": 0.11881853753159395, + "learning_rate": 4.1884118050574084e-06, + "loss": 1.6218, + "step": 844 + }, + { + "epoch": 1.1496598639455782, + "grad_norm": 0.13188862048286334, + "learning_rate": 4.177318910925686e-06, + "loss": 1.6943, + "step": 845 + }, + { + "epoch": 1.1510204081632653, + "grad_norm": 0.11973362482121355, + "learning_rate": 4.1662301772244e-06, + "loss": 1.3518, + "step": 846 + }, + { + "epoch": 1.1523809523809523, + "grad_norm": 0.1265246682564999, + "learning_rate": 4.15514566003105e-06, + "loss": 1.717, + "step": 847 + }, + { + "epoch": 1.1537414965986394, + "grad_norm": 0.12163131262108975, + "learning_rate": 4.144065415401825e-06, + "loss": 1.6591, + "step": 848 + }, + { + "epoch": 1.1551020408163266, + "grad_norm": 0.11281334037077397, + "learning_rate": 4.132989499371303e-06, + "loss": 1.7488, + "step": 849 + }, + { + "epoch": 1.1564625850340136, + "grad_norm": 0.116733746797477, + "learning_rate": 4.1219179679521675e-06, + "loss": 1.4784, + "step": 850 + }, + { + "epoch": 1.1578231292517007, + "grad_norm": 0.10824656908525278, + "learning_rate": 4.110850877134935e-06, + "loss": 1.6377, + "step": 851 + }, + { + "epoch": 1.1591836734693877, + "grad_norm": 0.13121005190396953, + "learning_rate": 4.099788282887658e-06, + "loss": 1.7325, + "step": 852 + }, + { + "epoch": 1.1605442176870748, + "grad_norm": 0.10756631934140032, + "learning_rate": 4.088730241155657e-06, + "loss": 1.7462, + "step": 853 + }, + { + "epoch": 1.161904761904762, + "grad_norm": 0.17347249647835658, + "learning_rate": 4.077676807861221e-06, + "loss": 1.4988, + "step": 854 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 0.10923737210751697, + "learning_rate": 4.066628038903341e-06, + "loss": 1.753, + "step": 855 + }, + { + "epoch": 1.164625850340136, + "grad_norm": 0.12401952453107581, + "learning_rate": 4.055583990157416e-06, + "loss": 1.6061, + "step": 856 + }, + { + "epoch": 1.165986394557823, + "grad_norm": 0.16613834484616688, + "learning_rate": 4.044544717474974e-06, + "loss": 1.6004, + "step": 857 + }, + { + "epoch": 1.1673469387755102, + "grad_norm": 0.10830889852190581, + "learning_rate": 4.033510276683392e-06, + "loss": 1.8508, + "step": 858 + }, + { + "epoch": 1.1687074829931974, + "grad_norm": 0.11570096470891132, + "learning_rate": 4.022480723585608e-06, + "loss": 1.7405, + "step": 859 + }, + { + "epoch": 1.1700680272108843, + "grad_norm": 0.13085816560126146, + "learning_rate": 4.011456113959845e-06, + "loss": 1.6882, + "step": 860 + }, + { + "epoch": 1.1714285714285715, + "grad_norm": 0.09457110643687838, + "learning_rate": 4.000436503559324e-06, + "loss": 1.7382, + "step": 861 + }, + { + "epoch": 1.1727891156462584, + "grad_norm": 0.11447572869844899, + "learning_rate": 3.989421948111987e-06, + "loss": 1.7447, + "step": 862 + }, + { + "epoch": 1.1741496598639456, + "grad_norm": 0.10219706136235482, + "learning_rate": 3.978412503320207e-06, + "loss": 1.6079, + "step": 863 + }, + { + "epoch": 1.1755102040816325, + "grad_norm": 0.09697745892488446, + "learning_rate": 3.967408224860518e-06, + "loss": 1.7658, + "step": 864 + }, + { + "epoch": 1.1768707482993197, + "grad_norm": 0.11935090111442251, + "learning_rate": 3.956409168383325e-06, + "loss": 1.642, + "step": 865 + }, + { + "epoch": 1.1782312925170069, + "grad_norm": 0.09593894653017256, + "learning_rate": 3.94541538951262e-06, + "loss": 1.6837, + "step": 866 + }, + { + "epoch": 1.1795918367346938, + "grad_norm": 0.17924060912455603, + "learning_rate": 3.934426943845712e-06, + "loss": 1.7668, + "step": 867 + }, + { + "epoch": 1.180952380952381, + "grad_norm": 0.11737602024296209, + "learning_rate": 3.923443886952934e-06, + "loss": 1.5348, + "step": 868 + }, + { + "epoch": 1.1823129251700681, + "grad_norm": 0.09917281103783915, + "learning_rate": 3.912466274377371e-06, + "loss": 1.6659, + "step": 869 + }, + { + "epoch": 1.183673469387755, + "grad_norm": 0.10815570929741564, + "learning_rate": 3.901494161634571e-06, + "loss": 1.5029, + "step": 870 + }, + { + "epoch": 1.1850340136054422, + "grad_norm": 0.09777863761181164, + "learning_rate": 3.890527604212273e-06, + "loss": 1.7809, + "step": 871 + }, + { + "epoch": 1.1863945578231292, + "grad_norm": 0.12446664048031521, + "learning_rate": 3.879566657570118e-06, + "loss": 1.6918, + "step": 872 + }, + { + "epoch": 1.1877551020408164, + "grad_norm": 0.11809808630659835, + "learning_rate": 3.868611377139375e-06, + "loss": 1.5293, + "step": 873 + }, + { + "epoch": 1.1891156462585033, + "grad_norm": 0.11626193699249888, + "learning_rate": 3.857661818322657e-06, + "loss": 1.5724, + "step": 874 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.10675553850073687, + "learning_rate": 3.846718036493642e-06, + "loss": 1.7026, + "step": 875 + }, + { + "epoch": 1.1918367346938776, + "grad_norm": 0.25785039953724753, + "learning_rate": 3.835780086996794e-06, + "loss": 1.816, + "step": 876 + }, + { + "epoch": 1.1931972789115646, + "grad_norm": 0.09919492633891115, + "learning_rate": 3.824848025147078e-06, + "loss": 1.8633, + "step": 877 + }, + { + "epoch": 1.1945578231292517, + "grad_norm": 0.13021628412390385, + "learning_rate": 3.81392190622969e-06, + "loss": 1.6292, + "step": 878 + }, + { + "epoch": 1.1959183673469387, + "grad_norm": 0.1063338899612945, + "learning_rate": 3.8030017854997654e-06, + "loss": 1.7619, + "step": 879 + }, + { + "epoch": 1.1972789115646258, + "grad_norm": 0.1400157862890337, + "learning_rate": 3.7920877181821136e-06, + "loss": 1.5179, + "step": 880 + }, + { + "epoch": 1.198639455782313, + "grad_norm": 0.1386445404000312, + "learning_rate": 3.781179759470921e-06, + "loss": 1.6624, + "step": 881 + }, + { + "epoch": 1.2, + "grad_norm": 0.1069092489728771, + "learning_rate": 3.7702779645294907e-06, + "loss": 1.8113, + "step": 882 + }, + { + "epoch": 1.2013605442176871, + "grad_norm": 0.1190549983688599, + "learning_rate": 3.759382388489952e-06, + "loss": 1.6425, + "step": 883 + }, + { + "epoch": 1.202721088435374, + "grad_norm": 0.11079642382956639, + "learning_rate": 3.74849308645298e-06, + "loss": 1.6609, + "step": 884 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 0.14105617751955826, + "learning_rate": 3.7376101134875278e-06, + "loss": 1.55, + "step": 885 + }, + { + "epoch": 1.2054421768707484, + "grad_norm": 0.14448379899642935, + "learning_rate": 3.7267335246305346e-06, + "loss": 1.778, + "step": 886 + }, + { + "epoch": 1.2068027210884353, + "grad_norm": 0.10874354603342407, + "learning_rate": 3.715863374886661e-06, + "loss": 1.6611, + "step": 887 + }, + { + "epoch": 1.2081632653061225, + "grad_norm": 0.10802721342705796, + "learning_rate": 3.7049997192279976e-06, + "loss": 1.5966, + "step": 888 + }, + { + "epoch": 1.2095238095238094, + "grad_norm": 0.12233010145027158, + "learning_rate": 3.6941426125937992e-06, + "loss": 1.5311, + "step": 889 + }, + { + "epoch": 1.2108843537414966, + "grad_norm": 0.10009553518935768, + "learning_rate": 3.6832921098901952e-06, + "loss": 1.5145, + "step": 890 + }, + { + "epoch": 1.2122448979591836, + "grad_norm": 0.11901267626986793, + "learning_rate": 3.6724482659899226e-06, + "loss": 1.7466, + "step": 891 + }, + { + "epoch": 1.2136054421768707, + "grad_norm": 0.12962797644138863, + "learning_rate": 3.661611135732043e-06, + "loss": 1.5964, + "step": 892 + }, + { + "epoch": 1.2149659863945579, + "grad_norm": 0.10733667423090783, + "learning_rate": 3.6507807739216628e-06, + "loss": 1.7763, + "step": 893 + }, + { + "epoch": 1.2163265306122448, + "grad_norm": 0.11144056418847463, + "learning_rate": 3.6399572353296642e-06, + "loss": 1.7047, + "step": 894 + }, + { + "epoch": 1.217687074829932, + "grad_norm": 0.08875006973132216, + "learning_rate": 3.6291405746924186e-06, + "loss": 1.7604, + "step": 895 + }, + { + "epoch": 1.2190476190476192, + "grad_norm": 0.13024552194401284, + "learning_rate": 3.6183308467115174e-06, + "loss": 1.6061, + "step": 896 + }, + { + "epoch": 1.220408163265306, + "grad_norm": 0.12857972357080566, + "learning_rate": 3.6075281060534917e-06, + "loss": 1.6149, + "step": 897 + }, + { + "epoch": 1.2217687074829933, + "grad_norm": 0.11603998479755155, + "learning_rate": 3.5967324073495363e-06, + "loss": 1.6111, + "step": 898 + }, + { + "epoch": 1.2231292517006802, + "grad_norm": 0.11178507529251955, + "learning_rate": 3.585943805195232e-06, + "loss": 1.6971, + "step": 899 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.14802404367937794, + "learning_rate": 3.575162354150276e-06, + "loss": 1.7452, + "step": 900 + }, + { + "epoch": 1.2258503401360543, + "grad_norm": 0.09565639349194041, + "learning_rate": 3.5643881087381983e-06, + "loss": 1.7094, + "step": 901 + }, + { + "epoch": 1.2272108843537415, + "grad_norm": 0.12259182420540694, + "learning_rate": 3.553621123446087e-06, + "loss": 1.4945, + "step": 902 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 0.09891155424272953, + "learning_rate": 3.542861452724318e-06, + "loss": 1.7481, + "step": 903 + }, + { + "epoch": 1.2299319727891156, + "grad_norm": 0.12577917367601982, + "learning_rate": 3.5321091509862733e-06, + "loss": 1.658, + "step": 904 + }, + { + "epoch": 1.2312925170068028, + "grad_norm": 0.11110224185718393, + "learning_rate": 3.521364272608071e-06, + "loss": 1.7805, + "step": 905 + }, + { + "epoch": 1.2326530612244897, + "grad_norm": 1.029133559209144, + "learning_rate": 3.5106268719282863e-06, + "loss": 1.6974, + "step": 906 + }, + { + "epoch": 1.2340136054421769, + "grad_norm": 0.11635973055276655, + "learning_rate": 3.499897003247682e-06, + "loss": 1.6067, + "step": 907 + }, + { + "epoch": 1.235374149659864, + "grad_norm": 0.1374434820581917, + "learning_rate": 3.489174720828924e-06, + "loss": 1.4329, + "step": 908 + }, + { + "epoch": 1.236734693877551, + "grad_norm": 0.12005946386037955, + "learning_rate": 3.4784600788963197e-06, + "loss": 1.6061, + "step": 909 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.2566914570489289, + "learning_rate": 3.4677531316355343e-06, + "loss": 1.6285, + "step": 910 + }, + { + "epoch": 1.239455782312925, + "grad_norm": 0.12805153195567712, + "learning_rate": 3.4570539331933196e-06, + "loss": 1.6518, + "step": 911 + }, + { + "epoch": 1.2408163265306122, + "grad_norm": 0.12193109943671782, + "learning_rate": 3.4463625376772415e-06, + "loss": 1.7769, + "step": 912 + }, + { + "epoch": 1.2421768707482994, + "grad_norm": 0.11785291933334519, + "learning_rate": 3.4356789991554036e-06, + "loss": 1.7037, + "step": 913 + }, + { + "epoch": 1.2435374149659864, + "grad_norm": 0.13098314516857928, + "learning_rate": 3.425003371656178e-06, + "loss": 1.6332, + "step": 914 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 0.11058534722726451, + "learning_rate": 3.4143357091679276e-06, + "loss": 1.8928, + "step": 915 + }, + { + "epoch": 1.2462585034013605, + "grad_norm": 0.1304628400810422, + "learning_rate": 3.403676065638735e-06, + "loss": 1.5842, + "step": 916 + }, + { + "epoch": 1.2476190476190476, + "grad_norm": 0.12141593155806699, + "learning_rate": 3.393024494976128e-06, + "loss": 1.6872, + "step": 917 + }, + { + "epoch": 1.2489795918367346, + "grad_norm": 0.1180901086883869, + "learning_rate": 3.3823810510468146e-06, + "loss": 1.4999, + "step": 918 + }, + { + "epoch": 1.2503401360544217, + "grad_norm": 0.1426231115449843, + "learning_rate": 3.3717457876763994e-06, + "loss": 1.7262, + "step": 919 + }, + { + "epoch": 1.251700680272109, + "grad_norm": 0.11420848866165159, + "learning_rate": 3.361118758649116e-06, + "loss": 1.617, + "step": 920 + }, + { + "epoch": 1.251700680272109, + "eval_loss": 1.6881320476531982, + "eval_runtime": 76.6095, + "eval_samples_per_second": 53.166, + "eval_steps_per_second": 6.657, + "step": 920 + }, + { + "epoch": 1.2530612244897958, + "grad_norm": 0.1528479382450589, + "learning_rate": 3.3505000177075604e-06, + "loss": 1.6073, + "step": 921 + }, + { + "epoch": 1.254421768707483, + "grad_norm": 0.13709231144088224, + "learning_rate": 3.3398896185524084e-06, + "loss": 1.6827, + "step": 922 + }, + { + "epoch": 1.2557823129251702, + "grad_norm": 0.11431374619526342, + "learning_rate": 3.329287614842155e-06, + "loss": 1.5719, + "step": 923 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 0.1326188319367622, + "learning_rate": 3.3186940601928352e-06, + "loss": 1.6046, + "step": 924 + }, + { + "epoch": 1.2585034013605443, + "grad_norm": 0.11858562580408218, + "learning_rate": 3.3081090081777577e-06, + "loss": 1.5719, + "step": 925 + }, + { + "epoch": 1.2598639455782312, + "grad_norm": 0.1133691965363108, + "learning_rate": 3.2975325123272304e-06, + "loss": 1.6144, + "step": 926 + }, + { + "epoch": 1.2612244897959184, + "grad_norm": 0.12690673847281403, + "learning_rate": 3.286964626128292e-06, + "loss": 1.6217, + "step": 927 + }, + { + "epoch": 1.2625850340136053, + "grad_norm": 0.1329474778691663, + "learning_rate": 3.2764054030244406e-06, + "loss": 1.7347, + "step": 928 + }, + { + "epoch": 1.2639455782312925, + "grad_norm": 0.12616283586928403, + "learning_rate": 3.2658548964153636e-06, + "loss": 1.7557, + "step": 929 + }, + { + "epoch": 1.2653061224489797, + "grad_norm": 0.1153724209206064, + "learning_rate": 3.25531315965667e-06, + "loss": 1.568, + "step": 930 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.19514499732232182, + "learning_rate": 3.2447802460596124e-06, + "loss": 1.6412, + "step": 931 + }, + { + "epoch": 1.2680272108843538, + "grad_norm": 0.1324775105635469, + "learning_rate": 3.234256208890831e-06, + "loss": 1.4851, + "step": 932 + }, + { + "epoch": 1.269387755102041, + "grad_norm": 0.16422315509218421, + "learning_rate": 3.2237411013720686e-06, + "loss": 1.614, + "step": 933 + }, + { + "epoch": 1.2707482993197279, + "grad_norm": 0.40727066672865614, + "learning_rate": 3.2132349766799166e-06, + "loss": 1.6335, + "step": 934 + }, + { + "epoch": 1.272108843537415, + "grad_norm": 0.11728481208845543, + "learning_rate": 3.202737887945533e-06, + "loss": 1.6401, + "step": 935 + }, + { + "epoch": 1.273469387755102, + "grad_norm": 0.11606746557775659, + "learning_rate": 3.1922498882543808e-06, + "loss": 1.5821, + "step": 936 + }, + { + "epoch": 1.2748299319727892, + "grad_norm": 0.13755764701050108, + "learning_rate": 3.1817710306459638e-06, + "loss": 1.5831, + "step": 937 + }, + { + "epoch": 1.276190476190476, + "grad_norm": 0.11936848733746346, + "learning_rate": 3.171301368113545e-06, + "loss": 1.5987, + "step": 938 + }, + { + "epoch": 1.2775510204081633, + "grad_norm": 0.09685227260059284, + "learning_rate": 3.160840953603892e-06, + "loss": 1.7107, + "step": 939 + }, + { + "epoch": 1.2789115646258504, + "grad_norm": 0.11226497972441736, + "learning_rate": 3.1503898400169986e-06, + "loss": 1.7636, + "step": 940 + }, + { + "epoch": 1.2802721088435374, + "grad_norm": 0.19072705706329435, + "learning_rate": 3.139948080205828e-06, + "loss": 1.5306, + "step": 941 + }, + { + "epoch": 1.2816326530612245, + "grad_norm": 0.11221444877520503, + "learning_rate": 3.1295157269760347e-06, + "loss": 1.5718, + "step": 942 + }, + { + "epoch": 1.2829931972789117, + "grad_norm": 0.25831949084527644, + "learning_rate": 3.1190928330857063e-06, + "loss": 1.8018, + "step": 943 + }, + { + "epoch": 1.2843537414965986, + "grad_norm": 0.12949049150025038, + "learning_rate": 3.108679451245091e-06, + "loss": 1.6306, + "step": 944 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.11975166110577949, + "learning_rate": 3.0982756341163344e-06, + "loss": 1.5288, + "step": 945 + }, + { + "epoch": 1.2870748299319728, + "grad_norm": 0.12063089261265142, + "learning_rate": 3.087881434313212e-06, + "loss": 1.8165, + "step": 946 + }, + { + "epoch": 1.28843537414966, + "grad_norm": 0.10837632174717707, + "learning_rate": 3.0774969044008596e-06, + "loss": 1.7264, + "step": 947 + }, + { + "epoch": 1.2897959183673469, + "grad_norm": 0.13219269814229098, + "learning_rate": 3.067122096895515e-06, + "loss": 1.6243, + "step": 948 + }, + { + "epoch": 1.291156462585034, + "grad_norm": 0.11897597832335097, + "learning_rate": 3.0567570642642453e-06, + "loss": 1.7471, + "step": 949 + }, + { + "epoch": 1.2925170068027212, + "grad_norm": 0.11968690206341515, + "learning_rate": 3.0464018589246867e-06, + "loss": 1.544, + "step": 950 + }, + { + "epoch": 1.2938775510204081, + "grad_norm": 0.12640952696385496, + "learning_rate": 3.0360565332447754e-06, + "loss": 1.6696, + "step": 951 + }, + { + "epoch": 1.2952380952380953, + "grad_norm": 0.1598172355650226, + "learning_rate": 3.0257211395424846e-06, + "loss": 1.7097, + "step": 952 + }, + { + "epoch": 1.2965986394557822, + "grad_norm": 0.13149924526184206, + "learning_rate": 3.015395730085565e-06, + "loss": 1.6884, + "step": 953 + }, + { + "epoch": 1.2979591836734694, + "grad_norm": 0.14095309321731594, + "learning_rate": 3.0050803570912644e-06, + "loss": 1.766, + "step": 954 + }, + { + "epoch": 1.2993197278911564, + "grad_norm": 0.11707205160488976, + "learning_rate": 2.994775072726087e-06, + "loss": 1.6714, + "step": 955 + }, + { + "epoch": 1.3006802721088435, + "grad_norm": 0.11113529704832883, + "learning_rate": 2.984479929105508e-06, + "loss": 1.6734, + "step": 956 + }, + { + "epoch": 1.3020408163265307, + "grad_norm": 0.12550353089395527, + "learning_rate": 2.9741949782937252e-06, + "loss": 1.5402, + "step": 957 + }, + { + "epoch": 1.3034013605442176, + "grad_norm": 0.15756100509185716, + "learning_rate": 2.9639202723033843e-06, + "loss": 1.7012, + "step": 958 + }, + { + "epoch": 1.3047619047619048, + "grad_norm": 0.145899594151416, + "learning_rate": 2.9536558630953262e-06, + "loss": 1.6383, + "step": 959 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 0.12913992593059825, + "learning_rate": 2.943401802578315e-06, + "loss": 1.5585, + "step": 960 + }, + { + "epoch": 1.307482993197279, + "grad_norm": 0.1278808504306049, + "learning_rate": 2.9331581426087847e-06, + "loss": 1.6784, + "step": 961 + }, + { + "epoch": 1.308843537414966, + "grad_norm": 0.15484400934294676, + "learning_rate": 2.9229249349905686e-06, + "loss": 1.5304, + "step": 962 + }, + { + "epoch": 1.310204081632653, + "grad_norm": 0.11410951165140533, + "learning_rate": 2.912702231474639e-06, + "loss": 1.6496, + "step": 963 + }, + { + "epoch": 1.3115646258503402, + "grad_norm": 0.11183117701615923, + "learning_rate": 2.902490083758856e-06, + "loss": 1.6686, + "step": 964 + }, + { + "epoch": 1.3129251700680271, + "grad_norm": 0.10575601089764713, + "learning_rate": 2.8922885434876846e-06, + "loss": 1.6357, + "step": 965 + }, + { + "epoch": 1.3142857142857143, + "grad_norm": 0.12071397882258213, + "learning_rate": 2.882097662251956e-06, + "loss": 1.7448, + "step": 966 + }, + { + "epoch": 1.3156462585034014, + "grad_norm": 0.12145862487643339, + "learning_rate": 2.8719174915885935e-06, + "loss": 1.5548, + "step": 967 + }, + { + "epoch": 1.3170068027210884, + "grad_norm": 0.11127135568946671, + "learning_rate": 2.8617480829803574e-06, + "loss": 1.6849, + "step": 968 + }, + { + "epoch": 1.3183673469387756, + "grad_norm": 0.11147070098606063, + "learning_rate": 2.8515894878555766e-06, + "loss": 1.518, + "step": 969 + }, + { + "epoch": 1.3197278911564627, + "grad_norm": 0.12955537422802776, + "learning_rate": 2.841441757587901e-06, + "loss": 1.6614, + "step": 970 + }, + { + "epoch": 1.3210884353741497, + "grad_norm": 0.10292257541425158, + "learning_rate": 2.831304943496033e-06, + "loss": 1.7421, + "step": 971 + }, + { + "epoch": 1.3224489795918366, + "grad_norm": 0.10623781986886265, + "learning_rate": 2.8211790968434692e-06, + "loss": 1.6891, + "step": 972 + }, + { + "epoch": 1.3238095238095238, + "grad_norm": 0.13299637838745879, + "learning_rate": 2.8110642688382455e-06, + "loss": 1.7151, + "step": 973 + }, + { + "epoch": 1.325170068027211, + "grad_norm": 0.12676062857059178, + "learning_rate": 2.800960510632668e-06, + "loss": 1.7884, + "step": 974 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.10370471610066626, + "learning_rate": 2.790867873323067e-06, + "loss": 1.7972, + "step": 975 + }, + { + "epoch": 1.327891156462585, + "grad_norm": 0.11469407246054221, + "learning_rate": 2.7807864079495308e-06, + "loss": 1.7044, + "step": 976 + }, + { + "epoch": 1.3292517006802722, + "grad_norm": 0.127975064009094, + "learning_rate": 2.7707161654956516e-06, + "loss": 1.588, + "step": 977 + }, + { + "epoch": 1.3306122448979592, + "grad_norm": 0.1431937066191807, + "learning_rate": 2.760657196888259e-06, + "loss": 1.6176, + "step": 978 + }, + { + "epoch": 1.3319727891156463, + "grad_norm": 0.12699844201617452, + "learning_rate": 2.750609552997173e-06, + "loss": 1.6426, + "step": 979 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.12207870723428496, + "learning_rate": 2.740573284634949e-06, + "loss": 1.6522, + "step": 980 + }, + { + "epoch": 1.3346938775510204, + "grad_norm": 0.12255489213356785, + "learning_rate": 2.7305484425566033e-06, + "loss": 1.6833, + "step": 981 + }, + { + "epoch": 1.3360544217687074, + "grad_norm": 0.10634070168511277, + "learning_rate": 2.7205350774593757e-06, + "loss": 1.6049, + "step": 982 + }, + { + "epoch": 1.3374149659863945, + "grad_norm": 0.13904029518050998, + "learning_rate": 2.7105332399824567e-06, + "loss": 1.5175, + "step": 983 + }, + { + "epoch": 1.3387755102040817, + "grad_norm": 0.3143888763552485, + "learning_rate": 2.700542980706754e-06, + "loss": 1.6381, + "step": 984 + }, + { + "epoch": 1.3401360544217686, + "grad_norm": 0.11248808835798864, + "learning_rate": 2.6905643501546066e-06, + "loss": 1.6597, + "step": 985 + }, + { + "epoch": 1.3414965986394558, + "grad_norm": 0.12165354663856978, + "learning_rate": 2.680597398789554e-06, + "loss": 1.6165, + "step": 986 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.13469020590289893, + "learning_rate": 2.670642177016071e-06, + "loss": 1.6989, + "step": 987 + }, + { + "epoch": 1.34421768707483, + "grad_norm": 0.10403924107277954, + "learning_rate": 2.6606987351793134e-06, + "loss": 1.6892, + "step": 988 + }, + { + "epoch": 1.345578231292517, + "grad_norm": 0.12657716162900132, + "learning_rate": 2.6507671235648667e-06, + "loss": 1.8571, + "step": 989 + }, + { + "epoch": 1.346938775510204, + "grad_norm": 0.14141908912174195, + "learning_rate": 2.6408473923984813e-06, + "loss": 1.5353, + "step": 990 + }, + { + "epoch": 1.3482993197278912, + "grad_norm": 0.1474252826828531, + "learning_rate": 2.630939591845834e-06, + "loss": 1.5924, + "step": 991 + }, + { + "epoch": 1.3496598639455781, + "grad_norm": 0.12543913644142501, + "learning_rate": 2.6210437720122633e-06, + "loss": 1.6871, + "step": 992 + }, + { + "epoch": 1.3510204081632653, + "grad_norm": 0.13883630578353837, + "learning_rate": 2.6111599829425228e-06, + "loss": 1.6789, + "step": 993 + }, + { + "epoch": 1.3523809523809525, + "grad_norm": 0.12846573886694015, + "learning_rate": 2.6012882746205164e-06, + "loss": 1.6067, + "step": 994 + }, + { + "epoch": 1.3537414965986394, + "grad_norm": 0.12008961120991984, + "learning_rate": 2.591428696969062e-06, + "loss": 1.5903, + "step": 995 + }, + { + "epoch": 1.3551020408163266, + "grad_norm": 0.11903477733151935, + "learning_rate": 2.581581299849627e-06, + "loss": 1.6369, + "step": 996 + }, + { + "epoch": 1.3564625850340137, + "grad_norm": 0.13999706538475443, + "learning_rate": 2.571746133062082e-06, + "loss": 1.5367, + "step": 997 + }, + { + "epoch": 1.3578231292517007, + "grad_norm": 0.11963009361233483, + "learning_rate": 2.5619232463444453e-06, + "loss": 1.4038, + "step": 998 + }, + { + "epoch": 1.3591836734693876, + "grad_norm": 0.12364356934092328, + "learning_rate": 2.5521126893726287e-06, + "loss": 1.5599, + "step": 999 + }, + { + "epoch": 1.3605442176870748, + "grad_norm": 0.1339584043443562, + "learning_rate": 2.542314511760203e-06, + "loss": 1.5419, + "step": 1000 + }, + { + "epoch": 1.361904761904762, + "grad_norm": 0.11610116574614443, + "learning_rate": 2.5325287630581197e-06, + "loss": 1.7461, + "step": 1001 + }, + { + "epoch": 1.363265306122449, + "grad_norm": 0.11167585561833832, + "learning_rate": 2.5227554927544863e-06, + "loss": 1.7333, + "step": 1002 + }, + { + "epoch": 1.364625850340136, + "grad_norm": 0.1443619458869183, + "learning_rate": 2.512994750274295e-06, + "loss": 1.5944, + "step": 1003 + }, + { + "epoch": 1.3659863945578232, + "grad_norm": 0.248168814176006, + "learning_rate": 2.5032465849791964e-06, + "loss": 1.781, + "step": 1004 + }, + { + "epoch": 1.3673469387755102, + "grad_norm": 0.23421079605400402, + "learning_rate": 2.4935110461672225e-06, + "loss": 1.635, + "step": 1005 + }, + { + "epoch": 1.3687074829931973, + "grad_norm": 0.1446962355000272, + "learning_rate": 2.4837881830725584e-06, + "loss": 1.4852, + "step": 1006 + }, + { + "epoch": 1.3700680272108843, + "grad_norm": 0.1291998509141991, + "learning_rate": 2.4740780448652866e-06, + "loss": 1.6777, + "step": 1007 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 0.09864313043191297, + "learning_rate": 2.4643806806511344e-06, + "loss": 1.6307, + "step": 1008 + }, + { + "epoch": 1.3727891156462584, + "grad_norm": 0.11658909231376899, + "learning_rate": 2.4546961394712324e-06, + "loss": 1.6741, + "step": 1009 + }, + { + "epoch": 1.3741496598639455, + "grad_norm": 0.12095557653741797, + "learning_rate": 2.4450244703018587e-06, + "loss": 1.6758, + "step": 1010 + }, + { + "epoch": 1.3755102040816327, + "grad_norm": 0.13782074238898825, + "learning_rate": 2.4353657220541986e-06, + "loss": 1.4454, + "step": 1011 + }, + { + "epoch": 1.3768707482993197, + "grad_norm": 0.12061869165588096, + "learning_rate": 2.4257199435740947e-06, + "loss": 1.6681, + "step": 1012 + }, + { + "epoch": 1.3782312925170068, + "grad_norm": 0.10755743305983602, + "learning_rate": 2.4160871836417992e-06, + "loss": 1.7763, + "step": 1013 + }, + { + "epoch": 1.379591836734694, + "grad_norm": 0.13833342392035397, + "learning_rate": 2.406467490971724e-06, + "loss": 1.4646, + "step": 1014 + }, + { + "epoch": 1.380952380952381, + "grad_norm": 0.14780468168264022, + "learning_rate": 2.396860914212199e-06, + "loss": 1.5136, + "step": 1015 + }, + { + "epoch": 1.382312925170068, + "grad_norm": 0.11362232721903939, + "learning_rate": 2.387267501945233e-06, + "loss": 1.644, + "step": 1016 + }, + { + "epoch": 1.383673469387755, + "grad_norm": 0.227770381091171, + "learning_rate": 2.3776873026862466e-06, + "loss": 1.7492, + "step": 1017 + }, + { + "epoch": 1.3850340136054422, + "grad_norm": 0.11603935670163865, + "learning_rate": 2.368120364883851e-06, + "loss": 1.5114, + "step": 1018 + }, + { + "epoch": 1.3863945578231291, + "grad_norm": 0.1168850031428384, + "learning_rate": 2.3585667369195815e-06, + "loss": 1.623, + "step": 1019 + }, + { + "epoch": 1.3877551020408163, + "grad_norm": 0.11197564926295547, + "learning_rate": 2.349026467107677e-06, + "loss": 1.8705, + "step": 1020 + }, + { + "epoch": 1.3891156462585035, + "grad_norm": 0.144237418189964, + "learning_rate": 2.3394996036948094e-06, + "loss": 1.6448, + "step": 1021 + }, + { + "epoch": 1.3904761904761904, + "grad_norm": 0.11173691780000068, + "learning_rate": 2.3299861948598585e-06, + "loss": 1.6681, + "step": 1022 + }, + { + "epoch": 1.3918367346938776, + "grad_norm": 0.15293585820058914, + "learning_rate": 2.320486288713662e-06, + "loss": 1.6767, + "step": 1023 + }, + { + "epoch": 1.3931972789115648, + "grad_norm": 0.11604159237097413, + "learning_rate": 2.3109999332987714e-06, + "loss": 1.7475, + "step": 1024 + }, + { + "epoch": 1.3945578231292517, + "grad_norm": 0.11608326681410934, + "learning_rate": 2.301527176589211e-06, + "loss": 1.7182, + "step": 1025 + }, + { + "epoch": 1.3959183673469386, + "grad_norm": 0.11802001766377367, + "learning_rate": 2.2920680664902305e-06, + "loss": 1.6692, + "step": 1026 + }, + { + "epoch": 1.3972789115646258, + "grad_norm": 0.14326115262175532, + "learning_rate": 2.282622650838071e-06, + "loss": 1.5178, + "step": 1027 + }, + { + "epoch": 1.398639455782313, + "grad_norm": 0.12464124111634249, + "learning_rate": 2.273190977399717e-06, + "loss": 1.7335, + "step": 1028 + }, + { + "epoch": 1.4, + "grad_norm": 0.12412136945656944, + "learning_rate": 2.2637730938726578e-06, + "loss": 1.6765, + "step": 1029 + }, + { + "epoch": 1.401360544217687, + "grad_norm": 0.11072133022020544, + "learning_rate": 2.254369047884639e-06, + "loss": 1.6853, + "step": 1030 + }, + { + "epoch": 1.4027210884353742, + "grad_norm": 0.11744625816603456, + "learning_rate": 2.2449788869934332e-06, + "loss": 1.7241, + "step": 1031 + }, + { + "epoch": 1.4040816326530612, + "grad_norm": 0.11305715625400635, + "learning_rate": 2.2356026586865976e-06, + "loss": 1.828, + "step": 1032 + }, + { + "epoch": 1.4054421768707483, + "grad_norm": 0.11411548926790505, + "learning_rate": 2.2262404103812197e-06, + "loss": 1.7667, + "step": 1033 + }, + { + "epoch": 1.4068027210884353, + "grad_norm": 0.13159909159310862, + "learning_rate": 2.2168921894236963e-06, + "loss": 1.772, + "step": 1034 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 0.10869740493369748, + "learning_rate": 2.207558043089478e-06, + "loss": 1.6783, + "step": 1035 + }, + { + "epoch": 1.4095238095238094, + "grad_norm": 0.18634482438552744, + "learning_rate": 2.1982380185828485e-06, + "loss": 1.6199, + "step": 1036 + }, + { + "epoch": 1.4108843537414966, + "grad_norm": 0.12490228313384627, + "learning_rate": 2.1889321630366645e-06, + "loss": 1.6002, + "step": 1037 + }, + { + "epoch": 1.4122448979591837, + "grad_norm": 0.13407480013102485, + "learning_rate": 2.179640523512136e-06, + "loss": 1.4965, + "step": 1038 + }, + { + "epoch": 1.4136054421768707, + "grad_norm": 0.12540761630148858, + "learning_rate": 2.1703631469985713e-06, + "loss": 1.5602, + "step": 1039 + }, + { + "epoch": 1.4149659863945578, + "grad_norm": 0.11826915965312917, + "learning_rate": 2.1611000804131588e-06, + "loss": 1.6845, + "step": 1040 + }, + { + "epoch": 1.416326530612245, + "grad_norm": 0.11638811328551839, + "learning_rate": 2.1518513706007154e-06, + "loss": 1.7135, + "step": 1041 + }, + { + "epoch": 1.417687074829932, + "grad_norm": 0.12121992820341544, + "learning_rate": 2.1426170643334477e-06, + "loss": 1.6918, + "step": 1042 + }, + { + "epoch": 1.4190476190476191, + "grad_norm": 0.127097445433669, + "learning_rate": 2.133397208310728e-06, + "loss": 1.5874, + "step": 1043 + }, + { + "epoch": 1.420408163265306, + "grad_norm": 0.1120249401560636, + "learning_rate": 2.1241918491588503e-06, + "loss": 1.6122, + "step": 1044 + }, + { + "epoch": 1.4217687074829932, + "grad_norm": 0.13980832815015817, + "learning_rate": 2.1150010334307954e-06, + "loss": 1.7686, + "step": 1045 + }, + { + "epoch": 1.4231292517006802, + "grad_norm": 0.16218374995611207, + "learning_rate": 2.1058248076059916e-06, + "loss": 1.7442, + "step": 1046 + }, + { + "epoch": 1.4244897959183673, + "grad_norm": 0.11333274239691936, + "learning_rate": 2.09666321809009e-06, + "loss": 1.7052, + "step": 1047 + }, + { + "epoch": 1.4258503401360545, + "grad_norm": 0.11296676049286146, + "learning_rate": 2.087516311214719e-06, + "loss": 1.6709, + "step": 1048 + }, + { + "epoch": 1.4272108843537414, + "grad_norm": 0.16489322883195276, + "learning_rate": 2.0783841332372567e-06, + "loss": 1.6074, + "step": 1049 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.138226915964094, + "learning_rate": 2.0692667303405962e-06, + "loss": 1.6082, + "step": 1050 + }, + { + "epoch": 1.4299319727891158, + "grad_norm": 0.15926811847519332, + "learning_rate": 2.060164148632903e-06, + "loss": 1.5277, + "step": 1051 + }, + { + "epoch": 1.4312925170068027, + "grad_norm": 0.16960130944371998, + "learning_rate": 2.0510764341474032e-06, + "loss": 1.6701, + "step": 1052 + }, + { + "epoch": 1.4326530612244899, + "grad_norm": 0.11553461200820175, + "learning_rate": 2.042003632842123e-06, + "loss": 1.604, + "step": 1053 + }, + { + "epoch": 1.4340136054421768, + "grad_norm": 0.12461036286243339, + "learning_rate": 2.0329457905996785e-06, + "loss": 1.7384, + "step": 1054 + }, + { + "epoch": 1.435374149659864, + "grad_norm": 0.12232104998061016, + "learning_rate": 2.023902953227029e-06, + "loss": 1.639, + "step": 1055 + }, + { + "epoch": 1.436734693877551, + "grad_norm": 0.11267495872797326, + "learning_rate": 2.014875166455264e-06, + "loss": 1.7093, + "step": 1056 + }, + { + "epoch": 1.438095238095238, + "grad_norm": 0.11804588705775097, + "learning_rate": 2.005862475939344e-06, + "loss": 1.7411, + "step": 1057 + }, + { + "epoch": 1.4394557823129253, + "grad_norm": 0.10870950926259154, + "learning_rate": 1.9968649272578955e-06, + "loss": 1.8004, + "step": 1058 + }, + { + "epoch": 1.4408163265306122, + "grad_norm": 0.15480060013281374, + "learning_rate": 1.987882565912968e-06, + "loss": 1.5773, + "step": 1059 + }, + { + "epoch": 1.4421768707482994, + "grad_norm": 0.13989692137884865, + "learning_rate": 1.978915437329806e-06, + "loss": 1.6361, + "step": 1060 + }, + { + "epoch": 1.4435374149659865, + "grad_norm": 0.10649783390764571, + "learning_rate": 1.96996358685662e-06, + "loss": 1.6262, + "step": 1061 + }, + { + "epoch": 1.4448979591836735, + "grad_norm": 0.14326122461794985, + "learning_rate": 1.961027059764354e-06, + "loss": 1.5732, + "step": 1062 + }, + { + "epoch": 1.4462585034013604, + "grad_norm": 0.12408176967614352, + "learning_rate": 1.952105901246461e-06, + "loss": 1.5585, + "step": 1063 + }, + { + "epoch": 1.4476190476190476, + "grad_norm": 0.16522994922872408, + "learning_rate": 1.943200156418674e-06, + "loss": 1.6795, + "step": 1064 + }, + { + "epoch": 1.4489795918367347, + "grad_norm": 0.3681795789769912, + "learning_rate": 1.9343098703187777e-06, + "loss": 1.5695, + "step": 1065 + }, + { + "epoch": 1.4503401360544217, + "grad_norm": 0.13093870638144472, + "learning_rate": 1.925435087906373e-06, + "loss": 1.4991, + "step": 1066 + }, + { + "epoch": 1.4517006802721089, + "grad_norm": 0.11132092495609136, + "learning_rate": 1.9165758540626595e-06, + "loss": 1.7495, + "step": 1067 + }, + { + "epoch": 1.453061224489796, + "grad_norm": 0.1320841952456979, + "learning_rate": 1.907732213590212e-06, + "loss": 1.6193, + "step": 1068 + }, + { + "epoch": 1.454421768707483, + "grad_norm": 0.13072820770934512, + "learning_rate": 1.898904211212736e-06, + "loss": 1.6072, + "step": 1069 + }, + { + "epoch": 1.4557823129251701, + "grad_norm": 0.1309250057364391, + "learning_rate": 1.8900918915748612e-06, + "loss": 1.6395, + "step": 1070 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 0.12551858156974602, + "learning_rate": 1.881295299241898e-06, + "loss": 1.6186, + "step": 1071 + }, + { + "epoch": 1.4585034013605442, + "grad_norm": 0.13244662699660312, + "learning_rate": 1.8725144786996347e-06, + "loss": 1.7768, + "step": 1072 + }, + { + "epoch": 1.4598639455782312, + "grad_norm": 0.12869848294692346, + "learning_rate": 1.8637494743540851e-06, + "loss": 1.8306, + "step": 1073 + }, + { + "epoch": 1.4612244897959183, + "grad_norm": 0.10453495592437635, + "learning_rate": 1.855000330531289e-06, + "loss": 1.7562, + "step": 1074 + }, + { + "epoch": 1.4625850340136055, + "grad_norm": 0.11842884214674637, + "learning_rate": 1.846267091477067e-06, + "loss": 1.6221, + "step": 1075 + }, + { + "epoch": 1.4639455782312925, + "grad_norm": 0.1366451554577879, + "learning_rate": 1.8375498013568166e-06, + "loss": 1.4415, + "step": 1076 + }, + { + "epoch": 1.4653061224489796, + "grad_norm": 0.11721529234776305, + "learning_rate": 1.8288485042552739e-06, + "loss": 1.6414, + "step": 1077 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.1296793815948946, + "learning_rate": 1.820163244176294e-06, + "loss": 1.5815, + "step": 1078 + }, + { + "epoch": 1.4680272108843537, + "grad_norm": 0.10030127712618647, + "learning_rate": 1.8114940650426332e-06, + "loss": 1.7649, + "step": 1079 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 0.14298941316959418, + "learning_rate": 1.8028410106957239e-06, + "loss": 1.6847, + "step": 1080 + }, + { + "epoch": 1.4707482993197278, + "grad_norm": 0.140985161297301, + "learning_rate": 1.7942041248954534e-06, + "loss": 1.6794, + "step": 1081 + }, + { + "epoch": 1.472108843537415, + "grad_norm": 0.11886070737447318, + "learning_rate": 1.7855834513199365e-06, + "loss": 1.6796, + "step": 1082 + }, + { + "epoch": 1.473469387755102, + "grad_norm": 0.12803178657461758, + "learning_rate": 1.7769790335653076e-06, + "loss": 1.543, + "step": 1083 + }, + { + "epoch": 1.474829931972789, + "grad_norm": 0.10256161260658421, + "learning_rate": 1.7683909151454888e-06, + "loss": 1.7032, + "step": 1084 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.21096092474063058, + "learning_rate": 1.7598191394919738e-06, + "loss": 1.5423, + "step": 1085 + }, + { + "epoch": 1.4775510204081632, + "grad_norm": 0.12083343658033356, + "learning_rate": 1.7512637499536123e-06, + "loss": 1.6169, + "step": 1086 + }, + { + "epoch": 1.4789115646258504, + "grad_norm": 0.11608713517334812, + "learning_rate": 1.7427247897963784e-06, + "loss": 1.7293, + "step": 1087 + }, + { + "epoch": 1.4802721088435375, + "grad_norm": 0.1319069173143083, + "learning_rate": 1.7342023022031712e-06, + "loss": 1.455, + "step": 1088 + }, + { + "epoch": 1.4816326530612245, + "grad_norm": 0.09994450469778726, + "learning_rate": 1.7256963302735752e-06, + "loss": 1.8571, + "step": 1089 + }, + { + "epoch": 1.4829931972789114, + "grad_norm": 0.11652396846241518, + "learning_rate": 1.71720691702366e-06, + "loss": 1.7252, + "step": 1090 + }, + { + "epoch": 1.4843537414965986, + "grad_norm": 0.14171795087480568, + "learning_rate": 1.7087341053857475e-06, + "loss": 1.5228, + "step": 1091 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 0.11989790454950104, + "learning_rate": 1.700277938208214e-06, + "loss": 1.577, + "step": 1092 + }, + { + "epoch": 1.4870748299319727, + "grad_norm": 0.10858297624497283, + "learning_rate": 1.6918384582552504e-06, + "loss": 1.7052, + "step": 1093 + }, + { + "epoch": 1.4884353741496599, + "grad_norm": 0.10917059280407902, + "learning_rate": 1.6834157082066638e-06, + "loss": 1.7003, + "step": 1094 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 0.0982808829346907, + "learning_rate": 1.6750097306576552e-06, + "loss": 1.6922, + "step": 1095 + }, + { + "epoch": 1.491156462585034, + "grad_norm": 0.13697494235902768, + "learning_rate": 1.6666205681186032e-06, + "loss": 1.5286, + "step": 1096 + }, + { + "epoch": 1.4925170068027211, + "grad_norm": 0.12713384229112287, + "learning_rate": 1.6582482630148516e-06, + "loss": 1.5449, + "step": 1097 + }, + { + "epoch": 1.493877551020408, + "grad_norm": 0.28082296494568143, + "learning_rate": 1.6498928576864892e-06, + "loss": 1.6008, + "step": 1098 + }, + { + "epoch": 1.4952380952380953, + "grad_norm": 0.12441418528009313, + "learning_rate": 1.6415543943881457e-06, + "loss": 1.6326, + "step": 1099 + }, + { + "epoch": 1.4965986394557822, + "grad_norm": 0.12076209117459938, + "learning_rate": 1.6332329152887683e-06, + "loss": 1.6048, + "step": 1100 + }, + { + "epoch": 1.4979591836734694, + "grad_norm": 0.10773083863289847, + "learning_rate": 1.624928462471414e-06, + "loss": 1.7003, + "step": 1101 + }, + { + "epoch": 1.4993197278911565, + "grad_norm": 0.12873545682401677, + "learning_rate": 1.6166410779330372e-06, + "loss": 1.7761, + "step": 1102 + }, + { + "epoch": 1.5006802721088435, + "grad_norm": 0.12486712488322993, + "learning_rate": 1.6083708035842683e-06, + "loss": 1.6096, + "step": 1103 + }, + { + "epoch": 1.5020408163265306, + "grad_norm": 0.11819276519267044, + "learning_rate": 1.6001176812492164e-06, + "loss": 1.644, + "step": 1104 + }, + { + "epoch": 1.5020408163265306, + "eval_loss": 1.686686635017395, + "eval_runtime": 76.5083, + "eval_samples_per_second": 53.236, + "eval_steps_per_second": 6.666, + "step": 1104 + }, + { + "epoch": 1.5034013605442178, + "grad_norm": 0.12565443858963696, + "learning_rate": 1.5918817526652457e-06, + "loss": 1.6855, + "step": 1105 + }, + { + "epoch": 1.5047619047619047, + "grad_norm": 0.1235458965699543, + "learning_rate": 1.583663059482773e-06, + "loss": 1.5379, + "step": 1106 + }, + { + "epoch": 1.5061224489795917, + "grad_norm": 0.11893034993481316, + "learning_rate": 1.5754616432650443e-06, + "loss": 1.6431, + "step": 1107 + }, + { + "epoch": 1.507482993197279, + "grad_norm": 0.11339427750946436, + "learning_rate": 1.5672775454879458e-06, + "loss": 1.7272, + "step": 1108 + }, + { + "epoch": 1.508843537414966, + "grad_norm": 0.1845008212172551, + "learning_rate": 1.5591108075397698e-06, + "loss": 1.6404, + "step": 1109 + }, + { + "epoch": 1.510204081632653, + "grad_norm": 0.12462997782891976, + "learning_rate": 1.5509614707210235e-06, + "loss": 1.7679, + "step": 1110 + }, + { + "epoch": 1.5115646258503401, + "grad_norm": 0.12287064399894775, + "learning_rate": 1.542829576244213e-06, + "loss": 1.7342, + "step": 1111 + }, + { + "epoch": 1.5129251700680273, + "grad_norm": 0.1480464966747393, + "learning_rate": 1.5347151652336351e-06, + "loss": 1.552, + "step": 1112 + }, + { + "epoch": 1.5142857142857142, + "grad_norm": 0.11081261407520034, + "learning_rate": 1.5266182787251705e-06, + "loss": 1.8438, + "step": 1113 + }, + { + "epoch": 1.5156462585034014, + "grad_norm": 0.1029266924311367, + "learning_rate": 1.5185389576660725e-06, + "loss": 1.7652, + "step": 1114 + }, + { + "epoch": 1.5170068027210886, + "grad_norm": 0.12456172646180011, + "learning_rate": 1.5104772429147674e-06, + "loss": 1.5471, + "step": 1115 + }, + { + "epoch": 1.5183673469387755, + "grad_norm": 0.1371815920370596, + "learning_rate": 1.5024331752406418e-06, + "loss": 1.7099, + "step": 1116 + }, + { + "epoch": 1.5197278911564625, + "grad_norm": 0.12573407133507336, + "learning_rate": 1.4944067953238396e-06, + "loss": 1.4374, + "step": 1117 + }, + { + "epoch": 1.5210884353741496, + "grad_norm": 0.13604275704368526, + "learning_rate": 1.48639814375505e-06, + "loss": 1.5731, + "step": 1118 + }, + { + "epoch": 1.5224489795918368, + "grad_norm": 0.25410293091386754, + "learning_rate": 1.4784072610353129e-06, + "loss": 1.7197, + "step": 1119 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.114015489104318, + "learning_rate": 1.4704341875758055e-06, + "loss": 1.766, + "step": 1120 + }, + { + "epoch": 1.525170068027211, + "grad_norm": 0.12807572724335445, + "learning_rate": 1.4624789636976411e-06, + "loss": 1.4984, + "step": 1121 + }, + { + "epoch": 1.526530612244898, + "grad_norm": 0.12697960510593037, + "learning_rate": 1.4545416296316667e-06, + "loss": 1.4318, + "step": 1122 + }, + { + "epoch": 1.527891156462585, + "grad_norm": 0.2166265032856606, + "learning_rate": 1.446622225518251e-06, + "loss": 1.5213, + "step": 1123 + }, + { + "epoch": 1.5292517006802722, + "grad_norm": 0.12831405795176154, + "learning_rate": 1.4387207914070995e-06, + "loss": 1.8113, + "step": 1124 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 0.11776616874418332, + "learning_rate": 1.4308373672570286e-06, + "loss": 1.6389, + "step": 1125 + }, + { + "epoch": 1.5319727891156463, + "grad_norm": 0.11476693865274581, + "learning_rate": 1.4229719929357838e-06, + "loss": 1.6624, + "step": 1126 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.11217140275465112, + "learning_rate": 1.4151247082198223e-06, + "loss": 1.4694, + "step": 1127 + }, + { + "epoch": 1.5346938775510204, + "grad_norm": 0.12808950619446724, + "learning_rate": 1.4072955527941283e-06, + "loss": 1.5511, + "step": 1128 + }, + { + "epoch": 1.5360544217687075, + "grad_norm": 0.10834567787069439, + "learning_rate": 1.3994845662519985e-06, + "loss": 1.7348, + "step": 1129 + }, + { + "epoch": 1.5374149659863945, + "grad_norm": 0.12572410397799513, + "learning_rate": 1.3916917880948434e-06, + "loss": 1.6245, + "step": 1130 + }, + { + "epoch": 1.5387755102040817, + "grad_norm": 0.13586524305503017, + "learning_rate": 1.3839172577319953e-06, + "loss": 1.6267, + "step": 1131 + }, + { + "epoch": 1.5401360544217688, + "grad_norm": 0.1286100284581808, + "learning_rate": 1.3761610144805037e-06, + "loss": 1.6301, + "step": 1132 + }, + { + "epoch": 1.5414965986394558, + "grad_norm": 0.1380668839742671, + "learning_rate": 1.3684230975649387e-06, + "loss": 1.5868, + "step": 1133 + }, + { + "epoch": 1.5428571428571427, + "grad_norm": 0.12199901535322918, + "learning_rate": 1.3607035461171858e-06, + "loss": 1.4813, + "step": 1134 + }, + { + "epoch": 1.54421768707483, + "grad_norm": 0.14412311583495385, + "learning_rate": 1.3530023991762582e-06, + "loss": 1.6484, + "step": 1135 + }, + { + "epoch": 1.545578231292517, + "grad_norm": 0.13408653590603778, + "learning_rate": 1.3453196956880932e-06, + "loss": 1.4982, + "step": 1136 + }, + { + "epoch": 1.546938775510204, + "grad_norm": 0.10781564217676724, + "learning_rate": 1.3376554745053566e-06, + "loss": 1.6386, + "step": 1137 + }, + { + "epoch": 1.5482993197278911, + "grad_norm": 0.12672977099050545, + "learning_rate": 1.3300097743872476e-06, + "loss": 1.6502, + "step": 1138 + }, + { + "epoch": 1.5496598639455783, + "grad_norm": 0.2591283699743902, + "learning_rate": 1.3223826339992973e-06, + "loss": 1.7084, + "step": 1139 + }, + { + "epoch": 1.5510204081632653, + "grad_norm": 0.11440960704306805, + "learning_rate": 1.3147740919131814e-06, + "loss": 1.7807, + "step": 1140 + }, + { + "epoch": 1.5523809523809524, + "grad_norm": 0.11225169501875551, + "learning_rate": 1.3071841866065194e-06, + "loss": 1.7457, + "step": 1141 + }, + { + "epoch": 1.5537414965986396, + "grad_norm": 0.11995542951043948, + "learning_rate": 1.299612956462683e-06, + "loss": 1.6081, + "step": 1142 + }, + { + "epoch": 1.5551020408163265, + "grad_norm": 0.12456556248904507, + "learning_rate": 1.2920604397705955e-06, + "loss": 1.5734, + "step": 1143 + }, + { + "epoch": 1.5564625850340135, + "grad_norm": 0.12126055030258995, + "learning_rate": 1.2845266747245528e-06, + "loss": 1.7169, + "step": 1144 + }, + { + "epoch": 1.5578231292517006, + "grad_norm": 0.11877363967721631, + "learning_rate": 1.27701169942401e-06, + "loss": 1.8646, + "step": 1145 + }, + { + "epoch": 1.5591836734693878, + "grad_norm": 0.1274047299203738, + "learning_rate": 1.269515551873407e-06, + "loss": 1.5867, + "step": 1146 + }, + { + "epoch": 1.5605442176870747, + "grad_norm": 0.24308037811468208, + "learning_rate": 1.262038269981966e-06, + "loss": 1.7842, + "step": 1147 + }, + { + "epoch": 1.561904761904762, + "grad_norm": 0.101172361144347, + "learning_rate": 1.254579891563502e-06, + "loss": 1.7421, + "step": 1148 + }, + { + "epoch": 1.563265306122449, + "grad_norm": 0.11594265987998417, + "learning_rate": 1.2471404543362353e-06, + "loss": 1.6775, + "step": 1149 + }, + { + "epoch": 1.564625850340136, + "grad_norm": 0.13220275668191248, + "learning_rate": 1.2397199959225914e-06, + "loss": 1.8161, + "step": 1150 + }, + { + "epoch": 1.5659863945578232, + "grad_norm": 0.18599776723683806, + "learning_rate": 1.232318553849023e-06, + "loss": 1.6873, + "step": 1151 + }, + { + "epoch": 1.5673469387755103, + "grad_norm": 0.12379825057112667, + "learning_rate": 1.2249361655458104e-06, + "loss": 1.5921, + "step": 1152 + }, + { + "epoch": 1.5687074829931973, + "grad_norm": 0.1388929946755345, + "learning_rate": 1.2175728683468802e-06, + "loss": 1.601, + "step": 1153 + }, + { + "epoch": 1.5700680272108842, + "grad_norm": 0.11853041576236652, + "learning_rate": 1.210228699489605e-06, + "loss": 1.6044, + "step": 1154 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.10770432816956119, + "learning_rate": 1.2029036961146284e-06, + "loss": 1.6532, + "step": 1155 + }, + { + "epoch": 1.5727891156462586, + "grad_norm": 0.11627754549370563, + "learning_rate": 1.19559789526567e-06, + "loss": 1.573, + "step": 1156 + }, + { + "epoch": 1.5741496598639455, + "grad_norm": 0.12390094032474464, + "learning_rate": 1.1883113338893376e-06, + "loss": 1.6402, + "step": 1157 + }, + { + "epoch": 1.5755102040816327, + "grad_norm": 0.13389994045491743, + "learning_rate": 1.181044048834944e-06, + "loss": 1.7001, + "step": 1158 + }, + { + "epoch": 1.5768707482993198, + "grad_norm": 0.13235829797643509, + "learning_rate": 1.173796076854315e-06, + "loss": 1.5359, + "step": 1159 + }, + { + "epoch": 1.5782312925170068, + "grad_norm": 0.14770782101504495, + "learning_rate": 1.1665674546016103e-06, + "loss": 1.618, + "step": 1160 + }, + { + "epoch": 1.5795918367346937, + "grad_norm": 0.12522035555164912, + "learning_rate": 1.1593582186331342e-06, + "loss": 1.7929, + "step": 1161 + }, + { + "epoch": 1.580952380952381, + "grad_norm": 0.13198188480502632, + "learning_rate": 1.1521684054071524e-06, + "loss": 1.6348, + "step": 1162 + }, + { + "epoch": 1.582312925170068, + "grad_norm": 0.12666436947708926, + "learning_rate": 1.144998051283701e-06, + "loss": 1.5635, + "step": 1163 + }, + { + "epoch": 1.583673469387755, + "grad_norm": 0.13525407680642348, + "learning_rate": 1.1378471925244172e-06, + "loss": 1.8656, + "step": 1164 + }, + { + "epoch": 1.5850340136054422, + "grad_norm": 0.15049864837576243, + "learning_rate": 1.1307158652923427e-06, + "loss": 1.4551, + "step": 1165 + }, + { + "epoch": 1.5863945578231293, + "grad_norm": 0.1353796405332583, + "learning_rate": 1.1236041056517416e-06, + "loss": 1.7247, + "step": 1166 + }, + { + "epoch": 1.5877551020408163, + "grad_norm": 0.13147241896533723, + "learning_rate": 1.1165119495679272e-06, + "loss": 1.5845, + "step": 1167 + }, + { + "epoch": 1.5891156462585034, + "grad_norm": 0.11125489339677437, + "learning_rate": 1.1094394329070713e-06, + "loss": 1.6689, + "step": 1168 + }, + { + "epoch": 1.5904761904761906, + "grad_norm": 0.14732043400502948, + "learning_rate": 1.1023865914360288e-06, + "loss": 1.4794, + "step": 1169 + }, + { + "epoch": 1.5918367346938775, + "grad_norm": 0.1455095242199474, + "learning_rate": 1.095353460822149e-06, + "loss": 1.6856, + "step": 1170 + }, + { + "epoch": 1.5931972789115645, + "grad_norm": 0.14264573128464317, + "learning_rate": 1.0883400766331047e-06, + "loss": 1.7051, + "step": 1171 + }, + { + "epoch": 1.5945578231292517, + "grad_norm": 0.2311325753008129, + "learning_rate": 1.081346474336707e-06, + "loss": 1.6173, + "step": 1172 + }, + { + "epoch": 1.5959183673469388, + "grad_norm": 0.14054024648190236, + "learning_rate": 1.0743726893007257e-06, + "loss": 1.7158, + "step": 1173 + }, + { + "epoch": 1.5972789115646258, + "grad_norm": 0.16087280596160825, + "learning_rate": 1.067418756792713e-06, + "loss": 1.5578, + "step": 1174 + }, + { + "epoch": 1.598639455782313, + "grad_norm": 0.1203122510282115, + "learning_rate": 1.0604847119798212e-06, + "loss": 1.668, + "step": 1175 + }, + { + "epoch": 1.6, + "grad_norm": 0.11094545607591003, + "learning_rate": 1.0535705899286292e-06, + "loss": 1.6685, + "step": 1176 + }, + { + "epoch": 1.601360544217687, + "grad_norm": 0.14762381520202184, + "learning_rate": 1.0466764256049632e-06, + "loss": 1.536, + "step": 1177 + }, + { + "epoch": 1.6027210884353742, + "grad_norm": 0.11571415514742311, + "learning_rate": 1.039802253873722e-06, + "loss": 1.7084, + "step": 1178 + }, + { + "epoch": 1.6040816326530614, + "grad_norm": 0.13034797124348868, + "learning_rate": 1.0329481094986921e-06, + "loss": 1.7737, + "step": 1179 + }, + { + "epoch": 1.6054421768707483, + "grad_norm": 0.1202706060362133, + "learning_rate": 1.0261140271423858e-06, + "loss": 1.6395, + "step": 1180 + }, + { + "epoch": 1.6068027210884352, + "grad_norm": 0.36927305650043224, + "learning_rate": 1.0193000413658538e-06, + "loss": 1.5686, + "step": 1181 + }, + { + "epoch": 1.6081632653061224, + "grad_norm": 0.11834428054470161, + "learning_rate": 1.0125061866285186e-06, + "loss": 1.5722, + "step": 1182 + }, + { + "epoch": 1.6095238095238096, + "grad_norm": 0.11481074155938689, + "learning_rate": 1.0057324972879933e-06, + "loss": 1.6238, + "step": 1183 + }, + { + "epoch": 1.6108843537414965, + "grad_norm": 0.22756149512694648, + "learning_rate": 9.989790075999145e-07, + "loss": 1.5934, + "step": 1184 + }, + { + "epoch": 1.6122448979591837, + "grad_norm": 0.11551066672925502, + "learning_rate": 9.922457517177648e-07, + "loss": 1.6947, + "step": 1185 + }, + { + "epoch": 1.6136054421768709, + "grad_norm": 0.17228114247643034, + "learning_rate": 9.855327636926992e-07, + "loss": 1.6467, + "step": 1186 + }, + { + "epoch": 1.6149659863945578, + "grad_norm": 0.13261272278043656, + "learning_rate": 9.78840077473377e-07, + "loss": 1.6139, + "step": 1187 + }, + { + "epoch": 1.6163265306122447, + "grad_norm": 0.14256320918658308, + "learning_rate": 9.721677269057882e-07, + "loss": 1.8835, + "step": 1188 + }, + { + "epoch": 1.6176870748299321, + "grad_norm": 0.13515325397992245, + "learning_rate": 9.655157457330816e-07, + "loss": 1.6236, + "step": 1189 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 0.1213632301061294, + "learning_rate": 9.588841675953952e-07, + "loss": 1.6718, + "step": 1190 + }, + { + "epoch": 1.620408163265306, + "grad_norm": 0.12023247373916009, + "learning_rate": 9.522730260296836e-07, + "loss": 1.7103, + "step": 1191 + }, + { + "epoch": 1.6217687074829932, + "grad_norm": 0.12033848582541032, + "learning_rate": 9.45682354469552e-07, + "loss": 1.6756, + "step": 1192 + }, + { + "epoch": 1.6231292517006803, + "grad_norm": 0.15874394674606557, + "learning_rate": 9.391121862450858e-07, + "loss": 1.5691, + "step": 1193 + }, + { + "epoch": 1.6244897959183673, + "grad_norm": 0.16013714918796576, + "learning_rate": 9.325625545826822e-07, + "loss": 1.8024, + "step": 1194 + }, + { + "epoch": 1.6258503401360545, + "grad_norm": 0.27054822930134187, + "learning_rate": 9.260334926048787e-07, + "loss": 1.7083, + "step": 1195 + }, + { + "epoch": 1.6272108843537416, + "grad_norm": 0.11952723384185746, + "learning_rate": 9.195250333301913e-07, + "loss": 1.6187, + "step": 1196 + }, + { + "epoch": 1.6285714285714286, + "grad_norm": 0.1313632997033654, + "learning_rate": 9.130372096729446e-07, + "loss": 1.4098, + "step": 1197 + }, + { + "epoch": 1.6299319727891155, + "grad_norm": 0.12834344479681326, + "learning_rate": 9.06570054443105e-07, + "loss": 1.661, + "step": 1198 + }, + { + "epoch": 1.6312925170068027, + "grad_norm": 0.1090830592683463, + "learning_rate": 9.00123600346115e-07, + "loss": 1.6595, + "step": 1199 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.16043162392680574, + "learning_rate": 8.936978799827295e-07, + "loss": 1.7272, + "step": 1200 + }, + { + "epoch": 1.6340136054421768, + "grad_norm": 0.11867938317150763, + "learning_rate": 8.872929258488489e-07, + "loss": 1.827, + "step": 1201 + }, + { + "epoch": 1.635374149659864, + "grad_norm": 0.12762812337248913, + "learning_rate": 8.809087703353536e-07, + "loss": 1.6494, + "step": 1202 + }, + { + "epoch": 1.636734693877551, + "grad_norm": 0.1292715431110236, + "learning_rate": 8.74545445727944e-07, + "loss": 1.7392, + "step": 1203 + }, + { + "epoch": 1.638095238095238, + "grad_norm": 0.14455856202047512, + "learning_rate": 8.682029842069761e-07, + "loss": 1.5346, + "step": 1204 + }, + { + "epoch": 1.6394557823129252, + "grad_norm": 0.1074436319272245, + "learning_rate": 8.618814178472973e-07, + "loss": 1.7282, + "step": 1205 + }, + { + "epoch": 1.6408163265306124, + "grad_norm": 0.12115387945531858, + "learning_rate": 8.555807786180814e-07, + "loss": 1.7237, + "step": 1206 + }, + { + "epoch": 1.6421768707482993, + "grad_norm": 0.12805781303993397, + "learning_rate": 8.493010983826749e-07, + "loss": 1.634, + "step": 1207 + }, + { + "epoch": 1.6435374149659863, + "grad_norm": 0.12535314476626302, + "learning_rate": 8.430424088984295e-07, + "loss": 1.5882, + "step": 1208 + }, + { + "epoch": 1.6448979591836734, + "grad_norm": 0.12698774386082093, + "learning_rate": 8.368047418165437e-07, + "loss": 1.6526, + "step": 1209 + }, + { + "epoch": 1.6462585034013606, + "grad_norm": 0.13808871614579815, + "learning_rate": 8.305881286819029e-07, + "loss": 1.5524, + "step": 1210 + }, + { + "epoch": 1.6476190476190475, + "grad_norm": 0.11270379177937306, + "learning_rate": 8.243926009329156e-07, + "loss": 1.6414, + "step": 1211 + }, + { + "epoch": 1.6489795918367347, + "grad_norm": 0.1108976444955171, + "learning_rate": 8.18218189901363e-07, + "loss": 1.6002, + "step": 1212 + }, + { + "epoch": 1.6503401360544219, + "grad_norm": 0.12054751150197075, + "learning_rate": 8.120649268122333e-07, + "loss": 1.5422, + "step": 1213 + }, + { + "epoch": 1.6517006802721088, + "grad_norm": 0.14215811110011028, + "learning_rate": 8.059328427835672e-07, + "loss": 1.6454, + "step": 1214 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 0.11698885610808762, + "learning_rate": 7.998219688262971e-07, + "loss": 1.5852, + "step": 1215 + }, + { + "epoch": 1.6544217687074831, + "grad_norm": 0.1344113229524319, + "learning_rate": 7.937323358440935e-07, + "loss": 1.7622, + "step": 1216 + }, + { + "epoch": 1.65578231292517, + "grad_norm": 0.13434164059919554, + "learning_rate": 7.876639746332132e-07, + "loss": 1.7963, + "step": 1217 + }, + { + "epoch": 1.657142857142857, + "grad_norm": 0.17809396042944695, + "learning_rate": 7.816169158823295e-07, + "loss": 1.6858, + "step": 1218 + }, + { + "epoch": 1.6585034013605442, + "grad_norm": 0.13306970694049092, + "learning_rate": 7.755911901723917e-07, + "loss": 1.7431, + "step": 1219 + }, + { + "epoch": 1.6598639455782314, + "grad_norm": 0.11805560858596509, + "learning_rate": 7.695868279764623e-07, + "loss": 1.7348, + "step": 1220 + }, + { + "epoch": 1.6612244897959183, + "grad_norm": 0.138038505695151, + "learning_rate": 7.636038596595669e-07, + "loss": 1.6465, + "step": 1221 + }, + { + "epoch": 1.6625850340136055, + "grad_norm": 0.10405731691151054, + "learning_rate": 7.576423154785345e-07, + "loss": 1.6682, + "step": 1222 + }, + { + "epoch": 1.6639455782312926, + "grad_norm": 0.11496805932975722, + "learning_rate": 7.51702225581854e-07, + "loss": 1.5804, + "step": 1223 + }, + { + "epoch": 1.6653061224489796, + "grad_norm": 0.12087556838995649, + "learning_rate": 7.457836200095137e-07, + "loss": 1.6466, + "step": 1224 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.14333497087068797, + "learning_rate": 7.398865286928531e-07, + "loss": 1.7441, + "step": 1225 + }, + { + "epoch": 1.668027210884354, + "grad_norm": 0.13232481621976905, + "learning_rate": 7.34010981454411e-07, + "loss": 1.65, + "step": 1226 + }, + { + "epoch": 1.6693877551020408, + "grad_norm": 0.24932955649343116, + "learning_rate": 7.281570080077716e-07, + "loss": 1.4175, + "step": 1227 + }, + { + "epoch": 1.6707482993197278, + "grad_norm": 0.11924561800840275, + "learning_rate": 7.223246379574206e-07, + "loss": 1.6962, + "step": 1228 + }, + { + "epoch": 1.672108843537415, + "grad_norm": 0.12513856122351408, + "learning_rate": 7.165139007985899e-07, + "loss": 1.656, + "step": 1229 + }, + { + "epoch": 1.6734693877551021, + "grad_norm": 0.10354825149011261, + "learning_rate": 7.107248259171118e-07, + "loss": 1.7198, + "step": 1230 + }, + { + "epoch": 1.674829931972789, + "grad_norm": 0.1473977816911567, + "learning_rate": 7.049574425892663e-07, + "loss": 1.5774, + "step": 1231 + }, + { + "epoch": 1.6761904761904762, + "grad_norm": 0.12018009205318546, + "learning_rate": 6.992117799816383e-07, + "loss": 1.6315, + "step": 1232 + }, + { + "epoch": 1.6775510204081634, + "grad_norm": 0.36801389059595907, + "learning_rate": 6.934878671509665e-07, + "loss": 1.5693, + "step": 1233 + }, + { + "epoch": 1.6789115646258503, + "grad_norm": 0.12229393665572565, + "learning_rate": 6.87785733043998e-07, + "loss": 1.6481, + "step": 1234 + }, + { + "epoch": 1.6802721088435373, + "grad_norm": 0.1056291236057399, + "learning_rate": 6.821054064973415e-07, + "loss": 1.7811, + "step": 1235 + }, + { + "epoch": 1.6816326530612244, + "grad_norm": 0.1290627059039082, + "learning_rate": 6.764469162373183e-07, + "loss": 1.5862, + "step": 1236 + }, + { + "epoch": 1.6829931972789116, + "grad_norm": 0.14140062572587017, + "learning_rate": 6.708102908798269e-07, + "loss": 1.5937, + "step": 1237 + }, + { + "epoch": 1.6843537414965986, + "grad_norm": 0.12372660811883815, + "learning_rate": 6.651955589301839e-07, + "loss": 1.5442, + "step": 1238 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.11398935678427591, + "learning_rate": 6.596027487829915e-07, + "loss": 1.6802, + "step": 1239 + }, + { + "epoch": 1.6870748299319729, + "grad_norm": 0.1125362910385827, + "learning_rate": 6.540318887219899e-07, + "loss": 1.5756, + "step": 1240 + }, + { + "epoch": 1.6884353741496598, + "grad_norm": 0.12182979669358265, + "learning_rate": 6.484830069199132e-07, + "loss": 1.7316, + "step": 1241 + }, + { + "epoch": 1.689795918367347, + "grad_norm": 0.1757220644211557, + "learning_rate": 6.429561314383475e-07, + "loss": 1.5394, + "step": 1242 + }, + { + "epoch": 1.6911564625850342, + "grad_norm": 0.7508195823704957, + "learning_rate": 6.374512902275903e-07, + "loss": 1.5372, + "step": 1243 + }, + { + "epoch": 1.692517006802721, + "grad_norm": 0.11573367958160206, + "learning_rate": 6.319685111265078e-07, + "loss": 1.6907, + "step": 1244 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 0.12247233986880039, + "learning_rate": 6.265078218623949e-07, + "loss": 1.6788, + "step": 1245 + }, + { + "epoch": 1.6952380952380952, + "grad_norm": 0.1420406063065376, + "learning_rate": 6.210692500508359e-07, + "loss": 1.7363, + "step": 1246 + }, + { + "epoch": 1.6965986394557824, + "grad_norm": 0.11985428532614184, + "learning_rate": 6.1565282319556e-07, + "loss": 1.5928, + "step": 1247 + }, + { + "epoch": 1.6979591836734693, + "grad_norm": 0.10678231200017475, + "learning_rate": 6.102585686883078e-07, + "loss": 1.7134, + "step": 1248 + }, + { + "epoch": 1.6993197278911565, + "grad_norm": 0.11899866385551988, + "learning_rate": 6.048865138086929e-07, + "loss": 1.5433, + "step": 1249 + }, + { + "epoch": 1.7006802721088436, + "grad_norm": 0.15494511895804758, + "learning_rate": 5.995366857240592e-07, + "loss": 1.6062, + "step": 1250 + }, + { + "epoch": 1.7020408163265306, + "grad_norm": 0.11026358843797994, + "learning_rate": 5.942091114893455e-07, + "loss": 1.7031, + "step": 1251 + }, + { + "epoch": 1.7034013605442175, + "grad_norm": 0.22132054192752112, + "learning_rate": 5.889038180469509e-07, + "loss": 1.6915, + "step": 1252 + }, + { + "epoch": 1.704761904761905, + "grad_norm": 0.1342507596457615, + "learning_rate": 5.836208322265991e-07, + "loss": 1.7164, + "step": 1253 + }, + { + "epoch": 1.7061224489795919, + "grad_norm": 0.12737785530895251, + "learning_rate": 5.783601807451949e-07, + "loss": 1.5374, + "step": 1254 + }, + { + "epoch": 1.7074829931972788, + "grad_norm": 0.12036712264849207, + "learning_rate": 5.73121890206701e-07, + "loss": 1.5903, + "step": 1255 + }, + { + "epoch": 1.708843537414966, + "grad_norm": 0.12472273002304834, + "learning_rate": 5.679059871019904e-07, + "loss": 1.5398, + "step": 1256 + }, + { + "epoch": 1.7102040816326531, + "grad_norm": 0.19389386529197455, + "learning_rate": 5.627124978087267e-07, + "loss": 1.6439, + "step": 1257 + }, + { + "epoch": 1.71156462585034, + "grad_norm": 0.1428822953499787, + "learning_rate": 5.575414485912173e-07, + "loss": 1.6785, + "step": 1258 + }, + { + "epoch": 1.7129251700680272, + "grad_norm": 0.12043634562801463, + "learning_rate": 5.523928656002897e-07, + "loss": 1.6439, + "step": 1259 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.19566443134501496, + "learning_rate": 5.472667748731552e-07, + "loss": 1.6816, + "step": 1260 + }, + { + "epoch": 1.7156462585034014, + "grad_norm": 0.12298818504704644, + "learning_rate": 5.421632023332779e-07, + "loss": 1.8452, + "step": 1261 + }, + { + "epoch": 1.7170068027210883, + "grad_norm": 0.12457908114663241, + "learning_rate": 5.370821737902455e-07, + "loss": 1.648, + "step": 1262 + }, + { + "epoch": 1.7183673469387755, + "grad_norm": 0.12905422383417112, + "learning_rate": 5.320237149396329e-07, + "loss": 1.7109, + "step": 1263 + }, + { + "epoch": 1.7197278911564626, + "grad_norm": 0.14174623921254864, + "learning_rate": 5.269878513628806e-07, + "loss": 1.6398, + "step": 1264 + }, + { + "epoch": 1.7210884353741496, + "grad_norm": 0.11019848785148706, + "learning_rate": 5.219746085271599e-07, + "loss": 1.6362, + "step": 1265 + }, + { + "epoch": 1.7224489795918367, + "grad_norm": 0.15843308364174372, + "learning_rate": 5.169840117852454e-07, + "loss": 1.6709, + "step": 1266 + }, + { + "epoch": 1.723809523809524, + "grad_norm": 0.12183022697550175, + "learning_rate": 5.120160863753859e-07, + "loss": 1.722, + "step": 1267 + }, + { + "epoch": 1.7251700680272108, + "grad_norm": 0.11328748286322111, + "learning_rate": 5.070708574211769e-07, + "loss": 1.7427, + "step": 1268 + }, + { + "epoch": 1.726530612244898, + "grad_norm": 0.5467870469075437, + "learning_rate": 5.0214834993144e-07, + "loss": 1.7051, + "step": 1269 + }, + { + "epoch": 1.7278911564625852, + "grad_norm": 0.1142321717186957, + "learning_rate": 4.972485888000822e-07, + "loss": 1.663, + "step": 1270 + }, + { + "epoch": 1.7292517006802721, + "grad_norm": 0.15669214354154648, + "learning_rate": 4.923715988059851e-07, + "loss": 1.7283, + "step": 1271 + }, + { + "epoch": 1.730612244897959, + "grad_norm": 0.12321158474736231, + "learning_rate": 4.875174046128684e-07, + "loss": 1.7658, + "step": 1272 + }, + { + "epoch": 1.7319727891156462, + "grad_norm": 0.12592275959820878, + "learning_rate": 4.826860307691749e-07, + "loss": 1.7346, + "step": 1273 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.11852385189098655, + "learning_rate": 4.778775017079357e-07, + "loss": 1.7388, + "step": 1274 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 0.11468824991687426, + "learning_rate": 4.730918417466551e-07, + "loss": 1.7385, + "step": 1275 + }, + { + "epoch": 1.7360544217687075, + "grad_norm": 0.15371819150117932, + "learning_rate": 4.683290750871855e-07, + "loss": 1.589, + "step": 1276 + }, + { + "epoch": 1.7374149659863947, + "grad_norm": 0.1990093146846139, + "learning_rate": 4.635892258156027e-07, + "loss": 1.6007, + "step": 1277 + }, + { + "epoch": 1.7387755102040816, + "grad_norm": 0.1663021965076431, + "learning_rate": 4.588723179020865e-07, + "loss": 1.5926, + "step": 1278 + }, + { + "epoch": 1.7401360544217686, + "grad_norm": 0.21861874038762558, + "learning_rate": 4.5417837520079667e-07, + "loss": 1.7584, + "step": 1279 + }, + { + "epoch": 1.741496598639456, + "grad_norm": 0.11717800217118578, + "learning_rate": 4.4950742144975525e-07, + "loss": 1.5905, + "step": 1280 + }, + { + "epoch": 1.7428571428571429, + "grad_norm": 0.12603146125560444, + "learning_rate": 4.4485948027072734e-07, + "loss": 1.6514, + "step": 1281 + }, + { + "epoch": 1.7442176870748298, + "grad_norm": 0.12095214576683595, + "learning_rate": 4.402345751690984e-07, + "loss": 1.6237, + "step": 1282 + }, + { + "epoch": 1.745578231292517, + "grad_norm": 0.12347022358642107, + "learning_rate": 4.3563272953375426e-07, + "loss": 1.6195, + "step": 1283 + }, + { + "epoch": 1.7469387755102042, + "grad_norm": 0.10437898422428948, + "learning_rate": 4.3105396663696896e-07, + "loss": 1.6093, + "step": 1284 + }, + { + "epoch": 1.748299319727891, + "grad_norm": 0.11098541618904957, + "learning_rate": 4.2649830963428227e-07, + "loss": 1.6999, + "step": 1285 + }, + { + "epoch": 1.7496598639455783, + "grad_norm": 0.23522787615940605, + "learning_rate": 4.2196578156438293e-07, + "loss": 1.6786, + "step": 1286 + }, + { + "epoch": 1.7510204081632654, + "grad_norm": 0.11988459025584766, + "learning_rate": 4.174564053489949e-07, + "loss": 1.5861, + "step": 1287 + }, + { + "epoch": 1.7523809523809524, + "grad_norm": 0.12717182374610603, + "learning_rate": 4.129702037927558e-07, + "loss": 1.5982, + "step": 1288 + }, + { + "epoch": 1.7523809523809524, + "eval_loss": 1.686065673828125, + "eval_runtime": 76.6201, + "eval_samples_per_second": 53.158, + "eval_steps_per_second": 6.656, + "step": 1288 + }, + { + "epoch": 1.7537414965986393, + "grad_norm": 0.13809405448384865, + "learning_rate": 4.0850719958311024e-07, + "loss": 1.5886, + "step": 1289 + }, + { + "epoch": 1.7551020408163265, + "grad_norm": 0.11274789712003737, + "learning_rate": 4.040674152901852e-07, + "loss": 1.694, + "step": 1290 + }, + { + "epoch": 1.7564625850340136, + "grad_norm": 0.1882765689460891, + "learning_rate": 3.9965087336668485e-07, + "loss": 1.5427, + "step": 1291 + }, + { + "epoch": 1.7578231292517006, + "grad_norm": 0.14271738534965617, + "learning_rate": 3.9525759614776894e-07, + "loss": 1.5454, + "step": 1292 + }, + { + "epoch": 1.7591836734693878, + "grad_norm": 0.1405107274242078, + "learning_rate": 3.9088760585094976e-07, + "loss": 1.5758, + "step": 1293 + }, + { + "epoch": 1.760544217687075, + "grad_norm": 0.13748980285277887, + "learning_rate": 3.8654092457596714e-07, + "loss": 1.7284, + "step": 1294 + }, + { + "epoch": 1.7619047619047619, + "grad_norm": 0.13551778981512638, + "learning_rate": 3.822175743046874e-07, + "loss": 1.4861, + "step": 1295 + }, + { + "epoch": 1.763265306122449, + "grad_norm": 0.12395464181970267, + "learning_rate": 3.7791757690098797e-07, + "loss": 1.6397, + "step": 1296 + }, + { + "epoch": 1.7646258503401362, + "grad_norm": 0.1305188086923226, + "learning_rate": 3.7364095411064614e-07, + "loss": 1.6356, + "step": 1297 + }, + { + "epoch": 1.7659863945578231, + "grad_norm": 0.11815051876585439, + "learning_rate": 3.693877275612312e-07, + "loss": 1.6778, + "step": 1298 + }, + { + "epoch": 1.76734693877551, + "grad_norm": 0.13907578783569868, + "learning_rate": 3.6515791876199136e-07, + "loss": 1.6404, + "step": 1299 + }, + { + "epoch": 1.7687074829931972, + "grad_norm": 0.11634068765275582, + "learning_rate": 3.6095154910375076e-07, + "loss": 1.7575, + "step": 1300 + }, + { + "epoch": 1.7700680272108844, + "grad_norm": 0.12322320540073207, + "learning_rate": 3.567686398587955e-07, + "loss": 1.7204, + "step": 1301 + }, + { + "epoch": 1.7714285714285714, + "grad_norm": 0.2519201944711608, + "learning_rate": 3.5260921218077116e-07, + "loss": 1.6642, + "step": 1302 + }, + { + "epoch": 1.7727891156462585, + "grad_norm": 0.12072706338657095, + "learning_rate": 3.484732871045704e-07, + "loss": 1.6003, + "step": 1303 + }, + { + "epoch": 1.7741496598639457, + "grad_norm": 0.12368254206995616, + "learning_rate": 3.443608855462294e-07, + "loss": 1.4418, + "step": 1304 + }, + { + "epoch": 1.7755102040816326, + "grad_norm": 0.1349862699749685, + "learning_rate": 3.402720283028277e-07, + "loss": 1.4863, + "step": 1305 + }, + { + "epoch": 1.7768707482993196, + "grad_norm": 0.1586268302178325, + "learning_rate": 3.3620673605236907e-07, + "loss": 1.6977, + "step": 1306 + }, + { + "epoch": 1.778231292517007, + "grad_norm": 0.10301426967234653, + "learning_rate": 3.3216502935369265e-07, + "loss": 1.7133, + "step": 1307 + }, + { + "epoch": 1.779591836734694, + "grad_norm": 0.1074501899935655, + "learning_rate": 3.2814692864635513e-07, + "loss": 1.6611, + "step": 1308 + }, + { + "epoch": 1.7809523809523808, + "grad_norm": 0.14501762775546292, + "learning_rate": 3.2415245425054087e-07, + "loss": 1.5968, + "step": 1309 + }, + { + "epoch": 1.782312925170068, + "grad_norm": 0.11188619893577698, + "learning_rate": 3.201816263669461e-07, + "loss": 1.4696, + "step": 1310 + }, + { + "epoch": 1.7836734693877552, + "grad_norm": 0.1444196373172706, + "learning_rate": 3.162344650766874e-07, + "loss": 1.6787, + "step": 1311 + }, + { + "epoch": 1.7850340136054421, + "grad_norm": 0.11758282717179587, + "learning_rate": 3.123109903411903e-07, + "loss": 1.6935, + "step": 1312 + }, + { + "epoch": 1.7863945578231293, + "grad_norm": 0.11743931898274525, + "learning_rate": 3.0841122200210014e-07, + "loss": 1.6681, + "step": 1313 + }, + { + "epoch": 1.7877551020408164, + "grad_norm": 0.15092126786575402, + "learning_rate": 3.045351797811713e-07, + "loss": 1.7548, + "step": 1314 + }, + { + "epoch": 1.7891156462585034, + "grad_norm": 0.13374065025706797, + "learning_rate": 3.006828832801706e-07, + "loss": 1.6463, + "step": 1315 + }, + { + "epoch": 1.7904761904761903, + "grad_norm": 0.1070307983897091, + "learning_rate": 2.9685435198078095e-07, + "loss": 1.6918, + "step": 1316 + }, + { + "epoch": 1.7918367346938775, + "grad_norm": 0.1043359426709795, + "learning_rate": 2.9304960524450034e-07, + "loss": 1.8017, + "step": 1317 + }, + { + "epoch": 1.7931972789115647, + "grad_norm": 0.12298508287191963, + "learning_rate": 2.892686623125446e-07, + "loss": 1.3275, + "step": 1318 + }, + { + "epoch": 1.7945578231292516, + "grad_norm": 0.12081576860619693, + "learning_rate": 2.855115423057492e-07, + "loss": 1.7012, + "step": 1319 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 0.11298191984110575, + "learning_rate": 2.817782642244732e-07, + "loss": 1.7513, + "step": 1320 + }, + { + "epoch": 1.797278911564626, + "grad_norm": 0.10476335704776595, + "learning_rate": 2.780688469485038e-07, + "loss": 1.7937, + "step": 1321 + }, + { + "epoch": 1.7986394557823129, + "grad_norm": 0.13240769456180743, + "learning_rate": 2.743833092369608e-07, + "loss": 1.5026, + "step": 1322 + }, + { + "epoch": 1.8, + "grad_norm": 0.1349026266688947, + "learning_rate": 2.7072166972820005e-07, + "loss": 1.478, + "step": 1323 + }, + { + "epoch": 1.8013605442176872, + "grad_norm": 0.1581587709409474, + "learning_rate": 2.6708394693971963e-07, + "loss": 1.6953, + "step": 1324 + }, + { + "epoch": 1.8027210884353742, + "grad_norm": 0.1413036863393829, + "learning_rate": 2.6347015926807053e-07, + "loss": 1.7564, + "step": 1325 + }, + { + "epoch": 1.804081632653061, + "grad_norm": 0.10947438586849194, + "learning_rate": 2.5988032498875604e-07, + "loss": 1.693, + "step": 1326 + }, + { + "epoch": 1.8054421768707483, + "grad_norm": 0.18719949874204633, + "learning_rate": 2.563144622561453e-07, + "loss": 1.5172, + "step": 1327 + }, + { + "epoch": 1.8068027210884354, + "grad_norm": 0.12603171462141458, + "learning_rate": 2.527725891033772e-07, + "loss": 1.5421, + "step": 1328 + }, + { + "epoch": 1.8081632653061224, + "grad_norm": 0.11373462842692589, + "learning_rate": 2.49254723442276e-07, + "loss": 1.6575, + "step": 1329 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.12320391589520092, + "learning_rate": 2.4576088306325184e-07, + "loss": 1.6266, + "step": 1330 + }, + { + "epoch": 1.8108843537414967, + "grad_norm": 0.1362145220231982, + "learning_rate": 2.422910856352173e-07, + "loss": 1.6477, + "step": 1331 + }, + { + "epoch": 1.8122448979591836, + "grad_norm": 0.12607909355245228, + "learning_rate": 2.3884534870549534e-07, + "loss": 1.6541, + "step": 1332 + }, + { + "epoch": 1.8136054421768706, + "grad_norm": 0.12201919923215693, + "learning_rate": 2.354236896997314e-07, + "loss": 1.6405, + "step": 1333 + }, + { + "epoch": 1.814965986394558, + "grad_norm": 0.1475512645133478, + "learning_rate": 2.3202612592180485e-07, + "loss": 1.5981, + "step": 1334 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.1410369523336787, + "learning_rate": 2.2865267455374042e-07, + "loss": 1.6012, + "step": 1335 + }, + { + "epoch": 1.8176870748299319, + "grad_norm": 0.14726614058677065, + "learning_rate": 2.2530335265562475e-07, + "loss": 1.5929, + "step": 1336 + }, + { + "epoch": 1.819047619047619, + "grad_norm": 0.11101939816547764, + "learning_rate": 2.2197817716551663e-07, + "loss": 1.7944, + "step": 1337 + }, + { + "epoch": 1.8204081632653062, + "grad_norm": 0.31120839304837183, + "learning_rate": 2.1867716489936297e-07, + "loss": 1.7441, + "step": 1338 + }, + { + "epoch": 1.8217687074829931, + "grad_norm": 0.1087680907822782, + "learning_rate": 2.1540033255091187e-07, + "loss": 1.7397, + "step": 1339 + }, + { + "epoch": 1.8231292517006803, + "grad_norm": 0.15824282985610105, + "learning_rate": 2.1214769669163137e-07, + "loss": 1.5673, + "step": 1340 + }, + { + "epoch": 1.8244897959183675, + "grad_norm": 0.11523860122450502, + "learning_rate": 2.0891927377062526e-07, + "loss": 1.7194, + "step": 1341 + }, + { + "epoch": 1.8258503401360544, + "grad_norm": 0.16694763381862118, + "learning_rate": 2.0571508011454467e-07, + "loss": 1.7396, + "step": 1342 + }, + { + "epoch": 1.8272108843537413, + "grad_norm": 0.12107304038026233, + "learning_rate": 2.0253513192751374e-07, + "loss": 1.5903, + "step": 1343 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 0.12306189922695104, + "learning_rate": 1.993794452910386e-07, + "loss": 1.6681, + "step": 1344 + }, + { + "epoch": 1.8299319727891157, + "grad_norm": 0.13264203303883543, + "learning_rate": 1.962480361639363e-07, + "loss": 1.5602, + "step": 1345 + }, + { + "epoch": 1.8312925170068026, + "grad_norm": 0.19210662239119836, + "learning_rate": 1.9314092038224376e-07, + "loss": 1.7383, + "step": 1346 + }, + { + "epoch": 1.8326530612244898, + "grad_norm": 0.11196653339216565, + "learning_rate": 1.9005811365914561e-07, + "loss": 1.6738, + "step": 1347 + }, + { + "epoch": 1.834013605442177, + "grad_norm": 0.13383855955903828, + "learning_rate": 1.8699963158489097e-07, + "loss": 1.6137, + "step": 1348 + }, + { + "epoch": 1.835374149659864, + "grad_norm": 0.13158068933328537, + "learning_rate": 1.8396548962671456e-07, + "loss": 1.7042, + "step": 1349 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.11719394942534897, + "learning_rate": 1.8095570312876066e-07, + "loss": 1.5956, + "step": 1350 + }, + { + "epoch": 1.8380952380952382, + "grad_norm": 0.11938431133510057, + "learning_rate": 1.779702873120015e-07, + "loss": 1.665, + "step": 1351 + }, + { + "epoch": 1.8394557823129252, + "grad_norm": 0.12054936992306543, + "learning_rate": 1.7500925727416517e-07, + "loss": 1.7007, + "step": 1352 + }, + { + "epoch": 1.8408163265306121, + "grad_norm": 0.12302278736973048, + "learning_rate": 1.7207262798965617e-07, + "loss": 1.8256, + "step": 1353 + }, + { + "epoch": 1.8421768707482993, + "grad_norm": 0.20874731105186642, + "learning_rate": 1.691604143094816e-07, + "loss": 1.6506, + "step": 1354 + }, + { + "epoch": 1.8435374149659864, + "grad_norm": 0.11883930933955864, + "learning_rate": 1.6627263096117286e-07, + "loss": 1.4408, + "step": 1355 + }, + { + "epoch": 1.8448979591836734, + "grad_norm": 0.13925197256417077, + "learning_rate": 1.6340929254871406e-07, + "loss": 1.597, + "step": 1356 + }, + { + "epoch": 1.8462585034013606, + "grad_norm": 0.12520385420034488, + "learning_rate": 1.6057041355246827e-07, + "loss": 1.7058, + "step": 1357 + }, + { + "epoch": 1.8476190476190477, + "grad_norm": 0.14534052106943013, + "learning_rate": 1.5775600832910187e-07, + "loss": 1.7239, + "step": 1358 + }, + { + "epoch": 1.8489795918367347, + "grad_norm": 0.10822664134568488, + "learning_rate": 1.5496609111151583e-07, + "loss": 1.6079, + "step": 1359 + }, + { + "epoch": 1.8503401360544216, + "grad_norm": 0.12860377257334074, + "learning_rate": 1.5220067600876686e-07, + "loss": 1.605, + "step": 1360 + }, + { + "epoch": 1.851700680272109, + "grad_norm": 0.12116010186228711, + "learning_rate": 1.4945977700600634e-07, + "loss": 1.5666, + "step": 1361 + }, + { + "epoch": 1.853061224489796, + "grad_norm": 0.15034373846159615, + "learning_rate": 1.4674340796439924e-07, + "loss": 1.5519, + "step": 1362 + }, + { + "epoch": 1.8544217687074829, + "grad_norm": 0.1249592480463045, + "learning_rate": 1.4405158262106144e-07, + "loss": 1.6923, + "step": 1363 + }, + { + "epoch": 1.85578231292517, + "grad_norm": 0.12093711468318252, + "learning_rate": 1.4138431458898372e-07, + "loss": 1.5712, + "step": 1364 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.1535628658201845, + "learning_rate": 1.3874161735697166e-07, + "loss": 1.5705, + "step": 1365 + }, + { + "epoch": 1.8585034013605441, + "grad_norm": 0.11441660994747492, + "learning_rate": 1.3612350428956865e-07, + "loss": 1.7873, + "step": 1366 + }, + { + "epoch": 1.8598639455782313, + "grad_norm": 0.1334231424049127, + "learning_rate": 1.3352998862699252e-07, + "loss": 1.6774, + "step": 1367 + }, + { + "epoch": 1.8612244897959185, + "grad_norm": 0.12313352697990386, + "learning_rate": 1.3096108348506942e-07, + "loss": 1.6795, + "step": 1368 + }, + { + "epoch": 1.8625850340136054, + "grad_norm": 0.13161417747426524, + "learning_rate": 1.2841680185516515e-07, + "loss": 1.6117, + "step": 1369 + }, + { + "epoch": 1.8639455782312924, + "grad_norm": 0.11729319664592838, + "learning_rate": 1.2589715660412117e-07, + "loss": 1.7447, + "step": 1370 + }, + { + "epoch": 1.8653061224489798, + "grad_norm": 0.17489311720035736, + "learning_rate": 1.2340216047418697e-07, + "loss": 1.6488, + "step": 1371 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.1323254687842618, + "learning_rate": 1.2093182608296007e-07, + "loss": 1.5905, + "step": 1372 + }, + { + "epoch": 1.8680272108843536, + "grad_norm": 0.13009049948669843, + "learning_rate": 1.1848616592331718e-07, + "loss": 1.7288, + "step": 1373 + }, + { + "epoch": 1.8693877551020408, + "grad_norm": 0.13326010230560859, + "learning_rate": 1.1606519236335601e-07, + "loss": 1.4685, + "step": 1374 + }, + { + "epoch": 1.870748299319728, + "grad_norm": 0.13195963848274078, + "learning_rate": 1.1366891764632792e-07, + "loss": 1.6705, + "step": 1375 + }, + { + "epoch": 1.872108843537415, + "grad_norm": 0.20403157309863337, + "learning_rate": 1.1129735389057872e-07, + "loss": 1.5451, + "step": 1376 + }, + { + "epoch": 1.873469387755102, + "grad_norm": 0.13333819468646058, + "learning_rate": 1.0895051308948857e-07, + "loss": 1.6161, + "step": 1377 + }, + { + "epoch": 1.8748299319727892, + "grad_norm": 0.11437641155105588, + "learning_rate": 1.0662840711140765e-07, + "loss": 1.6297, + "step": 1378 + }, + { + "epoch": 1.8761904761904762, + "grad_norm": 0.14869492133046525, + "learning_rate": 1.0433104769959901e-07, + "loss": 1.6101, + "step": 1379 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 0.10691630289178386, + "learning_rate": 1.0205844647217745e-07, + "loss": 1.7722, + "step": 1380 + }, + { + "epoch": 1.8789115646258503, + "grad_norm": 0.12576056527826357, + "learning_rate": 9.981061492205457e-08, + "loss": 1.6726, + "step": 1381 + }, + { + "epoch": 1.8802721088435375, + "grad_norm": 0.14899620665692015, + "learning_rate": 9.758756441687333e-08, + "loss": 1.7, + "step": 1382 + }, + { + "epoch": 1.8816326530612244, + "grad_norm": 0.11208257992745001, + "learning_rate": 9.538930619895914e-08, + "loss": 1.7984, + "step": 1383 + }, + { + "epoch": 1.8829931972789116, + "grad_norm": 0.1224966922554696, + "learning_rate": 9.321585138525546e-08, + "loss": 1.6945, + "step": 1384 + }, + { + "epoch": 1.8843537414965987, + "grad_norm": 0.11370992913550665, + "learning_rate": 9.106721096727334e-08, + "loss": 1.6014, + "step": 1385 + }, + { + "epoch": 1.8857142857142857, + "grad_norm": 0.1374350456716246, + "learning_rate": 8.89433958110325e-08, + "loss": 1.5331, + "step": 1386 + }, + { + "epoch": 1.8870748299319728, + "grad_norm": 0.1295417008752131, + "learning_rate": 8.684441665700704e-08, + "loss": 1.6951, + "step": 1387 + }, + { + "epoch": 1.88843537414966, + "grad_norm": 0.14326716925245933, + "learning_rate": 8.477028412007204e-08, + "loss": 1.4568, + "step": 1388 + }, + { + "epoch": 1.889795918367347, + "grad_norm": 0.11941708812661872, + "learning_rate": 8.272100868944865e-08, + "loss": 1.6917, + "step": 1389 + }, + { + "epoch": 1.891156462585034, + "grad_norm": 0.11037035678246186, + "learning_rate": 8.069660072865304e-08, + "loss": 1.7508, + "step": 1390 + }, + { + "epoch": 1.892517006802721, + "grad_norm": 0.12587422498732503, + "learning_rate": 7.869707047543973e-08, + "loss": 1.7974, + "step": 1391 + }, + { + "epoch": 1.8938775510204082, + "grad_norm": 0.1266780218730049, + "learning_rate": 7.6722428041755e-08, + "loss": 1.953, + "step": 1392 + }, + { + "epoch": 1.8952380952380952, + "grad_norm": 0.12200593740003406, + "learning_rate": 7.47726834136836e-08, + "loss": 1.7449, + "step": 1393 + }, + { + "epoch": 1.8965986394557823, + "grad_norm": 0.15472206202543426, + "learning_rate": 7.284784645139653e-08, + "loss": 1.5177, + "step": 1394 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.1624808633869621, + "learning_rate": 7.094792688910446e-08, + "loss": 1.4361, + "step": 1395 + }, + { + "epoch": 1.8993197278911564, + "grad_norm": 0.11848244574224435, + "learning_rate": 6.907293433500494e-08, + "loss": 1.8216, + "step": 1396 + }, + { + "epoch": 1.9006802721088434, + "grad_norm": 0.16167022314859605, + "learning_rate": 6.722287827123697e-08, + "loss": 1.6308, + "step": 1397 + }, + { + "epoch": 1.9020408163265308, + "grad_norm": 0.1190014648907852, + "learning_rate": 6.539776805383146e-08, + "loss": 1.5514, + "step": 1398 + }, + { + "epoch": 1.9034013605442177, + "grad_norm": 0.14368372012309139, + "learning_rate": 6.359761291266365e-08, + "loss": 1.6309, + "step": 1399 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.11701809111487846, + "learning_rate": 6.182242195140686e-08, + "loss": 1.5709, + "step": 1400 + }, + { + "epoch": 1.9061224489795918, + "grad_norm": 0.10607705943146, + "learning_rate": 6.007220414748772e-08, + "loss": 1.6508, + "step": 1401 + }, + { + "epoch": 1.907482993197279, + "grad_norm": 0.12434317488105485, + "learning_rate": 5.8346968352038794e-08, + "loss": 1.6759, + "step": 1402 + }, + { + "epoch": 1.908843537414966, + "grad_norm": 0.12404880236130861, + "learning_rate": 5.6646723289853767e-08, + "loss": 1.5871, + "step": 1403 + }, + { + "epoch": 1.910204081632653, + "grad_norm": 0.1032281528541635, + "learning_rate": 5.4971477559346286e-08, + "loss": 1.8326, + "step": 1404 + }, + { + "epoch": 1.9115646258503403, + "grad_norm": 0.13405302158392976, + "learning_rate": 5.33212396325028e-08, + "loss": 1.6984, + "step": 1405 + }, + { + "epoch": 1.9129251700680272, + "grad_norm": 0.10242389969549208, + "learning_rate": 5.1696017854841463e-08, + "loss": 1.8544, + "step": 1406 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.1445888697947125, + "learning_rate": 5.0095820445369424e-08, + "loss": 1.5013, + "step": 1407 + }, + { + "epoch": 1.9156462585034013, + "grad_norm": 0.20630737260948012, + "learning_rate": 4.852065549654228e-08, + "loss": 1.4415, + "step": 1408 + }, + { + "epoch": 1.9170068027210885, + "grad_norm": 0.1271819648110208, + "learning_rate": 4.697053097422299e-08, + "loss": 1.712, + "step": 1409 + }, + { + "epoch": 1.9183673469387754, + "grad_norm": 0.12993476660803524, + "learning_rate": 4.544545471763917e-08, + "loss": 1.7681, + "step": 1410 + }, + { + "epoch": 1.9197278911564626, + "grad_norm": 0.12496814049626584, + "learning_rate": 4.394543443934751e-08, + "loss": 1.7471, + "step": 1411 + }, + { + "epoch": 1.9210884353741497, + "grad_norm": 0.13393563592633062, + "learning_rate": 4.247047772519108e-08, + "loss": 1.4003, + "step": 1412 + }, + { + "epoch": 1.9224489795918367, + "grad_norm": 0.11852283012443121, + "learning_rate": 4.102059203426267e-08, + "loss": 1.7373, + "step": 1413 + }, + { + "epoch": 1.9238095238095239, + "grad_norm": 0.15124695275803585, + "learning_rate": 3.95957846988676e-08, + "loss": 1.8101, + "step": 1414 + }, + { + "epoch": 1.925170068027211, + "grad_norm": 0.1329717102509104, + "learning_rate": 3.819606292448541e-08, + "loss": 1.6143, + "step": 1415 + }, + { + "epoch": 1.926530612244898, + "grad_norm": 0.1287044226542165, + "learning_rate": 3.682143378973269e-08, + "loss": 1.5269, + "step": 1416 + }, + { + "epoch": 1.927891156462585, + "grad_norm": 0.13046094115226015, + "learning_rate": 3.5471904246331955e-08, + "loss": 1.7016, + "step": 1417 + }, + { + "epoch": 1.929251700680272, + "grad_norm": 0.1464904620501286, + "learning_rate": 3.414748111907007e-08, + "loss": 1.582, + "step": 1418 + }, + { + "epoch": 1.9306122448979592, + "grad_norm": 0.11902086875565884, + "learning_rate": 3.2848171105766545e-08, + "loss": 1.6504, + "step": 1419 + }, + { + "epoch": 1.9319727891156462, + "grad_norm": 0.1112077111298243, + "learning_rate": 3.15739807772425e-08, + "loss": 1.7393, + "step": 1420 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.12536056522513223, + "learning_rate": 3.032491657728176e-08, + "loss": 1.6239, + "step": 1421 + }, + { + "epoch": 1.9346938775510205, + "grad_norm": 0.120512032686707, + "learning_rate": 2.9100984822603705e-08, + "loss": 1.6468, + "step": 1422 + }, + { + "epoch": 1.9360544217687075, + "grad_norm": 0.14047085918823868, + "learning_rate": 2.7902191702827152e-08, + "loss": 1.7043, + "step": 1423 + }, + { + "epoch": 1.9374149659863944, + "grad_norm": 0.18275770385842988, + "learning_rate": 2.67285432804415e-08, + "loss": 1.6855, + "step": 1424 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.15317594155825304, + "learning_rate": 2.558004549077564e-08, + "loss": 1.6121, + "step": 1425 + }, + { + "epoch": 1.9401360544217687, + "grad_norm": 0.15316596709144686, + "learning_rate": 2.4456704141967437e-08, + "loss": 1.3756, + "step": 1426 + }, + { + "epoch": 1.9414965986394557, + "grad_norm": 0.12215537441906983, + "learning_rate": 2.335852491493429e-08, + "loss": 1.5432, + "step": 1427 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 0.11247818731447141, + "learning_rate": 2.2285513363344833e-08, + "loss": 1.4894, + "step": 1428 + }, + { + "epoch": 1.94421768707483, + "grad_norm": 0.14019650679504936, + "learning_rate": 2.1237674913591744e-08, + "loss": 1.6705, + "step": 1429 + }, + { + "epoch": 1.945578231292517, + "grad_norm": 0.1282593607045711, + "learning_rate": 2.0215014864761185e-08, + "loss": 1.5348, + "step": 1430 + }, + { + "epoch": 1.9469387755102041, + "grad_norm": 0.12284414079551015, + "learning_rate": 1.9217538388610625e-08, + "loss": 1.6945, + "step": 1431 + }, + { + "epoch": 1.9482993197278913, + "grad_norm": 0.11971853998705126, + "learning_rate": 1.824525052953774e-08, + "loss": 1.6959, + "step": 1432 + }, + { + "epoch": 1.9496598639455782, + "grad_norm": 0.12822616099558357, + "learning_rate": 1.7298156204559324e-08, + "loss": 1.7549, + "step": 1433 + }, + { + "epoch": 1.9510204081632652, + "grad_norm": 0.12499088858434286, + "learning_rate": 1.6376260203282422e-08, + "loss": 1.5989, + "step": 1434 + }, + { + "epoch": 1.9523809523809523, + "grad_norm": 0.12657981100016752, + "learning_rate": 1.5479567187884346e-08, + "loss": 1.6822, + "step": 1435 + }, + { + "epoch": 1.9537414965986395, + "grad_norm": 0.10376271150021693, + "learning_rate": 1.460808169308492e-08, + "loss": 1.861, + "step": 1436 + }, + { + "epoch": 1.9551020408163264, + "grad_norm": 0.12572469181134285, + "learning_rate": 1.3761808126126486e-08, + "loss": 1.7238, + "step": 1437 + }, + { + "epoch": 1.9564625850340136, + "grad_norm": 0.19569694241617253, + "learning_rate": 1.2940750766751164e-08, + "loss": 1.6449, + "step": 1438 + }, + { + "epoch": 1.9578231292517008, + "grad_norm": 0.133934432892983, + "learning_rate": 1.2144913767176968e-08, + "loss": 1.6028, + "step": 1439 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 0.276184773751762, + "learning_rate": 1.1374301152079491e-08, + "loss": 1.7483, + "step": 1440 + }, + { + "epoch": 1.9605442176870749, + "grad_norm": 0.12157965543272396, + "learning_rate": 1.0628916818571366e-08, + "loss": 1.7897, + "step": 1441 + }, + { + "epoch": 1.961904761904762, + "grad_norm": 0.12393150387814673, + "learning_rate": 9.90876453618006e-09, + "loss": 1.6346, + "step": 1442 + }, + { + "epoch": 1.963265306122449, + "grad_norm": 0.16519695397062947, + "learning_rate": 9.213847946832333e-09, + "loss": 1.6296, + "step": 1443 + }, + { + "epoch": 1.964625850340136, + "grad_norm": 0.16082589408015513, + "learning_rate": 8.544170564831478e-09, + "loss": 1.5966, + "step": 1444 + }, + { + "epoch": 1.965986394557823, + "grad_norm": 0.12676850389062208, + "learning_rate": 7.899735776845108e-09, + "loss": 1.5393, + "step": 1445 + }, + { + "epoch": 1.9673469387755103, + "grad_norm": 0.11804845339511627, + "learning_rate": 7.280546841882396e-09, + "loss": 1.6414, + "step": 1446 + }, + { + "epoch": 1.9687074829931972, + "grad_norm": 0.10847051602573662, + "learning_rate": 6.6866068912824215e-09, + "loss": 1.6769, + "step": 1447 + }, + { + "epoch": 1.9700680272108844, + "grad_norm": 0.10927297829948371, + "learning_rate": 6.117918928693623e-09, + "loss": 1.6144, + "step": 1448 + }, + { + "epoch": 1.9714285714285715, + "grad_norm": 0.14038993677085151, + "learning_rate": 5.574485830063258e-09, + "loss": 1.5951, + "step": 1449 + }, + { + "epoch": 1.9727891156462585, + "grad_norm": 0.13001361414839283, + "learning_rate": 5.056310343619641e-09, + "loss": 1.5813, + "step": 1450 + }, + { + "epoch": 1.9741496598639454, + "grad_norm": 0.11240349755485105, + "learning_rate": 4.563395089859368e-09, + "loss": 1.7346, + "step": 1451 + }, + { + "epoch": 1.9755102040816328, + "grad_norm": 0.13111702124771224, + "learning_rate": 4.095742561533444e-09, + "loss": 1.531, + "step": 1452 + }, + { + "epoch": 1.9768707482993197, + "grad_norm": 0.15379785521980685, + "learning_rate": 3.6533551236372923e-09, + "loss": 1.6275, + "step": 1453 + }, + { + "epoch": 1.9782312925170067, + "grad_norm": 0.1134944447962432, + "learning_rate": 3.2362350133940956e-09, + "loss": 1.6481, + "step": 1454 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.13403963298652696, + "learning_rate": 2.844384340247586e-09, + "loss": 1.4958, + "step": 1455 + }, + { + "epoch": 1.980952380952381, + "grad_norm": 0.13134186683472326, + "learning_rate": 2.4778050858492718e-09, + "loss": 1.8136, + "step": 1456 + }, + { + "epoch": 1.982312925170068, + "grad_norm": 0.11530031332268623, + "learning_rate": 2.136499104050116e-09, + "loss": 1.672, + "step": 1457 + }, + { + "epoch": 1.9836734693877551, + "grad_norm": 0.14400234513580018, + "learning_rate": 1.8204681208883191e-09, + "loss": 1.5871, + "step": 1458 + }, + { + "epoch": 1.9850340136054423, + "grad_norm": 0.10792037572917504, + "learning_rate": 1.5297137345843261e-09, + "loss": 1.7173, + "step": 1459 + }, + { + "epoch": 1.9863945578231292, + "grad_norm": 0.12254785014104333, + "learning_rate": 1.264237415529168e-09, + "loss": 1.6579, + "step": 1460 + }, + { + "epoch": 1.9877551020408162, + "grad_norm": 0.13854826500139353, + "learning_rate": 1.0240405062794666e-09, + "loss": 1.5945, + "step": 1461 + }, + { + "epoch": 1.9891156462585036, + "grad_norm": 0.124820740824841, + "learning_rate": 8.091242215491068e-10, + "loss": 1.6419, + "step": 1462 + }, + { + "epoch": 1.9904761904761905, + "grad_norm": 0.1529299716450345, + "learning_rate": 6.194896482047963e-10, + "loss": 1.6038, + "step": 1463 + }, + { + "epoch": 1.9918367346938775, + "grad_norm": 0.11316485439035386, + "learning_rate": 4.5513774525884904e-10, + "loss": 1.7278, + "step": 1464 + }, + { + "epoch": 1.9931972789115646, + "grad_norm": 0.16737814919567837, + "learning_rate": 3.1606934386529954e-10, + "loss": 1.745, + "step": 1465 + }, + { + "epoch": 1.9945578231292518, + "grad_norm": 0.1260365200841888, + "learning_rate": 2.022851473154619e-10, + "loss": 1.5775, + "step": 1466 + }, + { + "epoch": 1.9959183673469387, + "grad_norm": 0.14412575459341273, + "learning_rate": 1.13785731034044e-10, + "loss": 1.5964, + "step": 1467 + }, + { + "epoch": 1.997278911564626, + "grad_norm": 0.10844606500028223, + "learning_rate": 5.057154257692709e-11, + "loss": 1.6392, + "step": 1468 + }, + { + "epoch": 1.998639455782313, + "grad_norm": 0.136002057114523, + "learning_rate": 1.2642901628390214e-11, + "loss": 1.6056, + "step": 1469 + }, + { + "epoch": 2.0, + "grad_norm": 0.10516095768661124, + "learning_rate": 0.0, + "loss": 1.6715, + "step": 1470 + } + ], + "logging_steps": 1, + "max_steps": 1470, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 184, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1646792128633242e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}