diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,18273 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7794641184185872,
+  "eval_steps": 500,
+  "global_step": 2600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00029979389169945664,
+      "grad_norm": 4.617544174194336,
+      "learning_rate": 2.9999999999999997e-05,
+      "loss": 1.26,
+      "step": 1
+    },
+    {
+      "epoch": 0.0005995877833989133,
+      "grad_norm": 5.574990272521973,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 1.2193,
+      "step": 2
+    },
+    {
+      "epoch": 0.0008993816750983699,
+      "grad_norm": 2.685803174972534,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 1.2098,
+      "step": 3
+    },
+    {
+      "epoch": 0.0011991755667978266,
+      "grad_norm": 1.2137216329574585,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 1.0505,
+      "step": 4
+    },
+    {
+      "epoch": 0.001498969458497283,
+      "grad_norm": 1.1230342388153076,
+      "learning_rate": 0.00015,
+      "loss": 0.8918,
+      "step": 5
+    },
+    {
+      "epoch": 0.0017987633501967398,
+      "grad_norm": 0.9145472049713135,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.8306,
+      "step": 6
+    },
+    {
+      "epoch": 0.0020985572418961962,
+      "grad_norm": 1.5094902515411377,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.788,
+      "step": 7
+    },
+    {
+      "epoch": 0.002398351133595653,
+      "grad_norm": 0.5805755257606506,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.8026,
+      "step": 8
+    },
+    {
+      "epoch": 0.0026981450252951096,
+      "grad_norm": 0.338878333568573,
+      "learning_rate": 0.00027,
+      "loss": 0.723,
+      "step": 9
+    },
+    {
+      "epoch": 0.002997938916994566,
+      "grad_norm": 0.34122711420059204,
+      "learning_rate": 0.0003,
+      "loss": 0.705,
+      "step": 10
+    },
+    {
+      "epoch": 0.003297732808694023,
+      "grad_norm": 0.3165666162967682,
+      "learning_rate": 0.0002999549549549549,
+      "loss": 0.7282,
+      "step": 11
+    },
+    {
+      "epoch": 0.0035975267003934795,
+      "grad_norm": 0.3415158689022064,
+      "learning_rate": 0.0002999099099099099,
+      "loss": 0.6833,
+      "step": 12
+    },
+    {
+      "epoch": 0.003897320592092936,
+      "grad_norm": 0.3431508243083954,
+      "learning_rate": 0.00029986486486486484,
+      "loss": 0.713,
+      "step": 13
+    },
+    {
+      "epoch": 0.0041971144837923925,
+      "grad_norm": 0.3494497239589691,
+      "learning_rate": 0.0002998198198198198,
+      "loss": 0.6822,
+      "step": 14
+    },
+    {
+      "epoch": 0.004496908375491849,
+      "grad_norm": 0.29291579127311707,
+      "learning_rate": 0.00029977477477477477,
+      "loss": 0.6348,
+      "step": 15
+    },
+    {
+      "epoch": 0.004796702267191306,
+      "grad_norm": 0.3061606287956238,
+      "learning_rate": 0.0002997297297297297,
+      "loss": 0.6584,
+      "step": 16
+    },
+    {
+      "epoch": 0.005096496158890762,
+      "grad_norm": 0.32851532101631165,
+      "learning_rate": 0.00029968468468468464,
+      "loss": 0.6792,
+      "step": 17
+    },
+    {
+      "epoch": 0.005396290050590219,
+      "grad_norm": 0.3087822198867798,
+      "learning_rate": 0.00029963963963963963,
+      "loss": 0.686,
+      "step": 18
+    },
+    {
+      "epoch": 0.005696083942289676,
+      "grad_norm": 0.2965148687362671,
+      "learning_rate": 0.00029959459459459457,
+      "loss": 0.6549,
+      "step": 19
+    },
+    {
+      "epoch": 0.005995877833989132,
+      "grad_norm": 0.32235461473464966,
+      "learning_rate": 0.0002995495495495495,
+      "loss": 0.6653,
+      "step": 20
+    },
+    {
+      "epoch": 0.006295671725688589,
+      "grad_norm": 0.29656529426574707,
+      "learning_rate": 0.0002995045045045045,
+      "loss": 0.6423,
+      "step": 21
+    },
+    {
+      "epoch": 0.006595465617388046,
+      "grad_norm": 0.3357860743999481,
+      "learning_rate": 0.00029945945945945943,
+      "loss": 0.6957,
+      "step": 22
+    },
+    {
+      "epoch": 0.006895259509087502,
+      "grad_norm": 0.28682708740234375,
+      "learning_rate": 0.00029941441441441437,
+      "loss": 0.6475,
+      "step": 23
+    },
+    {
+      "epoch": 0.007195053400786959,
+      "grad_norm": 0.29643768072128296,
+      "learning_rate": 0.0002993693693693693,
+      "loss": 0.6651,
+      "step": 24
+    },
+    {
+      "epoch": 0.007494847292486416,
+      "grad_norm": 0.2794937789440155,
+      "learning_rate": 0.0002993243243243243,
+      "loss": 0.6423,
+      "step": 25
+    },
+    {
+      "epoch": 0.007794641184185872,
+      "grad_norm": 0.3023228943347931,
+      "learning_rate": 0.00029927927927927923,
+      "loss": 0.6728,
+      "step": 26
+    },
+    {
+      "epoch": 0.008094435075885328,
+      "grad_norm": 0.27937522530555725,
+      "learning_rate": 0.00029923423423423417,
+      "loss": 0.6538,
+      "step": 27
+    },
+    {
+      "epoch": 0.008394228967584785,
+      "grad_norm": 0.2845768332481384,
+      "learning_rate": 0.00029918918918918916,
+      "loss": 0.6588,
+      "step": 28
+    },
+    {
+      "epoch": 0.008694022859284242,
+      "grad_norm": 0.2637081742286682,
+      "learning_rate": 0.0002991441441441441,
+      "loss": 0.6253,
+      "step": 29
+    },
+    {
+      "epoch": 0.008993816750983699,
+      "grad_norm": 0.25597694516181946,
+      "learning_rate": 0.00029909909909909903,
+      "loss": 0.5973,
+      "step": 30
+    },
+    {
+      "epoch": 0.009293610642683156,
+      "grad_norm": 0.28932246565818787,
+      "learning_rate": 0.000299054054054054,
+      "loss": 0.6607,
+      "step": 31
+    },
+    {
+      "epoch": 0.009593404534382613,
+      "grad_norm": 0.28410351276397705,
+      "learning_rate": 0.00029900900900900896,
+      "loss": 0.6496,
+      "step": 32
+    },
+    {
+      "epoch": 0.009893198426082068,
+      "grad_norm": 0.2708141803741455,
+      "learning_rate": 0.00029896396396396395,
+      "loss": 0.6161,
+      "step": 33
+    },
+    {
+      "epoch": 0.010192992317781525,
+      "grad_norm": 0.27171120047569275,
+      "learning_rate": 0.0002989189189189189,
+      "loss": 0.6179,
+      "step": 34
+    },
+    {
+      "epoch": 0.010492786209480982,
+      "grad_norm": 0.2806681990623474,
+      "learning_rate": 0.0002988738738738738,
+      "loss": 0.6111,
+      "step": 35
+    },
+    {
+      "epoch": 0.010792580101180439,
+      "grad_norm": 0.36722084879875183,
+      "learning_rate": 0.0002988288288288288,
+      "loss": 0.5778,
+      "step": 36
+    },
+    {
+      "epoch": 0.011092373992879895,
+      "grad_norm": 0.26182547211647034,
+      "learning_rate": 0.00029878378378378375,
+      "loss": 0.5941,
+      "step": 37
+    },
+    {
+      "epoch": 0.011392167884579352,
+      "grad_norm": 0.26753902435302734,
+      "learning_rate": 0.0002987387387387387,
+      "loss": 0.6019,
+      "step": 38
+    },
+    {
+      "epoch": 0.011691961776278808,
+      "grad_norm": 0.28038090467453003,
+      "learning_rate": 0.0002986936936936937,
+      "loss": 0.6379,
+      "step": 39
+    },
+    {
+      "epoch": 0.011991755667978264,
+      "grad_norm": 0.29290881752967834,
+      "learning_rate": 0.0002986486486486486,
+      "loss": 0.6743,
+      "step": 40
+    },
+    {
+      "epoch": 0.012291549559677721,
+      "grad_norm": 0.28465205430984497,
+      "learning_rate": 0.00029860360360360356,
+      "loss": 0.6214,
+      "step": 41
+    },
+    {
+      "epoch": 0.012591343451377178,
+      "grad_norm": 0.26730677485466003,
+      "learning_rate": 0.00029855855855855855,
+      "loss": 0.6085,
+      "step": 42
+    },
+    {
+      "epoch": 0.012891137343076635,
+      "grad_norm": 0.2801668643951416,
+      "learning_rate": 0.0002985135135135135,
+      "loss": 0.6541,
+      "step": 43
+    },
+    {
+      "epoch": 0.013190931234776092,
+      "grad_norm": 0.2741893529891968,
+      "learning_rate": 0.0002984684684684684,
+      "loss": 0.6102,
+      "step": 44
+    },
+    {
+      "epoch": 0.013490725126475547,
+      "grad_norm": 0.26284873485565186,
+      "learning_rate": 0.0002984234234234234,
+      "loss": 0.5762,
+      "step": 45
+    },
+    {
+      "epoch": 0.013790519018175004,
+      "grad_norm": 0.26464149355888367,
+      "learning_rate": 0.00029837837837837835,
+      "loss": 0.6238,
+      "step": 46
+    },
+    {
+      "epoch": 0.014090312909874461,
+      "grad_norm": 0.2674684226512909,
+      "learning_rate": 0.00029833333333333334,
+      "loss": 0.6342,
+      "step": 47
+    },
+    {
+      "epoch": 0.014390106801573918,
+      "grad_norm": 0.2591281533241272,
+      "learning_rate": 0.0002982882882882883,
+      "loss": 0.5993,
+      "step": 48
+    },
+    {
+      "epoch": 0.014689900693273375,
+      "grad_norm": 0.29611843824386597,
+      "learning_rate": 0.0002982432432432432,
+      "loss": 0.6376,
+      "step": 49
+    },
+    {
+      "epoch": 0.014989694584972832,
+      "grad_norm": 0.26478683948516846,
+      "learning_rate": 0.0002981981981981982,
+      "loss": 0.5935,
+      "step": 50
+    },
+    {
+      "epoch": 0.015289488476672287,
+      "grad_norm": 0.25142884254455566,
+      "learning_rate": 0.00029815315315315314,
+      "loss": 0.5522,
+      "step": 51
+    },
+    {
+      "epoch": 0.015589282368371744,
+      "grad_norm": 0.26863306760787964,
+      "learning_rate": 0.0002981081081081081,
+      "loss": 0.5808,
+      "step": 52
+    },
+    {
+      "epoch": 0.015889076260071203,
+      "grad_norm": 0.26126888394355774,
+      "learning_rate": 0.00029806306306306307,
+      "loss": 0.6101,
+      "step": 53
+    },
+    {
+      "epoch": 0.016188870151770656,
+      "grad_norm": 0.24878369271755219,
+      "learning_rate": 0.000298018018018018,
+      "loss": 0.5787,
+      "step": 54
+    },
+    {
+      "epoch": 0.016488664043470113,
+      "grad_norm": 0.2513170838356018,
+      "learning_rate": 0.00029797297297297294,
+      "loss": 0.5743,
+      "step": 55
+    },
+    {
+      "epoch": 0.01678845793516957,
+      "grad_norm": 0.26510271430015564,
+      "learning_rate": 0.00029792792792792793,
+      "loss": 0.6023,
+      "step": 56
+    },
+    {
+      "epoch": 0.017088251826869027,
+      "grad_norm": 0.2809905409812927,
+      "learning_rate": 0.00029788288288288287,
+      "loss": 0.6336,
+      "step": 57
+    },
+    {
+      "epoch": 0.017388045718568484,
+      "grad_norm": 0.250021755695343,
+      "learning_rate": 0.0002978378378378378,
+      "loss": 0.5385,
+      "step": 58
+    },
+    {
+      "epoch": 0.01768783961026794,
+      "grad_norm": 0.248112291097641,
+      "learning_rate": 0.0002977927927927928,
+      "loss": 0.5633,
+      "step": 59
+    },
+    {
+      "epoch": 0.017987633501967398,
+      "grad_norm": 0.2756728529930115,
+      "learning_rate": 0.00029774774774774773,
+      "loss": 0.5988,
+      "step": 60
+    },
+    {
+      "epoch": 0.018287427393666855,
+      "grad_norm": 0.26302388310432434,
+      "learning_rate": 0.00029770270270270267,
+      "loss": 0.5738,
+      "step": 61
+    },
+    {
+      "epoch": 0.01858722128536631,
+      "grad_norm": 0.2736607789993286,
+      "learning_rate": 0.00029765765765765766,
+      "loss": 0.5908,
+      "step": 62
+    },
+    {
+      "epoch": 0.01888701517706577,
+      "grad_norm": 0.26531675457954407,
+      "learning_rate": 0.0002976126126126126,
+      "loss": 0.6112,
+      "step": 63
+    },
+    {
+      "epoch": 0.019186809068765225,
+      "grad_norm": 0.252244234085083,
+      "learning_rate": 0.00029756756756756753,
+      "loss": 0.5921,
+      "step": 64
+    },
+    {
+      "epoch": 0.019486602960464682,
+      "grad_norm": 0.26548662781715393,
+      "learning_rate": 0.0002975225225225225,
+      "loss": 0.604,
+      "step": 65
+    },
+    {
+      "epoch": 0.019786396852164136,
+      "grad_norm": 0.2575795352458954,
+      "learning_rate": 0.00029747747747747746,
+      "loss": 0.564,
+      "step": 66
+    },
+    {
+      "epoch": 0.020086190743863593,
+      "grad_norm": 0.25535911321640015,
+      "learning_rate": 0.0002974324324324324,
+      "loss": 0.5962,
+      "step": 67
+    },
+    {
+      "epoch": 0.02038598463556305,
+      "grad_norm": 0.26686182618141174,
+      "learning_rate": 0.0002973873873873874,
+      "loss": 0.6268,
+      "step": 68
+    },
+    {
+      "epoch": 0.020685778527262506,
+      "grad_norm": 0.25553348660469055,
+      "learning_rate": 0.00029734234234234233,
+      "loss": 0.5583,
+      "step": 69
+    },
+    {
+      "epoch": 0.020985572418961963,
+      "grad_norm": 0.2537127435207367,
+      "learning_rate": 0.00029729729729729726,
+      "loss": 0.5734,
+      "step": 70
+    },
+    {
+      "epoch": 0.02128536631066142,
+      "grad_norm": 0.27586349844932556,
+      "learning_rate": 0.0002972522522522522,
+      "loss": 0.5667,
+      "step": 71
+    },
+    {
+      "epoch": 0.021585160202360877,
+      "grad_norm": 0.2614293694496155,
+      "learning_rate": 0.0002972072072072072,
+      "loss": 0.5831,
+      "step": 72
+    },
+    {
+      "epoch": 0.021884954094060334,
+      "grad_norm": 0.263944536447525,
+      "learning_rate": 0.00029716216216216213,
+      "loss": 0.5862,
+      "step": 73
+    },
+    {
+      "epoch": 0.02218474798575979,
+      "grad_norm": 0.24802720546722412,
+      "learning_rate": 0.00029711711711711707,
+      "loss": 0.5788,
+      "step": 74
+    },
+    {
+      "epoch": 0.022484541877459248,
+      "grad_norm": 0.2746400833129883,
+      "learning_rate": 0.00029707207207207206,
+      "loss": 0.591,
+      "step": 75
+    },
+    {
+      "epoch": 0.022784335769158705,
+      "grad_norm": 0.2553657591342926,
+      "learning_rate": 0.000297027027027027,
+      "loss": 0.5735,
+      "step": 76
+    },
+    {
+      "epoch": 0.02308412966085816,
+      "grad_norm": 0.2704835534095764,
+      "learning_rate": 0.00029698198198198193,
+      "loss": 0.5809,
+      "step": 77
+    },
+    {
+      "epoch": 0.023383923552557615,
+      "grad_norm": 0.2556537985801697,
+      "learning_rate": 0.0002969369369369369,
+      "loss": 0.5496,
+      "step": 78
+    },
+    {
+      "epoch": 0.023683717444257072,
+      "grad_norm": 0.2523523271083832,
+      "learning_rate": 0.00029689189189189186,
+      "loss": 0.5758,
+      "step": 79
+    },
+    {
+      "epoch": 0.02398351133595653,
+      "grad_norm": 0.25214141607284546,
+      "learning_rate": 0.0002968468468468468,
+      "loss": 0.5844,
+      "step": 80
+    },
+    {
+      "epoch": 0.024283305227655986,
+      "grad_norm": 0.2590884268283844,
+      "learning_rate": 0.0002968018018018018,
+      "loss": 0.5862,
+      "step": 81
+    },
+    {
+      "epoch": 0.024583099119355443,
+      "grad_norm": 0.24399450421333313,
+      "learning_rate": 0.0002967567567567567,
+      "loss": 0.5685,
+      "step": 82
+    },
+    {
+      "epoch": 0.0248828930110549,
+      "grad_norm": 0.24457746744155884,
+      "learning_rate": 0.00029671171171171166,
+      "loss": 0.5684,
+      "step": 83
+    },
+    {
+      "epoch": 0.025182686902754357,
+      "grad_norm": 0.23214662075042725,
+      "learning_rate": 0.00029666666666666665,
+      "loss": 0.5682,
+      "step": 84
+    },
+    {
+      "epoch": 0.025482480794453814,
+      "grad_norm": 0.24058258533477783,
+      "learning_rate": 0.0002966216216216216,
+      "loss": 0.5688,
+      "step": 85
+    },
+    {
+      "epoch": 0.02578227468615327,
+      "grad_norm": 0.2338346391916275,
+      "learning_rate": 0.0002965765765765765,
+      "loss": 0.5485,
+      "step": 86
+    },
+    {
+      "epoch": 0.026082068577852727,
+      "grad_norm": 0.2553529143333435,
+      "learning_rate": 0.0002965315315315315,
+      "loss": 0.5985,
+      "step": 87
+    },
+    {
+      "epoch": 0.026381862469552184,
+      "grad_norm": 0.2518393099308014,
+      "learning_rate": 0.00029648648648648645,
+      "loss": 0.6079,
+      "step": 88
+    },
+    {
+      "epoch": 0.02668165636125164,
+      "grad_norm": 0.23418371379375458,
+      "learning_rate": 0.0002964414414414414,
+      "loss": 0.5942,
+      "step": 89
+    },
+    {
+      "epoch": 0.026981450252951095,
+      "grad_norm": 0.2454022616147995,
+      "learning_rate": 0.0002963963963963964,
+      "loss": 0.5519,
+      "step": 90
+    },
+    {
+      "epoch": 0.02728124414465055,
+      "grad_norm": 0.25474220514297485,
+      "learning_rate": 0.0002963513513513513,
+      "loss": 0.5771,
+      "step": 91
+    },
+    {
+      "epoch": 0.02758103803635001,
+      "grad_norm": 0.2332638055086136,
+      "learning_rate": 0.00029630630630630625,
+      "loss": 0.5477,
+      "step": 92
+    },
+    {
+      "epoch": 0.027880831928049465,
+      "grad_norm": 0.23931057751178741,
+      "learning_rate": 0.00029626126126126124,
+      "loss": 0.5391,
+      "step": 93
+    },
+    {
+      "epoch": 0.028180625819748922,
+      "grad_norm": 0.23103167116641998,
+      "learning_rate": 0.0002962162162162162,
+      "loss": 0.5622,
+      "step": 94
+    },
+    {
+      "epoch": 0.02848041971144838,
+      "grad_norm": 0.2356046438217163,
+      "learning_rate": 0.0002961711711711711,
+      "loss": 0.5729,
+      "step": 95
+    },
+    {
+      "epoch": 0.028780213603147836,
+      "grad_norm": 0.26015767455101013,
+      "learning_rate": 0.0002961261261261261,
+      "loss": 0.5878,
+      "step": 96
+    },
+    {
+      "epoch": 0.029080007494847293,
+      "grad_norm": 0.2299613654613495,
+      "learning_rate": 0.00029608108108108104,
+      "loss": 0.5385,
+      "step": 97
+    },
+    {
+      "epoch": 0.02937980138654675,
+      "grad_norm": 0.24516573548316956,
+      "learning_rate": 0.000296036036036036,
+      "loss": 0.5688,
+      "step": 98
+    },
+    {
+      "epoch": 0.029679595278246207,
+      "grad_norm": 0.25019732117652893,
+      "learning_rate": 0.00029599099099099097,
+      "loss": 0.5562,
+      "step": 99
+    },
+    {
+      "epoch": 0.029979389169945664,
+      "grad_norm": 0.23934195935726166,
+      "learning_rate": 0.0002959459459459459,
+      "loss": 0.5572,
+      "step": 100
+    },
+    {
+      "epoch": 0.030279183061645117,
+      "grad_norm": 0.23651225864887238,
+      "learning_rate": 0.00029590090090090085,
+      "loss": 0.5517,
+      "step": 101
+    },
+    {
+      "epoch": 0.030578976953344574,
+      "grad_norm": 0.2545163631439209,
+      "learning_rate": 0.00029585585585585584,
+      "loss": 0.5553,
+      "step": 102
+    },
+    {
+      "epoch": 0.03087877084504403,
+      "grad_norm": 0.2259800285100937,
+      "learning_rate": 0.0002958108108108108,
+      "loss": 0.5216,
+      "step": 103
+    },
+    {
+      "epoch": 0.031178564736743488,
+      "grad_norm": 0.25137075781822205,
+      "learning_rate": 0.00029576576576576576,
+      "loss": 0.559,
+      "step": 104
+    },
+    {
+      "epoch": 0.03147835862844295,
+      "grad_norm": 0.2526615262031555,
+      "learning_rate": 0.0002957207207207207,
+      "loss": 0.6025,
+      "step": 105
+    },
+    {
+      "epoch": 0.031778152520142405,
+      "grad_norm": 0.2406662553548813,
+      "learning_rate": 0.00029567567567567564,
+      "loss": 0.562,
+      "step": 106
+    },
+    {
+      "epoch": 0.032077946411841855,
+      "grad_norm": 0.23598824441432953,
+      "learning_rate": 0.00029563063063063063,
+      "loss": 0.5749,
+      "step": 107
+    },
+    {
+      "epoch": 0.03237774030354131,
+      "grad_norm": 0.24079738557338715,
+      "learning_rate": 0.00029558558558558557,
+      "loss": 0.5697,
+      "step": 108
+    },
+    {
+      "epoch": 0.03267753419524077,
+      "grad_norm": 0.25064727663993835,
+      "learning_rate": 0.0002955405405405405,
+      "loss": 0.5821,
+      "step": 109
+    },
+    {
+      "epoch": 0.032977328086940226,
+      "grad_norm": 0.22557033598423004,
+      "learning_rate": 0.0002954954954954955,
+      "loss": 0.5367,
+      "step": 110
+    },
+    {
+      "epoch": 0.03327712197863968,
+      "grad_norm": 0.22542916238307953,
+      "learning_rate": 0.00029545045045045043,
+      "loss": 0.55,
+      "step": 111
+    },
+    {
+      "epoch": 0.03357691587033914,
+      "grad_norm": 0.24270494282245636,
+      "learning_rate": 0.00029540540540540537,
+      "loss": 0.5773,
+      "step": 112
+    },
+    {
+      "epoch": 0.0338767097620386,
+      "grad_norm": 0.23397719860076904,
+      "learning_rate": 0.00029536036036036036,
+      "loss": 0.5516,
+      "step": 113
+    },
+    {
+      "epoch": 0.034176503653738054,
+      "grad_norm": 0.2322990894317627,
+      "learning_rate": 0.0002953153153153153,
+      "loss": 0.5307,
+      "step": 114
+    },
+    {
+      "epoch": 0.03447629754543751,
+      "grad_norm": 0.2167540043592453,
+      "learning_rate": 0.0002952702702702703,
+      "loss": 0.5197,
+      "step": 115
+    },
+    {
+      "epoch": 0.03477609143713697,
+      "grad_norm": 0.2705434262752533,
+      "learning_rate": 0.0002952252252252252,
+      "loss": 0.564,
+      "step": 116
+    },
+    {
+      "epoch": 0.035075885328836424,
+      "grad_norm": 0.23195774853229523,
+      "learning_rate": 0.00029518018018018016,
+      "loss": 0.5397,
+      "step": 117
+    },
+    {
+      "epoch": 0.03537567922053588,
+      "grad_norm": 0.22944559156894684,
+      "learning_rate": 0.00029513513513513515,
+      "loss": 0.5197,
+      "step": 118
+    },
+    {
+      "epoch": 0.03567547311223534,
+      "grad_norm": 0.25140368938446045,
+      "learning_rate": 0.0002950900900900901,
+      "loss": 0.5661,
+      "step": 119
+    },
+    {
+      "epoch": 0.035975267003934795,
+      "grad_norm": 0.23137278854846954,
+      "learning_rate": 0.000295045045045045,
+      "loss": 0.5284,
+      "step": 120
+    },
+    {
+      "epoch": 0.03627506089563425,
+      "grad_norm": 0.2358487993478775,
+      "learning_rate": 0.00029499999999999996,
+      "loss": 0.551,
+      "step": 121
+    },
+    {
+      "epoch": 0.03657485478733371,
+      "grad_norm": 0.23212139308452606,
+      "learning_rate": 0.00029495495495495495,
+      "loss": 0.5624,
+      "step": 122
+    },
+    {
+      "epoch": 0.036874648679033166,
+      "grad_norm": 0.2504674196243286,
+      "learning_rate": 0.0002949099099099099,
+      "loss": 0.5649,
+      "step": 123
+    },
+    {
+      "epoch": 0.03717444257073262,
+      "grad_norm": 0.24574224650859833,
+      "learning_rate": 0.0002948648648648648,
+      "loss": 0.5415,
+      "step": 124
+    },
+    {
+      "epoch": 0.03747423646243208,
+      "grad_norm": 0.23007921874523163,
+      "learning_rate": 0.0002948198198198198,
+      "loss": 0.5624,
+      "step": 125
+    },
+    {
+      "epoch": 0.03777403035413154,
+      "grad_norm": 0.2503686547279358,
+      "learning_rate": 0.00029477477477477475,
+      "loss": 0.5484,
+      "step": 126
+    },
+    {
+      "epoch": 0.038073824245830994,
+      "grad_norm": 0.2467029094696045,
+      "learning_rate": 0.0002947297297297297,
+      "loss": 0.5442,
+      "step": 127
+    },
+    {
+      "epoch": 0.03837361813753045,
+      "grad_norm": 0.23382776975631714,
+      "learning_rate": 0.0002946846846846847,
+      "loss": 0.5485,
+      "step": 128
+    },
+    {
+      "epoch": 0.03867341202922991,
+      "grad_norm": 0.25080442428588867,
+      "learning_rate": 0.0002946396396396396,
+      "loss": 0.5726,
+      "step": 129
+    },
+    {
+      "epoch": 0.038973205920929364,
+      "grad_norm": 0.2410043627023697,
+      "learning_rate": 0.00029459459459459455,
+      "loss": 0.5475,
+      "step": 130
+    },
+    {
+      "epoch": 0.039272999812628814,
+      "grad_norm": 0.24639956653118134,
+      "learning_rate": 0.00029454954954954955,
+      "loss": 0.5461,
+      "step": 131
+    },
+    {
+      "epoch": 0.03957279370432827,
+      "grad_norm": 0.23122674226760864,
+      "learning_rate": 0.0002945045045045045,
+      "loss": 0.5605,
+      "step": 132
+    },
+    {
+      "epoch": 0.03987258759602773,
+      "grad_norm": 0.22207331657409668,
+      "learning_rate": 0.0002944594594594594,
+      "loss": 0.5045,
+      "step": 133
+    },
+    {
+      "epoch": 0.040172381487727185,
+      "grad_norm": 0.2344479262828827,
+      "learning_rate": 0.0002944144144144144,
+      "loss": 0.5724,
+      "step": 134
+    },
+    {
+      "epoch": 0.04047217537942664,
+      "grad_norm": 0.2410653978586197,
+      "learning_rate": 0.00029436936936936935,
+      "loss": 0.5266,
+      "step": 135
+    },
+    {
+      "epoch": 0.0407719692711261,
+      "grad_norm": 0.23028722405433655,
+      "learning_rate": 0.0002943243243243243,
+      "loss": 0.545,
+      "step": 136
+    },
+    {
+      "epoch": 0.041071763162825556,
+      "grad_norm": 0.23669356107711792,
+      "learning_rate": 0.0002942792792792793,
+      "loss": 0.5429,
+      "step": 137
+    },
+    {
+      "epoch": 0.04137155705452501,
+      "grad_norm": 0.24232889711856842,
+      "learning_rate": 0.0002942342342342342,
+      "loss": 0.5815,
+      "step": 138
+    },
+    {
+      "epoch": 0.04167135094622447,
+      "grad_norm": 0.22076158225536346,
+      "learning_rate": 0.00029418918918918915,
+      "loss": 0.5028,
+      "step": 139
+    },
+    {
+      "epoch": 0.04197114483792393,
+      "grad_norm": 0.23322072625160217,
+      "learning_rate": 0.00029414414414414414,
+      "loss": 0.5293,
+      "step": 140
+    },
+    {
+      "epoch": 0.042270938729623384,
+      "grad_norm": 0.24910598993301392,
+      "learning_rate": 0.0002940990990990991,
+      "loss": 0.5403,
+      "step": 141
+    },
+    {
+      "epoch": 0.04257073262132284,
+      "grad_norm": 0.24588559567928314,
+      "learning_rate": 0.000294054054054054,
+      "loss": 0.538,
+      "step": 142
+    },
+    {
+      "epoch": 0.0428705265130223,
+      "grad_norm": 0.20689445734024048,
+      "learning_rate": 0.00029400900900900895,
+      "loss": 0.4921,
+      "step": 143
+    },
+    {
+      "epoch": 0.043170320404721754,
+      "grad_norm": 0.2518969178199768,
+      "learning_rate": 0.00029396396396396394,
+      "loss": 0.5625,
+      "step": 144
+    },
+    {
+      "epoch": 0.04347011429642121,
+      "grad_norm": 0.25027644634246826,
+      "learning_rate": 0.0002939189189189189,
+      "loss": 0.5823,
+      "step": 145
+    },
+    {
+      "epoch": 0.04376990818812067,
+      "grad_norm": 0.24390611052513123,
+      "learning_rate": 0.0002938738738738738,
+      "loss": 0.5205,
+      "step": 146
+    },
+    {
+      "epoch": 0.044069702079820125,
+      "grad_norm": 0.24781368672847748,
+      "learning_rate": 0.0002938288288288288,
+      "loss": 0.5559,
+      "step": 147
+    },
+    {
+      "epoch": 0.04436949597151958,
+      "grad_norm": 0.23696455359458923,
+      "learning_rate": 0.00029378378378378374,
+      "loss": 0.5443,
+      "step": 148
+    },
+    {
+      "epoch": 0.04466928986321904,
+      "grad_norm": 0.23888202011585236,
+      "learning_rate": 0.0002937387387387387,
+      "loss": 0.5397,
+      "step": 149
+    },
+    {
+      "epoch": 0.044969083754918496,
+      "grad_norm": 0.22896665334701538,
+      "learning_rate": 0.00029369369369369367,
+      "loss": 0.5462,
+      "step": 150
+    },
+    {
+      "epoch": 0.04526887764661795,
+      "grad_norm": 0.23967792093753815,
+      "learning_rate": 0.0002936486486486486,
+      "loss": 0.5653,
+      "step": 151
+    },
+    {
+      "epoch": 0.04556867153831741,
+      "grad_norm": 0.2244715392589569,
+      "learning_rate": 0.00029360360360360354,
+      "loss": 0.5251,
+      "step": 152
+    },
+    {
+      "epoch": 0.045868465430016866,
+      "grad_norm": 0.2287164330482483,
+      "learning_rate": 0.00029355855855855853,
+      "loss": 0.5313,
+      "step": 153
+    },
+    {
+      "epoch": 0.04616825932171632,
+      "grad_norm": 0.2419881671667099,
+      "learning_rate": 0.00029351351351351347,
+      "loss": 0.5397,
+      "step": 154
+    },
+    {
+      "epoch": 0.04646805321341577,
+      "grad_norm": 0.2414129674434662,
+      "learning_rate": 0.0002934684684684684,
+      "loss": 0.574,
+      "step": 155
+    },
+    {
+      "epoch": 0.04676784710511523,
+      "grad_norm": 0.237702414393425,
+      "learning_rate": 0.0002934234234234234,
+      "loss": 0.5377,
+      "step": 156
+    },
+    {
+      "epoch": 0.04706764099681469,
+      "grad_norm": 0.24712331593036652,
+      "learning_rate": 0.00029337837837837833,
+      "loss": 0.5722,
+      "step": 157
+    },
+    {
+      "epoch": 0.047367434888514144,
+      "grad_norm": 0.23250596225261688,
+      "learning_rate": 0.00029333333333333327,
+      "loss": 0.5663,
+      "step": 158
+    },
+    {
+      "epoch": 0.0476672287802136,
+      "grad_norm": 0.22616314888000488,
+      "learning_rate": 0.00029328828828828826,
+      "loss": 0.4984,
+      "step": 159
+    },
+    {
+      "epoch": 0.04796702267191306,
+      "grad_norm": 0.23835696280002594,
+      "learning_rate": 0.0002932432432432432,
+      "loss": 0.5512,
+      "step": 160
+    },
+    {
+      "epoch": 0.048266816563612515,
+      "grad_norm": 0.23458854854106903,
+      "learning_rate": 0.0002931981981981982,
+      "loss": 0.5472,
+      "step": 161
+    },
+    {
+      "epoch": 0.04856661045531197,
+      "grad_norm": 0.2530427873134613,
+      "learning_rate": 0.0002931531531531531,
+      "loss": 0.5807,
+      "step": 162
+    },
+    {
+      "epoch": 0.04886640434701143,
+      "grad_norm": 0.23911581933498383,
+      "learning_rate": 0.00029310810810810806,
+      "loss": 0.5236,
+      "step": 163
+    },
+    {
+      "epoch": 0.049166198238710886,
+      "grad_norm": 0.24235881865024567,
+      "learning_rate": 0.00029306306306306305,
+      "loss": 0.5509,
+      "step": 164
+    },
+    {
+      "epoch": 0.04946599213041034,
+      "grad_norm": 0.23077982664108276,
+      "learning_rate": 0.000293018018018018,
+      "loss": 0.5265,
+      "step": 165
+    },
+    {
+      "epoch": 0.0497657860221098,
+      "grad_norm": 0.23159649968147278,
+      "learning_rate": 0.00029297297297297293,
+      "loss": 0.5521,
+      "step": 166
+    },
+    {
+      "epoch": 0.050065579913809256,
+      "grad_norm": 0.23901638388633728,
+      "learning_rate": 0.0002929279279279279,
+      "loss": 0.517,
+      "step": 167
+    },
+    {
+      "epoch": 0.05036537380550871,
+      "grad_norm": 0.2441125512123108,
+      "learning_rate": 0.00029288288288288286,
+      "loss": 0.5376,
+      "step": 168
+    },
+    {
+      "epoch": 0.05066516769720817,
+      "grad_norm": 0.2408357411623001,
+      "learning_rate": 0.0002928378378378378,
+      "loss": 0.5119,
+      "step": 169
+    },
+    {
+      "epoch": 0.05096496158890763,
+      "grad_norm": 0.24620820581912994,
+      "learning_rate": 0.0002927927927927928,
+      "loss": 0.5454,
+      "step": 170
+    },
+    {
+      "epoch": 0.051264755480607084,
+      "grad_norm": 0.23092059791088104,
+      "learning_rate": 0.0002927477477477477,
+      "loss": 0.5464,
+      "step": 171
+    },
+    {
+      "epoch": 0.05156454937230654,
+      "grad_norm": 0.23076176643371582,
+      "learning_rate": 0.0002927027027027027,
+      "loss": 0.5153,
+      "step": 172
+    },
+    {
+      "epoch": 0.051864343264006,
+      "grad_norm": 0.2518848478794098,
+      "learning_rate": 0.00029265765765765765,
+      "loss": 0.5416,
+      "step": 173
+    },
+    {
+      "epoch": 0.052164137155705455,
+      "grad_norm": 0.24004964530467987,
+      "learning_rate": 0.0002926126126126126,
+      "loss": 0.5625,
+      "step": 174
+    },
+    {
+      "epoch": 0.05246393104740491,
+      "grad_norm": 0.26169759035110474,
+      "learning_rate": 0.0002925675675675676,
+      "loss": 0.5322,
+      "step": 175
+    },
+    {
+      "epoch": 0.05276372493910437,
+      "grad_norm": 0.23733891546726227,
+      "learning_rate": 0.0002925225225225225,
+      "loss": 0.5526,
+      "step": 176
+    },
+    {
+      "epoch": 0.053063518830803826,
+      "grad_norm": 0.22720226645469666,
+      "learning_rate": 0.00029247747747747745,
+      "loss": 0.5108,
+      "step": 177
+    },
+    {
+      "epoch": 0.05336331272250328,
+      "grad_norm": 0.2425220012664795,
+      "learning_rate": 0.00029243243243243244,
+      "loss": 0.5337,
+      "step": 178
+    },
+    {
+      "epoch": 0.05366310661420273,
+      "grad_norm": 0.24859829246997833,
+      "learning_rate": 0.0002923873873873874,
+      "loss": 0.5239,
+      "step": 179
+    },
+    {
+      "epoch": 0.05396290050590219,
+      "grad_norm": 0.23471564054489136,
+      "learning_rate": 0.0002923423423423423,
+      "loss": 0.5372,
+      "step": 180
+    },
+    {
+      "epoch": 0.054262694397601646,
+      "grad_norm": 0.23243340849876404,
+      "learning_rate": 0.0002922972972972973,
+      "loss": 0.5283,
+      "step": 181
+    },
+    {
+      "epoch": 0.0545624882893011,
+      "grad_norm": 0.240774467587471,
+      "learning_rate": 0.00029225225225225224,
+      "loss": 0.523,
+      "step": 182
+    },
+    {
+      "epoch": 0.05486228218100056,
+      "grad_norm": 0.2518199384212494,
+      "learning_rate": 0.0002922072072072072,
+      "loss": 0.56,
+      "step": 183
+    },
+    {
+      "epoch": 0.05516207607270002,
+      "grad_norm": 0.22320473194122314,
+      "learning_rate": 0.00029216216216216217,
+      "loss": 0.5442,
+      "step": 184
+    },
+    {
+      "epoch": 0.055461869964399474,
+      "grad_norm": 0.24681055545806885,
+      "learning_rate": 0.0002921171171171171,
+      "loss": 0.5596,
+      "step": 185
+    },
+    {
+      "epoch": 0.05576166385609893,
+      "grad_norm": 0.23595312237739563,
+      "learning_rate": 0.00029207207207207204,
+      "loss": 0.5343,
+      "step": 186
+    },
+    {
+      "epoch": 0.05606145774779839,
+      "grad_norm": 0.2307872772216797,
+      "learning_rate": 0.00029202702702702703,
+      "loss": 0.5206,
+      "step": 187
+    },
+    {
+      "epoch": 0.056361251639497845,
+      "grad_norm": 0.24639058113098145,
+      "learning_rate": 0.00029198198198198197,
+      "loss": 0.5449,
+      "step": 188
+    },
+    {
+      "epoch": 0.0566610455311973,
+      "grad_norm": 0.24432207643985748,
+      "learning_rate": 0.0002919369369369369,
+      "loss": 0.5455,
+      "step": 189
+    },
+    {
+      "epoch": 0.05696083942289676,
+      "grad_norm": 0.23387478291988373,
+      "learning_rate": 0.0002918918918918919,
+      "loss": 0.5402,
+      "step": 190
+    },
+    {
+      "epoch": 0.057260633314596215,
+      "grad_norm": 0.2183016538619995,
+      "learning_rate": 0.00029184684684684684,
+      "loss": 0.4989,
+      "step": 191
+    },
+    {
+      "epoch": 0.05756042720629567,
+      "grad_norm": 0.23916368186473846,
+      "learning_rate": 0.00029180180180180177,
+      "loss": 0.5397,
+      "step": 192
+    },
+    {
+      "epoch": 0.05786022109799513,
+      "grad_norm": 0.24624724686145782,
+      "learning_rate": 0.0002917567567567567,
+      "loss": 0.5248,
+      "step": 193
+    },
+    {
+      "epoch": 0.058160014989694586,
+      "grad_norm": 0.2571977376937866,
+      "learning_rate": 0.0002917117117117117,
+      "loss": 0.587,
+      "step": 194
+    },
+    {
+      "epoch": 0.05845980888139404,
+      "grad_norm": 0.22828777134418488,
+      "learning_rate": 0.00029166666666666664,
+      "loss": 0.5424,
+      "step": 195
+    },
+    {
+      "epoch": 0.0587596027730935,
+      "grad_norm": 0.23892144858837128,
+      "learning_rate": 0.0002916216216216216,
+      "loss": 0.5228,
+      "step": 196
+    },
+    {
+      "epoch": 0.05905939666479296,
+      "grad_norm": 0.22904744744300842,
+      "learning_rate": 0.00029157657657657656,
+      "loss": 0.5052,
+      "step": 197
+    },
+    {
+      "epoch": 0.059359190556492414,
+      "grad_norm": 0.2399347573518753,
+      "learning_rate": 0.0002915315315315315,
+      "loss": 0.541,
+      "step": 198
+    },
+    {
+      "epoch": 0.05965898444819187,
+      "grad_norm": 0.2253408432006836,
+      "learning_rate": 0.00029148648648648644,
+      "loss": 0.5204,
+      "step": 199
+    },
+    {
+      "epoch": 0.05995877833989133,
+      "grad_norm": 0.23211434483528137,
+      "learning_rate": 0.00029144144144144143,
+      "loss": 0.5248,
+      "step": 200
+    },
+    {
+      "epoch": 0.060258572231590785,
+      "grad_norm": 0.24016128480434418,
+      "learning_rate": 0.00029139639639639637,
+      "loss": 0.5386,
+      "step": 201
+    },
+    {
+      "epoch": 0.060558366123290235,
+      "grad_norm": 0.2374415397644043,
+      "learning_rate": 0.0002913513513513513,
+      "loss": 0.4904,
+      "step": 202
+    },
+    {
+      "epoch": 0.06085816001498969,
+      "grad_norm": 0.257735013961792,
+      "learning_rate": 0.0002913063063063063,
+      "loss": 0.5253,
+      "step": 203
+    },
+    {
+      "epoch": 0.06115795390668915,
+      "grad_norm": 0.23416665196418762,
+      "learning_rate": 0.00029126126126126123,
+      "loss": 0.5164,
+      "step": 204
+    },
+    {
+      "epoch": 0.061457747798388605,
+      "grad_norm": 0.24382436275482178,
+      "learning_rate": 0.00029121621621621617,
+      "loss": 0.5507,
+      "step": 205
+    },
+    {
+      "epoch": 0.06175754169008806,
+      "grad_norm": 0.2488170564174652,
+      "learning_rate": 0.00029117117117117116,
+      "loss": 0.543,
+      "step": 206
+    },
+    {
+      "epoch": 0.06205733558178752,
+      "grad_norm": 0.24262337386608124,
+      "learning_rate": 0.0002911261261261261,
+      "loss": 0.5533,
+      "step": 207
+    },
+    {
+      "epoch": 0.062357129473486976,
+      "grad_norm": 0.2239450216293335,
+      "learning_rate": 0.00029108108108108103,
+      "loss": 0.4955,
+      "step": 208
+    },
+    {
+      "epoch": 0.06265692336518644,
+      "grad_norm": 0.22644869983196259,
+      "learning_rate": 0.000291036036036036,
+      "loss": 0.5362,
+      "step": 209
+    },
+    {
+      "epoch": 0.0629567172568859,
+      "grad_norm": 0.23194913566112518,
+      "learning_rate": 0.00029099099099099096,
+      "loss": 0.5226,
+      "step": 210
+    },
+    {
+      "epoch": 0.06325651114858535,
+      "grad_norm": 0.22114375233650208,
+      "learning_rate": 0.0002909459459459459,
+      "loss": 0.472,
+      "step": 211
+    },
+    {
+      "epoch": 0.06355630504028481,
+      "grad_norm": 0.24242356419563293,
+      "learning_rate": 0.0002909009009009009,
+      "loss": 0.5041,
+      "step": 212
+    },
+    {
+      "epoch": 0.06385609893198427,
+      "grad_norm": 0.24195095896720886,
+      "learning_rate": 0.0002908558558558558,
+      "loss": 0.511,
+      "step": 213
+    },
+    {
+      "epoch": 0.06415589282368371,
+      "grad_norm": 0.23720628023147583,
+      "learning_rate": 0.00029081081081081076,
+      "loss": 0.5257,
+      "step": 214
+    },
+    {
+      "epoch": 0.06445568671538317,
+      "grad_norm": 0.22656656801700592,
+      "learning_rate": 0.00029076576576576575,
+      "loss": 0.5336,
+      "step": 215
+    },
+    {
+      "epoch": 0.06475548060708262,
+      "grad_norm": 0.23326708376407623,
+      "learning_rate": 0.0002907207207207207,
+      "loss": 0.5184,
+      "step": 216
+    },
+    {
+      "epoch": 0.06505527449878208,
+      "grad_norm": 0.2327839583158493,
+      "learning_rate": 0.0002906756756756756,
+      "loss": 0.5434,
+      "step": 217
+    },
+    {
+      "epoch": 0.06535506839048154,
+      "grad_norm": 0.24181734025478363,
+      "learning_rate": 0.0002906306306306306,
+      "loss": 0.534,
+      "step": 218
+    },
+    {
+      "epoch": 0.065654862282181,
+      "grad_norm": 0.23325061798095703,
+      "learning_rate": 0.00029058558558558555,
+      "loss": 0.5438,
+      "step": 219
+    },
+    {
+      "epoch": 0.06595465617388045,
+      "grad_norm": 0.2559988498687744,
+      "learning_rate": 0.0002905405405405405,
+      "loss": 0.5229,
+      "step": 220
+    },
+    {
+      "epoch": 0.06625445006557991,
+      "grad_norm": 0.2432825118303299,
+      "learning_rate": 0.0002904954954954955,
+      "loss": 0.5223,
+      "step": 221
+    },
+    {
+      "epoch": 0.06655424395727937,
+      "grad_norm": 0.23795264959335327,
+      "learning_rate": 0.0002904504504504504,
+      "loss": 0.5199,
+      "step": 222
+    },
+    {
+      "epoch": 0.06685403784897882,
+      "grad_norm": 0.22999484837055206,
+      "learning_rate": 0.00029040540540540535,
+      "loss": 0.506,
+      "step": 223
+    },
+    {
+      "epoch": 0.06715383174067828,
+      "grad_norm": 0.22168515622615814,
+      "learning_rate": 0.00029036036036036034,
+      "loss": 0.4873,
+      "step": 224
+    },
+    {
+      "epoch": 0.06745362563237774,
+      "grad_norm": 0.23448102176189423,
+      "learning_rate": 0.0002903153153153153,
+      "loss": 0.5533,
+      "step": 225
+    },
+    {
+      "epoch": 0.0677534195240772,
+      "grad_norm": 0.23822058737277985,
+      "learning_rate": 0.0002902702702702702,
+      "loss": 0.5221,
+      "step": 226
+    },
+    {
+      "epoch": 0.06805321341577665,
+      "grad_norm": 0.22952431440353394,
+      "learning_rate": 0.0002902252252252252,
+      "loss": 0.5402,
+      "step": 227
+    },
+    {
+      "epoch": 0.06835300730747611,
+      "grad_norm": 0.2197679728269577,
+      "learning_rate": 0.00029018018018018015,
+      "loss": 0.5115,
+      "step": 228
+    },
+    {
+      "epoch": 0.06865280119917556,
+      "grad_norm": 0.23495204746723175,
+      "learning_rate": 0.00029013513513513514,
+      "loss": 0.4963,
+      "step": 229
+    },
+    {
+      "epoch": 0.06895259509087502,
+      "grad_norm": 0.2233276218175888,
+      "learning_rate": 0.0002900900900900901,
+      "loss": 0.5257,
+      "step": 230
+    },
+    {
+      "epoch": 0.06925238898257448,
+      "grad_norm": 0.23294100165367126,
+      "learning_rate": 0.000290045045045045,
+      "loss": 0.5231,
+      "step": 231
+    },
+    {
+      "epoch": 0.06955218287427394,
+      "grad_norm": 0.24349254369735718,
+      "learning_rate": 0.00029,
+      "loss": 0.5198,
+      "step": 232
+    },
+    {
+      "epoch": 0.06985197676597339,
+      "grad_norm": 0.22396647930145264,
+      "learning_rate": 0.00028995495495495494,
+      "loss": 0.5032,
+      "step": 233
+    },
+    {
+      "epoch": 0.07015177065767285,
+      "grad_norm": 0.23737500607967377,
+      "learning_rate": 0.0002899099099099099,
+      "loss": 0.5071,
+      "step": 234
+    },
+    {
+      "epoch": 0.0704515645493723,
+      "grad_norm": 0.22798973321914673,
+      "learning_rate": 0.00028986486486486487,
+      "loss": 0.4844,
+      "step": 235
+    },
+    {
+      "epoch": 0.07075135844107176,
+      "grad_norm": 0.24095383286476135,
+      "learning_rate": 0.0002898198198198198,
+      "loss": 0.5351,
+      "step": 236
+    },
+    {
+      "epoch": 0.07105115233277122,
+      "grad_norm": 0.23701000213623047,
+      "learning_rate": 0.00028977477477477474,
+      "loss": 0.5051,
+      "step": 237
+    },
+    {
+      "epoch": 0.07135094622447068,
+      "grad_norm": 0.23588530719280243,
+      "learning_rate": 0.00028972972972972973,
+      "loss": 0.4992,
+      "step": 238
+    },
+    {
+      "epoch": 0.07165074011617013,
+      "grad_norm": 0.27736473083496094,
+      "learning_rate": 0.00028968468468468467,
+      "loss": 0.5399,
+      "step": 239
+    },
+    {
+      "epoch": 0.07195053400786959,
+      "grad_norm": 0.2486957311630249,
+      "learning_rate": 0.0002896396396396396,
+      "loss": 0.543,
+      "step": 240
+    },
+    {
+      "epoch": 0.07225032789956905,
+      "grad_norm": 0.23474164307117462,
+      "learning_rate": 0.0002895945945945946,
+      "loss": 0.5275,
+      "step": 241
+    },
+    {
+      "epoch": 0.0725501217912685,
+      "grad_norm": 0.26560667157173157,
+      "learning_rate": 0.00028954954954954953,
+      "loss": 0.5576,
+      "step": 242
+    },
+    {
+      "epoch": 0.07284991568296796,
+      "grad_norm": 0.2260173261165619,
+      "learning_rate": 0.00028950450450450447,
+      "loss": 0.5046,
+      "step": 243
+    },
+    {
+      "epoch": 0.07314970957466742,
+      "grad_norm": 0.24497725069522858,
+      "learning_rate": 0.00028945945945945946,
+      "loss": 0.5093,
+      "step": 244
+    },
+    {
+      "epoch": 0.07344950346636687,
+      "grad_norm": 0.24871525168418884,
+      "learning_rate": 0.0002894144144144144,
+      "loss": 0.5353,
+      "step": 245
+    },
+    {
+      "epoch": 0.07374929735806633,
+      "grad_norm": 0.2592950761318207,
+      "learning_rate": 0.00028936936936936933,
+      "loss": 0.553,
+      "step": 246
+    },
+    {
+      "epoch": 0.07404909124976579,
+      "grad_norm": 0.23617064952850342,
+      "learning_rate": 0.0002893243243243243,
+      "loss": 0.5276,
+      "step": 247
+    },
+    {
+      "epoch": 0.07434888514146525,
+      "grad_norm": 0.23108692467212677,
+      "learning_rate": 0.00028927927927927926,
+      "loss": 0.5027,
+      "step": 248
+    },
+    {
+      "epoch": 0.0746486790331647,
+      "grad_norm": 0.2418566793203354,
+      "learning_rate": 0.0002892342342342342,
+      "loss": 0.5142,
+      "step": 249
+    },
+    {
+      "epoch": 0.07494847292486416,
+      "grad_norm": 0.2367243766784668,
+      "learning_rate": 0.0002891891891891892,
+      "loss": 0.5062,
+      "step": 250
+    },
+    {
+      "epoch": 0.07524826681656362,
+      "grad_norm": 0.2405940294265747,
+      "learning_rate": 0.0002891441441441441,
+      "loss": 0.4964,
+      "step": 251
+    },
+    {
+      "epoch": 0.07554806070826307,
+      "grad_norm": 0.22956568002700806,
+      "learning_rate": 0.00028909909909909906,
+      "loss": 0.5081,
+      "step": 252
+    },
+    {
+      "epoch": 0.07584785459996253,
+      "grad_norm": 0.24594880640506744,
+      "learning_rate": 0.00028905405405405405,
+      "loss": 0.5522,
+      "step": 253
+    },
+    {
+      "epoch": 0.07614764849166199,
+      "grad_norm": 0.25554656982421875,
+      "learning_rate": 0.000289009009009009,
+      "loss": 0.5579,
+      "step": 254
+    },
+    {
+      "epoch": 0.07644744238336144,
+      "grad_norm": 0.24296589195728302,
+      "learning_rate": 0.0002889639639639639,
+      "loss": 0.515,
+      "step": 255
+    },
+    {
+      "epoch": 0.0767472362750609,
+      "grad_norm": 0.2128724306821823,
+      "learning_rate": 0.0002889189189189189,
+      "loss": 0.5046,
+      "step": 256
+    },
+    {
+      "epoch": 0.07704703016676036,
+      "grad_norm": 0.2304687201976776,
+      "learning_rate": 0.00028887387387387385,
+      "loss": 0.5275,
+      "step": 257
+    },
+    {
+      "epoch": 0.07734682405845981,
+      "grad_norm": 0.23740506172180176,
+      "learning_rate": 0.0002888288288288288,
+      "loss": 0.5308,
+      "step": 258
+    },
+    {
+      "epoch": 0.07764661795015927,
+      "grad_norm": 0.2270033061504364,
+      "learning_rate": 0.0002887837837837838,
+      "loss": 0.5232,
+      "step": 259
+    },
+    {
+      "epoch": 0.07794641184185873,
+      "grad_norm": 0.24655793607234955,
+      "learning_rate": 0.0002887387387387387,
+      "loss": 0.5087,
+      "step": 260
+    },
+    {
+      "epoch": 0.07824620573355819,
+      "grad_norm": 0.25199684500694275,
+      "learning_rate": 0.00028869369369369366,
+      "loss": 0.5616,
+      "step": 261
+    },
+    {
+      "epoch": 0.07854599962525763,
+      "grad_norm": 0.2466064840555191,
+      "learning_rate": 0.00028864864864864865,
+      "loss": 0.5343,
+      "step": 262
+    },
+    {
+      "epoch": 0.07884579351695709,
+      "grad_norm": 0.2401546686887741,
+      "learning_rate": 0.0002886036036036036,
+      "loss": 0.5326,
+      "step": 263
+    },
+    {
+      "epoch": 0.07914558740865654,
+      "grad_norm": 0.2478310465812683,
+      "learning_rate": 0.0002885585585585585,
+      "loss": 0.5377,
+      "step": 264
+    },
+    {
+      "epoch": 0.079445381300356,
+      "grad_norm": 0.25078094005584717,
+      "learning_rate": 0.00028851351351351346,
+      "loss": 0.5065,
+      "step": 265
+    },
+    {
+      "epoch": 0.07974517519205546,
+      "grad_norm": 0.24395832419395447,
+      "learning_rate": 0.00028846846846846845,
+      "loss": 0.5066,
+      "step": 266
+    },
+    {
+      "epoch": 0.08004496908375491,
+      "grad_norm": 0.23809655010700226,
+      "learning_rate": 0.0002884234234234234,
+      "loss": 0.5032,
+      "step": 267
+    },
+    {
+      "epoch": 0.08034476297545437,
+      "grad_norm": 0.25203442573547363,
+      "learning_rate": 0.0002883783783783783,
+      "loss": 0.5539,
+      "step": 268
+    },
+    {
+      "epoch": 0.08064455686715383,
+      "grad_norm": 0.22731390595436096,
+      "learning_rate": 0.0002883333333333333,
+      "loss": 0.5056,
+      "step": 269
+    },
+    {
+      "epoch": 0.08094435075885328,
+      "grad_norm": 0.26038557291030884,
+      "learning_rate": 0.00028828828828828825,
+      "loss": 0.5554,
+      "step": 270
+    },
+    {
+      "epoch": 0.08124414465055274,
+      "grad_norm": 0.22735048830509186,
+      "learning_rate": 0.0002882432432432432,
+      "loss": 0.513,
+      "step": 271
+    },
+    {
+      "epoch": 0.0815439385422522,
+      "grad_norm": 0.2459019273519516,
+      "learning_rate": 0.0002881981981981982,
+      "loss": 0.5074,
+      "step": 272
+    },
+    {
+      "epoch": 0.08184373243395165,
+      "grad_norm": 0.23949427902698517,
+      "learning_rate": 0.0002881531531531531,
+      "loss": 0.5289,
+      "step": 273
+    },
+    {
+      "epoch": 0.08214352632565111,
+      "grad_norm": 0.22845999896526337,
+      "learning_rate": 0.00028810810810810805,
+      "loss": 0.5046,
+      "step": 274
+    },
+    {
+      "epoch": 0.08244332021735057,
+      "grad_norm": 0.23616977035999298,
+      "learning_rate": 0.00028806306306306304,
+      "loss": 0.4766,
+      "step": 275
+    },
+    {
+      "epoch": 0.08274311410905003,
+      "grad_norm": 0.24239955842494965,
+      "learning_rate": 0.000288018018018018,
+      "loss": 0.5177,
+      "step": 276
+    },
+    {
+      "epoch": 0.08304290800074948,
+      "grad_norm": 0.22668063640594482,
+      "learning_rate": 0.0002879729729729729,
+      "loss": 0.5343,
+      "step": 277
+    },
+    {
+      "epoch": 0.08334270189244894,
+      "grad_norm": 0.23789212107658386,
+      "learning_rate": 0.0002879279279279279,
+      "loss": 0.5325,
+      "step": 278
+    },
+    {
+      "epoch": 0.0836424957841484,
+      "grad_norm": 0.23370423913002014,
+      "learning_rate": 0.00028788288288288284,
+      "loss": 0.5054,
+      "step": 279
+    },
+    {
+      "epoch": 0.08394228967584785,
+      "grad_norm": 0.2391810417175293,
+      "learning_rate": 0.0002878378378378378,
+      "loss": 0.5128,
+      "step": 280
+    },
+    {
+      "epoch": 0.08424208356754731,
+      "grad_norm": 0.2511520981788635,
+      "learning_rate": 0.00028779279279279277,
+      "loss": 0.5205,
+      "step": 281
+    },
+    {
+      "epoch": 0.08454187745924677,
+      "grad_norm": 0.25624001026153564,
+      "learning_rate": 0.0002877477477477477,
+      "loss": 0.5341,
+      "step": 282
+    },
+    {
+      "epoch": 0.08484167135094622,
+      "grad_norm": 0.22454805672168732,
+      "learning_rate": 0.00028770270270270264,
+      "loss": 0.4776,
+      "step": 283
+    },
+    {
+      "epoch": 0.08514146524264568,
+      "grad_norm": 0.24822303652763367,
+      "learning_rate": 0.00028765765765765764,
+      "loss": 0.5545,
+      "step": 284
+    },
+    {
+      "epoch": 0.08544125913434514,
+      "grad_norm": 0.24899134039878845,
+      "learning_rate": 0.00028761261261261257,
+      "loss": 0.5081,
+      "step": 285
+    },
+    {
+      "epoch": 0.0857410530260446,
+      "grad_norm": 0.22409197688102722,
+      "learning_rate": 0.00028756756756756756,
+      "loss": 0.4964,
+      "step": 286
+    },
+    {
+      "epoch": 0.08604084691774405,
+      "grad_norm": 0.24713397026062012,
+      "learning_rate": 0.0002875225225225225,
+      "loss": 0.5604,
+      "step": 287
+    },
+    {
+      "epoch": 0.08634064080944351,
+      "grad_norm": 0.2386777251958847,
+      "learning_rate": 0.00028747747747747744,
+      "loss": 0.5158,
+      "step": 288
+    },
+    {
+      "epoch": 0.08664043470114297,
+      "grad_norm": 0.2695513963699341,
+      "learning_rate": 0.00028743243243243243,
+      "loss": 0.5289,
+      "step": 289
+    },
+    {
+      "epoch": 0.08694022859284242,
+      "grad_norm": 0.23554596304893494,
+      "learning_rate": 0.00028738738738738736,
+      "loss": 0.5002,
+      "step": 290
+    },
+    {
+      "epoch": 0.08724002248454188,
+      "grad_norm": 0.2348342090845108,
+      "learning_rate": 0.0002873423423423423,
+      "loss": 0.5026,
+      "step": 291
+    },
+    {
+      "epoch": 0.08753981637624134,
+      "grad_norm": 0.24448086321353912,
+      "learning_rate": 0.0002872972972972973,
+      "loss": 0.508,
+      "step": 292
+    },
+    {
+      "epoch": 0.0878396102679408,
+      "grad_norm": 0.22410009801387787,
+      "learning_rate": 0.00028725225225225223,
+      "loss": 0.4932,
+      "step": 293
+    },
+    {
+      "epoch": 0.08813940415964025,
+      "grad_norm": 0.25097909569740295,
+      "learning_rate": 0.00028720720720720717,
+      "loss": 0.5196,
+      "step": 294
+    },
+    {
+      "epoch": 0.0884391980513397,
+      "grad_norm": 0.23564878106117249,
+      "learning_rate": 0.00028716216216216216,
+      "loss": 0.4871,
+      "step": 295
+    },
+    {
+      "epoch": 0.08873899194303916,
+      "grad_norm": 0.22870436310768127,
+      "learning_rate": 0.0002871171171171171,
+      "loss": 0.4717,
+      "step": 296
+    },
+    {
+      "epoch": 0.08903878583473862,
+      "grad_norm": 0.24509447813034058,
+      "learning_rate": 0.00028707207207207203,
+      "loss": 0.5417,
+      "step": 297
+    },
+    {
+      "epoch": 0.08933857972643808,
+      "grad_norm": 0.22554557025432587,
+      "learning_rate": 0.000287027027027027,
+      "loss": 0.5081,
+      "step": 298
+    },
+    {
+      "epoch": 0.08963837361813753,
+      "grad_norm": 0.2863612473011017,
+      "learning_rate": 0.00028698198198198196,
+      "loss": 0.5453,
+      "step": 299
+    },
+    {
+      "epoch": 0.08993816750983699,
+      "grad_norm": 0.23862136900424957,
+      "learning_rate": 0.00028693693693693695,
+      "loss": 0.4938,
+      "step": 300
+    },
+    {
+      "epoch": 0.09023796140153645,
+      "grad_norm": 0.2628205120563507,
+      "learning_rate": 0.0002868918918918919,
+      "loss": 0.5392,
+      "step": 301
+    },
+    {
+      "epoch": 0.0905377552932359,
+      "grad_norm": 0.23156946897506714,
+      "learning_rate": 0.0002868468468468468,
+      "loss": 0.4983,
+      "step": 302
+    },
+    {
+      "epoch": 0.09083754918493536,
+      "grad_norm": 0.27579790353775024,
+      "learning_rate": 0.0002868018018018018,
+      "loss": 0.5343,
+      "step": 303
+    },
+    {
+      "epoch": 0.09113734307663482,
+      "grad_norm": 0.22691994905471802,
+      "learning_rate": 0.00028675675675675675,
+      "loss": 0.4828,
+      "step": 304
+    },
+    {
+      "epoch": 0.09143713696833428,
+      "grad_norm": 0.23544538021087646,
+      "learning_rate": 0.0002867117117117117,
+      "loss": 0.5143,
+      "step": 305
+    },
+    {
+      "epoch": 0.09173693086003373,
+      "grad_norm": 0.22597934305667877,
+      "learning_rate": 0.0002866666666666667,
+      "loss": 0.5059,
+      "step": 306
+    },
+    {
+      "epoch": 0.09203672475173319,
+      "grad_norm": 0.23851321637630463,
+      "learning_rate": 0.0002866216216216216,
+      "loss": 0.5127,
+      "step": 307
+    },
+    {
+      "epoch": 0.09233651864343265,
+      "grad_norm": 0.2260097861289978,
+      "learning_rate": 0.00028657657657657655,
+      "loss": 0.4901,
+      "step": 308
+    },
+    {
+      "epoch": 0.09263631253513209,
+      "grad_norm": 0.2568291127681732,
+      "learning_rate": 0.00028653153153153154,
+      "loss": 0.556,
+      "step": 309
+    },
+    {
+      "epoch": 0.09293610642683155,
+      "grad_norm": 0.23510834574699402,
+      "learning_rate": 0.0002864864864864865,
+      "loss": 0.5254,
+      "step": 310
+    },
+    {
+      "epoch": 0.093235900318531,
+      "grad_norm": 0.24556247889995575,
+      "learning_rate": 0.0002864414414414414,
+      "loss": 0.5292,
+      "step": 311
+    },
+    {
+      "epoch": 0.09353569421023046,
+      "grad_norm": 0.2357538938522339,
+      "learning_rate": 0.00028639639639639635,
+      "loss": 0.5321,
+      "step": 312
+    },
+    {
+      "epoch": 0.09383548810192992,
+      "grad_norm": 0.25484514236450195,
+      "learning_rate": 0.00028635135135135134,
+      "loss": 0.5449,
+      "step": 313
+    },
+    {
+      "epoch": 0.09413528199362937,
+      "grad_norm": 0.243759885430336,
+      "learning_rate": 0.0002863063063063063,
+      "loss": 0.4766,
+      "step": 314
+    },
+    {
+      "epoch": 0.09443507588532883,
+      "grad_norm": 0.232033833861351,
+      "learning_rate": 0.0002862612612612612,
+      "loss": 0.481,
+      "step": 315
+    },
+    {
+      "epoch": 0.09473486977702829,
+      "grad_norm": 0.24136248230934143,
+      "learning_rate": 0.0002862162162162162,
+      "loss": 0.541,
+      "step": 316
+    },
+    {
+      "epoch": 0.09503466366872775,
+      "grad_norm": 0.23708300292491913,
+      "learning_rate": 0.00028617117117117114,
+      "loss": 0.5298,
+      "step": 317
+    },
+    {
+      "epoch": 0.0953344575604272,
+      "grad_norm": 0.22408358752727509,
+      "learning_rate": 0.0002861261261261261,
+      "loss": 0.4818,
+      "step": 318
+    },
+    {
+      "epoch": 0.09563425145212666,
+      "grad_norm": 0.22969000041484833,
+      "learning_rate": 0.00028608108108108107,
+      "loss": 0.4668,
+      "step": 319
+    },
+    {
+      "epoch": 0.09593404534382612,
+      "grad_norm": 0.233534574508667,
+      "learning_rate": 0.000286036036036036,
+      "loss": 0.506,
+      "step": 320
+    },
+    {
+      "epoch": 0.09623383923552557,
+      "grad_norm": 0.2442328929901123,
+      "learning_rate": 0.00028599099099099095,
+      "loss": 0.5208,
+      "step": 321
+    },
+    {
+      "epoch": 0.09653363312722503,
+      "grad_norm": 0.22215022146701813,
+      "learning_rate": 0.00028594594594594594,
+      "loss": 0.4963,
+      "step": 322
+    },
+    {
+      "epoch": 0.09683342701892449,
+      "grad_norm": 0.23993152379989624,
+      "learning_rate": 0.0002859009009009009,
+      "loss": 0.4945,
+      "step": 323
+    },
+    {
+      "epoch": 0.09713322091062394,
+      "grad_norm": 0.22987253963947296,
+      "learning_rate": 0.0002858558558558558,
+      "loss": 0.4895,
+      "step": 324
+    },
+    {
+      "epoch": 0.0974330148023234,
+      "grad_norm": 0.25025674700737,
+      "learning_rate": 0.0002858108108108108,
+      "loss": 0.5053,
+      "step": 325
+    },
+    {
+      "epoch": 0.09773280869402286,
+      "grad_norm": 0.21882835030555725,
+      "learning_rate": 0.00028576576576576574,
+      "loss": 0.4844,
+      "step": 326
+    },
+    {
+      "epoch": 0.09803260258572231,
+      "grad_norm": 0.23361052572727203,
+      "learning_rate": 0.0002857207207207207,
+      "loss": 0.5159,
+      "step": 327
+    },
+    {
+      "epoch": 0.09833239647742177,
+      "grad_norm": 0.23713140189647675,
+      "learning_rate": 0.00028567567567567567,
+      "loss": 0.5419,
+      "step": 328
+    },
+    {
+      "epoch": 0.09863219036912123,
+      "grad_norm": 0.2439422458410263,
+      "learning_rate": 0.0002856306306306306,
+      "loss": 0.5068,
+      "step": 329
+    },
+    {
+      "epoch": 0.09893198426082069,
+      "grad_norm": 0.23442302644252777,
+      "learning_rate": 0.00028558558558558554,
+      "loss": 0.5041,
+      "step": 330
+    },
+    {
+      "epoch": 0.09923177815252014,
+      "grad_norm": 0.2295604944229126,
+      "learning_rate": 0.00028554054054054053,
+      "loss": 0.4838,
+      "step": 331
+    },
+    {
+      "epoch": 0.0995315720442196,
+      "grad_norm": 0.2328769713640213,
+      "learning_rate": 0.00028549549549549547,
+      "loss": 0.5118,
+      "step": 332
+    },
+    {
+      "epoch": 0.09983136593591906,
+      "grad_norm": 0.21605005860328674,
+      "learning_rate": 0.0002854504504504504,
+      "loss": 0.4786,
+      "step": 333
+    },
+    {
+      "epoch": 0.10013115982761851,
+      "grad_norm": 0.22740024328231812,
+      "learning_rate": 0.0002854054054054054,
+      "loss": 0.5116,
+      "step": 334
+    },
+    {
+      "epoch": 0.10043095371931797,
+      "grad_norm": 0.24437595903873444,
+      "learning_rate": 0.00028536036036036033,
+      "loss": 0.5467,
+      "step": 335
+    },
+    {
+      "epoch": 0.10073074761101743,
+      "grad_norm": 0.2281450480222702,
+      "learning_rate": 0.00028531531531531527,
+      "loss": 0.5269,
+      "step": 336
+    },
+    {
+      "epoch": 0.10103054150271688,
+      "grad_norm": 0.2343592643737793,
+      "learning_rate": 0.0002852702702702702,
+      "loss": 0.5176,
+      "step": 337
+    },
+    {
+      "epoch": 0.10133033539441634,
+      "grad_norm": 0.24275358021259308,
+      "learning_rate": 0.0002852252252252252,
+      "loss": 0.5103,
+      "step": 338
+    },
+    {
+      "epoch": 0.1016301292861158,
+      "grad_norm": 0.2498926818370819,
+      "learning_rate": 0.00028518018018018013,
+      "loss": 0.5282,
+      "step": 339
+    },
+    {
+      "epoch": 0.10192992317781525,
+      "grad_norm": 0.23029617965221405,
+      "learning_rate": 0.00028513513513513507,
+      "loss": 0.512,
+      "step": 340
+    },
+    {
+      "epoch": 0.10222971706951471,
+      "grad_norm": 0.2608179450035095,
+      "learning_rate": 0.00028509009009009006,
+      "loss": 0.5419,
+      "step": 341
+    },
+    {
+      "epoch": 0.10252951096121417,
+      "grad_norm": 0.24309523403644562,
+      "learning_rate": 0.000285045045045045,
+      "loss": 0.5178,
+      "step": 342
+    },
+    {
+      "epoch": 0.10282930485291362,
+      "grad_norm": 0.22357851266860962,
+      "learning_rate": 0.000285,
+      "loss": 0.487,
+      "step": 343
+    },
+    {
+      "epoch": 0.10312909874461308,
+      "grad_norm": 0.2575152814388275,
+      "learning_rate": 0.0002849549549549549,
+      "loss": 0.5163,
+      "step": 344
+    },
+    {
+      "epoch": 0.10342889263631254,
+      "grad_norm": 0.25661131739616394,
+      "learning_rate": 0.00028490990990990986,
+      "loss": 0.5229,
+      "step": 345
+    },
+    {
+      "epoch": 0.103728686528012,
+      "grad_norm": 0.25328096747398376,
+      "learning_rate": 0.00028486486486486485,
+      "loss": 0.5456,
+      "step": 346
+    },
+    {
+      "epoch": 0.10402848041971145,
+      "grad_norm": 0.2357887625694275,
+      "learning_rate": 0.0002848198198198198,
+      "loss": 0.4824,
+      "step": 347
+    },
+    {
+      "epoch": 0.10432827431141091,
+      "grad_norm": 0.2275196760892868,
+      "learning_rate": 0.0002847747747747747,
+      "loss": 0.4899,
+      "step": 348
+    },
+    {
+      "epoch": 0.10462806820311037,
+      "grad_norm": 0.2616022229194641,
+      "learning_rate": 0.0002847297297297297,
+      "loss": 0.543,
+      "step": 349
+    },
+    {
+      "epoch": 0.10492786209480982,
+      "grad_norm": 0.23371827602386475,
+      "learning_rate": 0.00028468468468468465,
+      "loss": 0.4876,
+      "step": 350
+    },
+    {
+      "epoch": 0.10522765598650928,
+      "grad_norm": 0.21652457118034363,
+      "learning_rate": 0.0002846396396396396,
+      "loss": 0.5051,
+      "step": 351
+    },
+    {
+      "epoch": 0.10552744987820874,
+      "grad_norm": 0.2464732825756073,
+      "learning_rate": 0.0002845945945945946,
+      "loss": 0.4993,
+      "step": 352
+    },
+    {
+      "epoch": 0.1058272437699082,
+      "grad_norm": 0.24194246530532837,
+      "learning_rate": 0.0002845495495495495,
+      "loss": 0.492,
+      "step": 353
+    },
+    {
+      "epoch": 0.10612703766160765,
+      "grad_norm": 0.21986526250839233,
+      "learning_rate": 0.00028450450450450446,
+      "loss": 0.4668,
+      "step": 354
+    },
+    {
+      "epoch": 0.10642683155330711,
+      "grad_norm": 0.2386123389005661,
+      "learning_rate": 0.00028445945945945945,
+      "loss": 0.4823,
+      "step": 355
+    },
+    {
+      "epoch": 0.10672662544500656,
+      "grad_norm": 0.2368634194135666,
+      "learning_rate": 0.0002844144144144144,
+      "loss": 0.5119,
+      "step": 356
+    },
+    {
+      "epoch": 0.10702641933670601,
+      "grad_norm": 0.24802769720554352,
+      "learning_rate": 0.0002843693693693694,
+      "loss": 0.5298,
+      "step": 357
+    },
+    {
+      "epoch": 0.10732621322840546,
+      "grad_norm": 0.25534310936927795,
+      "learning_rate": 0.0002843243243243243,
+      "loss": 0.5107,
+      "step": 358
+    },
+    {
+      "epoch": 0.10762600712010492,
+      "grad_norm": 0.24845390021800995,
+      "learning_rate": 0.00028427927927927925,
+      "loss": 0.539,
+      "step": 359
+    },
+    {
+      "epoch": 0.10792580101180438,
+      "grad_norm": 0.22670955955982208,
+      "learning_rate": 0.00028423423423423424,
+      "loss": 0.5013,
+      "step": 360
+    },
+    {
+      "epoch": 0.10822559490350384,
+      "grad_norm": 0.21695859730243683,
+      "learning_rate": 0.0002841891891891892,
+      "loss": 0.468,
+      "step": 361
+    },
+    {
+      "epoch": 0.10852538879520329,
+      "grad_norm": 0.24691784381866455,
+      "learning_rate": 0.0002841441441441441,
+      "loss": 0.5011,
+      "step": 362
+    },
+    {
+      "epoch": 0.10882518268690275,
+      "grad_norm": 0.23244348168373108,
+      "learning_rate": 0.0002840990990990991,
+      "loss": 0.481,
+      "step": 363
+    },
+    {
+      "epoch": 0.1091249765786022,
+      "grad_norm": 0.23756597936153412,
+      "learning_rate": 0.00028405405405405404,
+      "loss": 0.4869,
+      "step": 364
+    },
+    {
+      "epoch": 0.10942477047030166,
+      "grad_norm": 0.26336169242858887,
+      "learning_rate": 0.000284009009009009,
+      "loss": 0.5216,
+      "step": 365
+    },
+    {
+      "epoch": 0.10972456436200112,
+      "grad_norm": 0.23734593391418457,
+      "learning_rate": 0.00028396396396396397,
+      "loss": 0.502,
+      "step": 366
+    },
+    {
+      "epoch": 0.11002435825370058,
+      "grad_norm": 0.2188960462808609,
+      "learning_rate": 0.0002839189189189189,
+      "loss": 0.4784,
+      "step": 367
+    },
+    {
+      "epoch": 0.11032415214540003,
+      "grad_norm": 0.26715338230133057,
+      "learning_rate": 0.00028387387387387384,
+      "loss": 0.4997,
+      "step": 368
+    },
+    {
+      "epoch": 0.11062394603709949,
+      "grad_norm": 0.23621448874473572,
+      "learning_rate": 0.00028382882882882883,
+      "loss": 0.4819,
+      "step": 369
+    },
+    {
+      "epoch": 0.11092373992879895,
+      "grad_norm": 0.23166733980178833,
+      "learning_rate": 0.00028378378378378377,
+      "loss": 0.4949,
+      "step": 370
+    },
+    {
+      "epoch": 0.1112235338204984,
+      "grad_norm": 0.22275973856449127,
+      "learning_rate": 0.0002837387387387387,
+      "loss": 0.4893,
+      "step": 371
+    },
+    {
+      "epoch": 0.11152332771219786,
+      "grad_norm": 0.22097566723823547,
+      "learning_rate": 0.0002836936936936937,
+      "loss": 0.5011,
+      "step": 372
+    },
+    {
+      "epoch": 0.11182312160389732,
+      "grad_norm": 0.25464650988578796,
+      "learning_rate": 0.00028364864864864863,
+      "loss": 0.5369,
+      "step": 373
+    },
+    {
+      "epoch": 0.11212291549559678,
+      "grad_norm": 0.2584247589111328,
+      "learning_rate": 0.00028360360360360357,
+      "loss": 0.5271,
+      "step": 374
+    },
+    {
+      "epoch": 0.11242270938729623,
+      "grad_norm": 0.2229800671339035,
+      "learning_rate": 0.00028355855855855856,
+      "loss": 0.4919,
+      "step": 375
+    },
+    {
+      "epoch": 0.11272250327899569,
+      "grad_norm": 0.25196340680122375,
+      "learning_rate": 0.0002835135135135135,
+      "loss": 0.4935,
+      "step": 376
+    },
+    {
+      "epoch": 0.11302229717069515,
+      "grad_norm": 0.23945283889770508,
+      "learning_rate": 0.00028346846846846843,
+      "loss": 0.5158,
+      "step": 377
+    },
+    {
+      "epoch": 0.1133220910623946,
+      "grad_norm": 0.22441525757312775,
+      "learning_rate": 0.0002834234234234234,
+      "loss": 0.5106,
+      "step": 378
+    },
+    {
+      "epoch": 0.11362188495409406,
+      "grad_norm": 0.24109874665737152,
+      "learning_rate": 0.00028337837837837836,
+      "loss": 0.4646,
+      "step": 379
+    },
+    {
+      "epoch": 0.11392167884579352,
+      "grad_norm": 0.23156128823757172,
+      "learning_rate": 0.0002833333333333333,
+      "loss": 0.4752,
+      "step": 380
+    },
+    {
+      "epoch": 0.11422147273749297,
+      "grad_norm": 0.24328507483005524,
+      "learning_rate": 0.0002832882882882883,
+      "loss": 0.5225,
+      "step": 381
+    },
+    {
+      "epoch": 0.11452126662919243,
+      "grad_norm": 0.2364337146282196,
+      "learning_rate": 0.00028324324324324323,
+      "loss": 0.4965,
+      "step": 382
+    },
+    {
+      "epoch": 0.11482106052089189,
+      "grad_norm": 0.24130620062351227,
+      "learning_rate": 0.00028319819819819816,
+      "loss": 0.5058,
+      "step": 383
+    },
+    {
+      "epoch": 0.11512085441259134,
+      "grad_norm": 0.2482600063085556,
+      "learning_rate": 0.00028315315315315315,
+      "loss": 0.5142,
+      "step": 384
+    },
+    {
+      "epoch": 0.1154206483042908,
+      "grad_norm": 0.2426941990852356,
+      "learning_rate": 0.0002831081081081081,
+      "loss": 0.5185,
+      "step": 385
+    },
+    {
+      "epoch": 0.11572044219599026,
+      "grad_norm": 0.24431252479553223,
+      "learning_rate": 0.00028306306306306303,
+      "loss": 0.4946,
+      "step": 386
+    },
+    {
+      "epoch": 0.11602023608768972,
+      "grad_norm": 0.22974024713039398,
+      "learning_rate": 0.00028301801801801797,
+      "loss": 0.4845,
+      "step": 387
+    },
+    {
+      "epoch": 0.11632002997938917,
+      "grad_norm": 0.26395878195762634,
+      "learning_rate": 0.00028297297297297296,
+      "loss": 0.512,
+      "step": 388
+    },
+    {
+      "epoch": 0.11661982387108863,
+      "grad_norm": 0.2300662398338318,
+      "learning_rate": 0.0002829279279279279,
+      "loss": 0.5161,
+      "step": 389
+    },
+    {
+      "epoch": 0.11691961776278809,
+      "grad_norm": 0.23376531898975372,
+      "learning_rate": 0.00028288288288288283,
+      "loss": 0.5165,
+      "step": 390
+    },
+    {
+      "epoch": 0.11721941165448754,
+      "grad_norm": 0.23775231838226318,
+      "learning_rate": 0.0002828378378378378,
+      "loss": 0.5028,
+      "step": 391
+    },
+    {
+      "epoch": 0.117519205546187,
+      "grad_norm": 0.22368961572647095,
+      "learning_rate": 0.00028279279279279276,
+      "loss": 0.4724,
+      "step": 392
+    },
+    {
+      "epoch": 0.11781899943788646,
+      "grad_norm": 0.25789904594421387,
+      "learning_rate": 0.0002827477477477477,
+      "loss": 0.5545,
+      "step": 393
+    },
+    {
+      "epoch": 0.11811879332958591,
+      "grad_norm": 0.22045502066612244,
+      "learning_rate": 0.0002827027027027027,
+      "loss": 0.4739,
+      "step": 394
+    },
+    {
+      "epoch": 0.11841858722128537,
+      "grad_norm": 0.24297311902046204,
+      "learning_rate": 0.0002826576576576576,
+      "loss": 0.5284,
+      "step": 395
+    },
+    {
+      "epoch": 0.11871838111298483,
+      "grad_norm": 0.22944821417331696,
+      "learning_rate": 0.00028261261261261256,
+      "loss": 0.5071,
+      "step": 396
+    },
+    {
+      "epoch": 0.11901817500468428,
+      "grad_norm": 0.22093501687049866,
+      "learning_rate": 0.00028256756756756755,
+      "loss": 0.5016,
+      "step": 397
+    },
+    {
+      "epoch": 0.11931796889638374,
+      "grad_norm": 0.24359877407550812,
+      "learning_rate": 0.0002825225225225225,
+      "loss": 0.494,
+      "step": 398
+    },
+    {
+      "epoch": 0.1196177627880832,
+      "grad_norm": 0.23019848763942719,
+      "learning_rate": 0.0002824774774774774,
+      "loss": 0.4774,
+      "step": 399
+    },
+    {
+      "epoch": 0.11991755667978266,
+      "grad_norm": 0.22894737124443054,
+      "learning_rate": 0.0002824324324324324,
+      "loss": 0.4804,
+      "step": 400
+    },
+    {
+      "epoch": 0.12021735057148211,
+      "grad_norm": 0.23049385845661163,
+      "learning_rate": 0.00028238738738738735,
+      "loss": 0.4816,
+      "step": 401
+    },
+    {
+      "epoch": 0.12051714446318157,
+      "grad_norm": 0.21988703310489655,
+      "learning_rate": 0.0002823423423423423,
+      "loss": 0.479,
+      "step": 402
+    },
+    {
+      "epoch": 0.12081693835488103,
+      "grad_norm": 0.2389378845691681,
+      "learning_rate": 0.0002822972972972973,
+      "loss": 0.4993,
+      "step": 403
+    },
+    {
+      "epoch": 0.12111673224658047,
+      "grad_norm": 0.2271224707365036,
+      "learning_rate": 0.0002822522522522522,
+      "loss": 0.479,
+      "step": 404
+    },
+    {
+      "epoch": 0.12141652613827993,
+      "grad_norm": 0.253499835729599,
+      "learning_rate": 0.00028220720720720715,
+      "loss": 0.5289,
+      "step": 405
+    },
+    {
+      "epoch": 0.12171632002997938,
+      "grad_norm": 0.2213011533021927,
+      "learning_rate": 0.00028216216216216214,
+      "loss": 0.4759,
+      "step": 406
+    },
+    {
+      "epoch": 0.12201611392167884,
+      "grad_norm": 0.2327415943145752,
+      "learning_rate": 0.0002821171171171171,
+      "loss": 0.4822,
+      "step": 407
+    },
+    {
+      "epoch": 0.1223159078133783,
+      "grad_norm": 0.2592346668243408,
+      "learning_rate": 0.000282072072072072,
+      "loss": 0.5331,
+      "step": 408
+    },
+    {
+      "epoch": 0.12261570170507775,
+      "grad_norm": 0.23039428889751434,
+      "learning_rate": 0.000282027027027027,
+      "loss": 0.4882,
+      "step": 409
+    },
+    {
+      "epoch": 0.12291549559677721,
+      "grad_norm": 0.2345624566078186,
+      "learning_rate": 0.00028198198198198194,
+      "loss": 0.4859,
+      "step": 410
+    },
+    {
+      "epoch": 0.12321528948847667,
+      "grad_norm": 0.22759179770946503,
+      "learning_rate": 0.0002819369369369369,
+      "loss": 0.484,
+      "step": 411
+    },
+    {
+      "epoch": 0.12351508338017612,
+      "grad_norm": 0.24670295417308807,
+      "learning_rate": 0.00028189189189189187,
+      "loss": 0.5047,
+      "step": 412
+    },
+    {
+      "epoch": 0.12381487727187558,
+      "grad_norm": 0.2306670844554901,
+      "learning_rate": 0.0002818468468468468,
+      "loss": 0.5041,
+      "step": 413
+    },
+    {
+      "epoch": 0.12411467116357504,
+      "grad_norm": 0.2440173178911209,
+      "learning_rate": 0.0002818018018018018,
+      "loss": 0.4942,
+      "step": 414
+    },
+    {
+      "epoch": 0.1244144650552745,
+      "grad_norm": 0.2239232212305069,
+      "learning_rate": 0.00028175675675675674,
+      "loss": 0.4677,
+      "step": 415
+    },
+    {
+      "epoch": 0.12471425894697395,
+      "grad_norm": 0.22838331758975983,
+      "learning_rate": 0.0002817117117117117,
+      "loss": 0.5061,
+      "step": 416
+    },
+    {
+      "epoch": 0.12501405283867342,
+      "grad_norm": 0.23874101042747498,
+      "learning_rate": 0.00028166666666666666,
+      "loss": 0.5261,
+      "step": 417
+    },
+    {
+      "epoch": 0.12531384673037288,
+      "grad_norm": 0.2522803544998169,
+      "learning_rate": 0.0002816216216216216,
+      "loss": 0.5525,
+      "step": 418
+    },
+    {
+      "epoch": 0.12561364062207234,
+      "grad_norm": 0.2339329719543457,
+      "learning_rate": 0.00028157657657657654,
+      "loss": 0.5157,
+      "step": 419
+    },
+    {
+      "epoch": 0.1259134345137718,
+      "grad_norm": 0.23074860870838165,
+      "learning_rate": 0.00028153153153153153,
+      "loss": 0.5062,
+      "step": 420
+    },
+    {
+      "epoch": 0.12621322840547125,
+      "grad_norm": 0.22877013683319092,
+      "learning_rate": 0.00028148648648648647,
+      "loss": 0.4827,
+      "step": 421
+    },
+    {
+      "epoch": 0.1265130222971707,
+      "grad_norm": 0.2384471893310547,
+      "learning_rate": 0.0002814414414414414,
+      "loss": 0.5035,
+      "step": 422
+    },
+    {
+      "epoch": 0.12681281618887016,
+      "grad_norm": 0.24479855597019196,
+      "learning_rate": 0.0002813963963963964,
+      "loss": 0.4645,
+      "step": 423
+    },
+    {
+      "epoch": 0.12711261008056962,
+      "grad_norm": 0.23424111306667328,
+      "learning_rate": 0.00028135135135135133,
+      "loss": 0.4927,
+      "step": 424
+    },
+    {
+      "epoch": 0.12741240397226908,
+      "grad_norm": 0.24012430012226105,
+      "learning_rate": 0.0002813063063063063,
+      "loss": 0.4904,
+      "step": 425
+    },
+    {
+      "epoch": 0.12771219786396854,
+      "grad_norm": 0.24606235325336456,
+      "learning_rate": 0.00028126126126126126,
+      "loss": 0.5143,
+      "step": 426
+    },
+    {
+      "epoch": 0.128011991755668,
+      "grad_norm": 0.24893403053283691,
+      "learning_rate": 0.0002812162162162162,
+      "loss": 0.5232,
+      "step": 427
+    },
+    {
+      "epoch": 0.12831178564736742,
+      "grad_norm": 0.2424110770225525,
+      "learning_rate": 0.0002811711711711712,
+      "loss": 0.5232,
+      "step": 428
+    },
+    {
+      "epoch": 0.12861157953906688,
+      "grad_norm": 0.2312486171722412,
+      "learning_rate": 0.0002811261261261261,
+      "loss": 0.4645,
+      "step": 429
+    },
+    {
+      "epoch": 0.12891137343076634,
+      "grad_norm": 0.22639836370944977,
+      "learning_rate": 0.00028108108108108106,
+      "loss": 0.4642,
+      "step": 430
+    },
+    {
+      "epoch": 0.1292111673224658,
+      "grad_norm": 0.23626941442489624,
+      "learning_rate": 0.00028103603603603605,
+      "loss": 0.5038,
+      "step": 431
+    },
+    {
+      "epoch": 0.12951096121416525,
+      "grad_norm": 0.2625383138656616,
+      "learning_rate": 0.000280990990990991,
+      "loss": 0.4867,
+      "step": 432
+    },
+    {
+      "epoch": 0.1298107551058647,
+      "grad_norm": 0.24292655289173126,
+      "learning_rate": 0.0002809459459459459,
+      "loss": 0.5081,
+      "step": 433
+    },
+    {
+      "epoch": 0.13011054899756416,
+      "grad_norm": 0.23609769344329834,
+      "learning_rate": 0.00028090090090090086,
+      "loss": 0.4832,
+      "step": 434
+    },
+    {
+      "epoch": 0.13041034288926362,
+      "grad_norm": 0.22934478521347046,
+      "learning_rate": 0.00028085585585585585,
+      "loss": 0.4872,
+      "step": 435
+    },
+    {
+      "epoch": 0.13071013678096308,
+      "grad_norm": 0.22949008643627167,
+      "learning_rate": 0.0002808108108108108,
+      "loss": 0.5129,
+      "step": 436
+    },
+    {
+      "epoch": 0.13100993067266253,
+      "grad_norm": 0.2302381694316864,
+      "learning_rate": 0.0002807657657657657,
+      "loss": 0.4793,
+      "step": 437
+    },
+    {
+      "epoch": 0.131309724564362,
+      "grad_norm": 0.23368242383003235,
+      "learning_rate": 0.0002807207207207207,
+      "loss": 0.4989,
+      "step": 438
+    },
+    {
+      "epoch": 0.13160951845606145,
+      "grad_norm": 0.21572020649909973,
+      "learning_rate": 0.00028067567567567565,
+      "loss": 0.4546,
+      "step": 439
+    },
+    {
+      "epoch": 0.1319093123477609,
+      "grad_norm": 0.2268449366092682,
+      "learning_rate": 0.0002806306306306306,
+      "loss": 0.4954,
+      "step": 440
+    },
+    {
+      "epoch": 0.13220910623946036,
+      "grad_norm": 0.23617544770240784,
+      "learning_rate": 0.0002805855855855856,
+      "loss": 0.4865,
+      "step": 441
+    },
+    {
+      "epoch": 0.13250890013115982,
+      "grad_norm": 0.24015142023563385,
+      "learning_rate": 0.0002805405405405405,
+      "loss": 0.5211,
+      "step": 442
+    },
+    {
+      "epoch": 0.13280869402285927,
+      "grad_norm": 0.21798421442508698,
+      "learning_rate": 0.00028049549549549545,
+      "loss": 0.4482,
+      "step": 443
+    },
+    {
+      "epoch": 0.13310848791455873,
+      "grad_norm": 0.23476584255695343,
+      "learning_rate": 0.00028045045045045045,
+      "loss": 0.4871,
+      "step": 444
+    },
+    {
+      "epoch": 0.1334082818062582,
+      "grad_norm": 0.2404216080904007,
+      "learning_rate": 0.0002804054054054054,
+      "loss": 0.488,
+      "step": 445
+    },
+    {
+      "epoch": 0.13370807569795765,
+      "grad_norm": 0.25073060393333435,
+      "learning_rate": 0.0002803603603603603,
+      "loss": 0.5194,
+      "step": 446
+    },
+    {
+      "epoch": 0.1340078695896571,
+      "grad_norm": 0.24332286417484283,
+      "learning_rate": 0.0002803153153153153,
+      "loss": 0.5347,
+      "step": 447
+    },
+    {
+      "epoch": 0.13430766348135656,
+      "grad_norm": 0.2420525699853897,
+      "learning_rate": 0.00028027027027027025,
+      "loss": 0.5109,
+      "step": 448
+    },
+    {
+      "epoch": 0.13460745737305602,
+      "grad_norm": 0.22389326989650726,
+      "learning_rate": 0.0002802252252252252,
+      "loss": 0.5075,
+      "step": 449
+    },
+    {
+      "epoch": 0.13490725126475547,
+      "grad_norm": 0.23522739112377167,
+      "learning_rate": 0.0002801801801801802,
+      "loss": 0.4794,
+      "step": 450
+    },
+    {
+      "epoch": 0.13520704515645493,
+      "grad_norm": 0.221591517329216,
+      "learning_rate": 0.0002801351351351351,
+      "loss": 0.4821,
+      "step": 451
+    },
+    {
+      "epoch": 0.1355068390481544,
+      "grad_norm": 0.24136802554130554,
+      "learning_rate": 0.00028009009009009005,
+      "loss": 0.4949,
+      "step": 452
+    },
+    {
+      "epoch": 0.13580663293985384,
+      "grad_norm": 0.23207104206085205,
+      "learning_rate": 0.00028004504504504504,
+      "loss": 0.4803,
+      "step": 453
+    },
+    {
+      "epoch": 0.1361064268315533,
+      "grad_norm": 0.25318193435668945,
+      "learning_rate": 0.00028,
+      "loss": 0.5345,
+      "step": 454
+    },
+    {
+      "epoch": 0.13640622072325276,
+      "grad_norm": 0.25274619460105896,
+      "learning_rate": 0.0002799549549549549,
+      "loss": 0.5159,
+      "step": 455
+    },
+    {
+      "epoch": 0.13670601461495221,
+      "grad_norm": 0.22540399432182312,
+      "learning_rate": 0.0002799099099099099,
+      "loss": 0.4772,
+      "step": 456
+    },
+    {
+      "epoch": 0.13700580850665167,
+      "grad_norm": 0.2346925288438797,
+      "learning_rate": 0.00027986486486486484,
+      "loss": 0.4916,
+      "step": 457
+    },
+    {
+      "epoch": 0.13730560239835113,
+      "grad_norm": 0.2226891815662384,
+      "learning_rate": 0.0002798198198198198,
+      "loss": 0.4651,
+      "step": 458
+    },
+    {
+      "epoch": 0.13760539629005059,
+      "grad_norm": 0.25540515780448914,
+      "learning_rate": 0.0002797747747747747,
+      "loss": 0.5316,
+      "step": 459
+    },
+    {
+      "epoch": 0.13790519018175004,
+      "grad_norm": 0.22934426367282867,
+      "learning_rate": 0.0002797297297297297,
+      "loss": 0.477,
+      "step": 460
+    },
+    {
+      "epoch": 0.1382049840734495,
+      "grad_norm": 0.22268570959568024,
+      "learning_rate": 0.00027968468468468464,
+      "loss": 0.4611,
+      "step": 461
+    },
+    {
+      "epoch": 0.13850477796514896,
+      "grad_norm": 0.23548570275306702,
+      "learning_rate": 0.0002796396396396396,
+      "loss": 0.5012,
+      "step": 462
+    },
+    {
+      "epoch": 0.1388045718568484,
+      "grad_norm": 0.22782792150974274,
+      "learning_rate": 0.00027959459459459457,
+      "loss": 0.4878,
+      "step": 463
+    },
+    {
+      "epoch": 0.13910436574854787,
+      "grad_norm": 0.24569828808307648,
+      "learning_rate": 0.0002795495495495495,
+      "loss": 0.4882,
+      "step": 464
+    },
+    {
+      "epoch": 0.13940415964024733,
+      "grad_norm": 0.23523476719856262,
+      "learning_rate": 0.00027950450450450444,
+      "loss": 0.498,
+      "step": 465
+    },
+    {
+      "epoch": 0.13970395353194678,
+      "grad_norm": 0.24249842762947083,
+      "learning_rate": 0.00027945945945945943,
+      "loss": 0.5156,
+      "step": 466
+    },
+    {
+      "epoch": 0.14000374742364624,
+      "grad_norm": 0.22582505643367767,
+      "learning_rate": 0.00027941441441441437,
+      "loss": 0.4428,
+      "step": 467
+    },
+    {
+      "epoch": 0.1403035413153457,
+      "grad_norm": 0.2527635395526886,
+      "learning_rate": 0.0002793693693693693,
+      "loss": 0.507,
+      "step": 468
+    },
+    {
+      "epoch": 0.14060333520704515,
+      "grad_norm": 0.2490163892507553,
+      "learning_rate": 0.0002793243243243243,
+      "loss": 0.5119,
+      "step": 469
+    },
+    {
+      "epoch": 0.1409031290987446,
+      "grad_norm": 0.26502713561058044,
+      "learning_rate": 0.00027927927927927923,
+      "loss": 0.4994,
+      "step": 470
+    },
+    {
+      "epoch": 0.14120292299044407,
+      "grad_norm": 0.25225281715393066,
+      "learning_rate": 0.0002792342342342342,
+      "loss": 0.5029,
+      "step": 471
+    },
+    {
+      "epoch": 0.14150271688214353,
+      "grad_norm": 0.235763818025589,
+      "learning_rate": 0.00027918918918918916,
+      "loss": 0.5154,
+      "step": 472
+    },
+    {
+      "epoch": 0.14180251077384298,
+      "grad_norm": 0.24403534829616547,
+      "learning_rate": 0.0002791441441441441,
+      "loss": 0.5241,
+      "step": 473
+    },
+    {
+      "epoch": 0.14210230466554244,
+      "grad_norm": 0.24656488001346588,
+      "learning_rate": 0.0002790990990990991,
+      "loss": 0.4767,
+      "step": 474
+    },
+    {
+      "epoch": 0.1424020985572419,
+      "grad_norm": 0.2506386935710907,
+      "learning_rate": 0.000279054054054054,
+      "loss": 0.5066,
+      "step": 475
+    },
+    {
+      "epoch": 0.14270189244894135,
+      "grad_norm": 0.24634157121181488,
+      "learning_rate": 0.00027900900900900896,
+      "loss": 0.4815,
+      "step": 476
+    },
+    {
+      "epoch": 0.1430016863406408,
+      "grad_norm": 0.23619256913661957,
+      "learning_rate": 0.00027896396396396395,
+      "loss": 0.4929,
+      "step": 477
+    },
+    {
+      "epoch": 0.14330148023234027,
+      "grad_norm": 0.23421134054660797,
+      "learning_rate": 0.0002789189189189189,
+      "loss": 0.4841,
+      "step": 478
+    },
+    {
+      "epoch": 0.14360127412403972,
+      "grad_norm": 0.2287687510251999,
+      "learning_rate": 0.00027887387387387383,
+      "loss": 0.4976,
+      "step": 479
+    },
+    {
+      "epoch": 0.14390106801573918,
+      "grad_norm": 0.2362293004989624,
+      "learning_rate": 0.0002788288288288288,
+      "loss": 0.5054,
+      "step": 480
+    },
+    {
+      "epoch": 0.14420086190743864,
+      "grad_norm": 0.23907198011875153,
+      "learning_rate": 0.00027878378378378376,
+      "loss": 0.4879,
+      "step": 481
+    },
+    {
+      "epoch": 0.1445006557991381,
+      "grad_norm": 0.21802479028701782,
+      "learning_rate": 0.00027873873873873875,
+      "loss": 0.4807,
+      "step": 482
+    },
+    {
+      "epoch": 0.14480044969083755,
+      "grad_norm": 0.2445833832025528,
+      "learning_rate": 0.0002786936936936937,
+      "loss": 0.519,
+      "step": 483
+    },
+    {
+      "epoch": 0.145100243582537,
+      "grad_norm": 0.24606822431087494,
+      "learning_rate": 0.0002786486486486486,
+      "loss": 0.4956,
+      "step": 484
+    },
+    {
+      "epoch": 0.14540003747423647,
+      "grad_norm": 0.24663852155208588,
+      "learning_rate": 0.0002786036036036036,
+      "loss": 0.5029,
+      "step": 485
+    },
+    {
+      "epoch": 0.14569983136593592,
+      "grad_norm": 0.22668293118476868,
+      "learning_rate": 0.00027855855855855855,
+      "loss": 0.4987,
+      "step": 486
+    },
+    {
+      "epoch": 0.14599962525763538,
+      "grad_norm": 0.2292596995830536,
+      "learning_rate": 0.0002785135135135135,
+      "loss": 0.4808,
+      "step": 487
+    },
+    {
+      "epoch": 0.14629941914933484,
+      "grad_norm": 0.2296249270439148,
+      "learning_rate": 0.0002784684684684685,
+      "loss": 0.4733,
+      "step": 488
+    },
+    {
+      "epoch": 0.1465992130410343,
+      "grad_norm": 0.2463514357805252,
+      "learning_rate": 0.0002784234234234234,
+      "loss": 0.4856,
+      "step": 489
+    },
+    {
+      "epoch": 0.14689900693273375,
+      "grad_norm": 0.2578481435775757,
+      "learning_rate": 0.00027837837837837835,
+      "loss": 0.52,
+      "step": 490
+    },
+    {
+      "epoch": 0.1471988008244332,
+      "grad_norm": 0.22988936305046082,
+      "learning_rate": 0.00027833333333333334,
+      "loss": 0.469,
+      "step": 491
+    },
+    {
+      "epoch": 0.14749859471613266,
+      "grad_norm": 0.24489805102348328,
+      "learning_rate": 0.0002782882882882883,
+      "loss": 0.5108,
+      "step": 492
+    },
+    {
+      "epoch": 0.14779838860783212,
+      "grad_norm": 0.24594642221927643,
+      "learning_rate": 0.0002782432432432432,
+      "loss": 0.4977,
+      "step": 493
+    },
+    {
+      "epoch": 0.14809818249953158,
+      "grad_norm": 0.23341059684753418,
+      "learning_rate": 0.0002781981981981982,
+      "loss": 0.4784,
+      "step": 494
+    },
+    {
+      "epoch": 0.14839797639123103,
+      "grad_norm": 0.24211278557777405,
+      "learning_rate": 0.00027815315315315314,
+      "loss": 0.4966,
+      "step": 495
+    },
+    {
+      "epoch": 0.1486977702829305,
+      "grad_norm": 0.24049176275730133,
+      "learning_rate": 0.0002781081081081081,
+      "loss": 0.4807,
+      "step": 496
+    },
+    {
+      "epoch": 0.14899756417462995,
+      "grad_norm": 0.2326640784740448,
+      "learning_rate": 0.00027806306306306307,
+      "loss": 0.4571,
+      "step": 497
+    },
+    {
+      "epoch": 0.1492973580663294,
+      "grad_norm": 0.23826268315315247,
+      "learning_rate": 0.000278018018018018,
+      "loss": 0.4656,
+      "step": 498
+    },
+    {
+      "epoch": 0.14959715195802886,
+      "grad_norm": 0.2514077425003052,
+      "learning_rate": 0.00027797297297297294,
+      "loss": 0.5057,
+      "step": 499
+    },
+    {
+      "epoch": 0.14989694584972832,
+      "grad_norm": 0.22455474734306335,
+      "learning_rate": 0.00027792792792792793,
+      "loss": 0.4754,
+      "step": 500
+    },
+    {
+      "epoch": 0.14989694584972832,
+      "eval_loss": 0.49213707447052,
+      "eval_runtime": 564.4362,
+      "eval_samples_per_second": 3.825,
+      "eval_steps_per_second": 0.478,
+      "step": 500
+    },
+    {
+      "epoch": 0.15019673974142778,
+      "grad_norm": 0.24720892310142517,
+      "learning_rate": 0.00027788288288288287,
+      "loss": 0.5022,
+      "step": 501
+    },
+    {
+      "epoch": 0.15049653363312723,
+      "grad_norm": 0.24081699550151825,
+      "learning_rate": 0.0002778378378378378,
+      "loss": 0.4928,
+      "step": 502
+    },
+    {
+      "epoch": 0.1507963275248267,
+      "grad_norm": 0.24245139956474304,
+      "learning_rate": 0.0002777927927927928,
+      "loss": 0.5005,
+      "step": 503
+    },
+    {
+      "epoch": 0.15109612141652615,
+      "grad_norm": 0.23554831743240356,
+      "learning_rate": 0.00027774774774774774,
+      "loss": 0.4914,
+      "step": 504
+    },
+    {
+      "epoch": 0.1513959153082256,
+      "grad_norm": 0.2291078418493271,
+      "learning_rate": 0.00027770270270270267,
+      "loss": 0.4807,
+      "step": 505
+    },
+    {
+      "epoch": 0.15169570919992506,
+      "grad_norm": 0.23393088579177856,
+      "learning_rate": 0.0002776576576576576,
+      "loss": 0.496,
+      "step": 506
+    },
+    {
+      "epoch": 0.15199550309162452,
+      "grad_norm": 0.24490569531917572,
+      "learning_rate": 0.0002776126126126126,
+      "loss": 0.5,
+      "step": 507
+    },
+    {
+      "epoch": 0.15229529698332397,
+      "grad_norm": 0.21680289506912231,
+      "learning_rate": 0.00027756756756756754,
+      "loss": 0.4549,
+      "step": 508
+    },
+    {
+      "epoch": 0.15259509087502343,
+      "grad_norm": 0.22479134798049927,
+      "learning_rate": 0.0002775225225225225,
+      "loss": 0.471,
+      "step": 509
+    },
+    {
+      "epoch": 0.1528948847667229,
+      "grad_norm": 0.2381131947040558,
+      "learning_rate": 0.00027747747747747746,
+      "loss": 0.4949,
+      "step": 510
+    },
+    {
+      "epoch": 0.15319467865842235,
+      "grad_norm": 0.22718404233455658,
+      "learning_rate": 0.0002774324324324324,
+      "loss": 0.4883,
+      "step": 511
+    },
+    {
+      "epoch": 0.1534944725501218,
+      "grad_norm": 0.2359694391489029,
+      "learning_rate": 0.00027738738738738734,
+      "loss": 0.5052,
+      "step": 512
+    },
+    {
+      "epoch": 0.15379426644182126,
+      "grad_norm": 0.229795902967453,
+      "learning_rate": 0.00027734234234234233,
+      "loss": 0.4814,
+      "step": 513
+    },
+    {
+      "epoch": 0.15409406033352072,
+      "grad_norm": 0.2197142243385315,
+      "learning_rate": 0.00027729729729729727,
+      "loss": 0.4617,
+      "step": 514
+    },
+    {
+      "epoch": 0.15439385422522017,
+      "grad_norm": 0.23570996522903442,
+      "learning_rate": 0.0002772522522522522,
+      "loss": 0.4731,
+      "step": 515
+    },
+    {
+      "epoch": 0.15469364811691963,
+      "grad_norm": 0.23566411435604095,
+      "learning_rate": 0.0002772072072072072,
+      "loss": 0.4921,
+      "step": 516
+    },
+    {
+      "epoch": 0.1549934420086191,
+      "grad_norm": 0.21966999769210815,
+      "learning_rate": 0.00027716216216216213,
+      "loss": 0.4683,
+      "step": 517
+    },
+    {
+      "epoch": 0.15529323590031854,
+      "grad_norm": 0.2531338036060333,
+      "learning_rate": 0.00027711711711711707,
+      "loss": 0.5254,
+      "step": 518
+    },
+    {
+      "epoch": 0.155593029792018,
+      "grad_norm": 0.2375670224428177,
+      "learning_rate": 0.00027707207207207206,
+      "loss": 0.4988,
+      "step": 519
+    },
+    {
+      "epoch": 0.15589282368371746,
+      "grad_norm": 0.2455272376537323,
+      "learning_rate": 0.000277027027027027,
+      "loss": 0.501,
+      "step": 520
+    },
+    {
+      "epoch": 0.15619261757541691,
+      "grad_norm": 0.21289831399917603,
+      "learning_rate": 0.00027698198198198193,
+      "loss": 0.4575,
+      "step": 521
+    },
+    {
+      "epoch": 0.15649241146711637,
+      "grad_norm": 0.2653936743736267,
+      "learning_rate": 0.0002769369369369369,
+      "loss": 0.5251,
+      "step": 522
+    },
+    {
+      "epoch": 0.1567922053588158,
+      "grad_norm": 0.23822923004627228,
+      "learning_rate": 0.00027689189189189186,
+      "loss": 0.5095,
+      "step": 523
+    },
+    {
+      "epoch": 0.15709199925051526,
+      "grad_norm": 0.25067201256752014,
+      "learning_rate": 0.0002768468468468468,
+      "loss": 0.4841,
+      "step": 524
+    },
+    {
+      "epoch": 0.15739179314221471,
+      "grad_norm": 0.2340254783630371,
+      "learning_rate": 0.0002768018018018018,
+      "loss": 0.4959,
+      "step": 525
+    },
+    {
+      "epoch": 0.15769158703391417,
+      "grad_norm": 0.2431899458169937,
+      "learning_rate": 0.0002767567567567567,
+      "loss": 0.5035,
+      "step": 526
+    },
+    {
+      "epoch": 0.15799138092561363,
+      "grad_norm": 0.22817112505435944,
+      "learning_rate": 0.00027671171171171166,
+      "loss": 0.49,
+      "step": 527
+    },
+    {
+      "epoch": 0.15829117481731309,
+      "grad_norm": 0.21927404403686523,
+      "learning_rate": 0.00027666666666666665,
+      "loss": 0.4785,
+      "step": 528
+    },
+    {
+      "epoch": 0.15859096870901254,
+      "grad_norm": 0.2402762919664383,
+      "learning_rate": 0.0002766216216216216,
+      "loss": 0.4799,
+      "step": 529
+    },
+    {
+      "epoch": 0.158890762600712,
+      "grad_norm": 0.2559228241443634,
+      "learning_rate": 0.0002765765765765765,
+      "loss": 0.5065,
+      "step": 530
+    },
+    {
+      "epoch": 0.15919055649241146,
+      "grad_norm": 0.22883668541908264,
+      "learning_rate": 0.0002765315315315315,
+      "loss": 0.4797,
+      "step": 531
+    },
+    {
+      "epoch": 0.1594903503841109,
+      "grad_norm": 0.24328212440013885,
+      "learning_rate": 0.00027648648648648645,
+      "loss": 0.4796,
+      "step": 532
+    },
+    {
+      "epoch": 0.15979014427581037,
+      "grad_norm": 0.2543148398399353,
+      "learning_rate": 0.0002764414414414414,
+      "loss": 0.4785,
+      "step": 533
+    },
+    {
+      "epoch": 0.16008993816750983,
+      "grad_norm": 0.24784719944000244,
+      "learning_rate": 0.0002763963963963964,
+      "loss": 0.4981,
+      "step": 534
+    },
+    {
+      "epoch": 0.16038973205920928,
+      "grad_norm": 0.24210456013679504,
+      "learning_rate": 0.0002763513513513513,
+      "loss": 0.4939,
+      "step": 535
+    },
+    {
+      "epoch": 0.16068952595090874,
+      "grad_norm": 0.22924496233463287,
+      "learning_rate": 0.00027630630630630625,
+      "loss": 0.4524,
+      "step": 536
+    },
+    {
+      "epoch": 0.1609893198426082,
+      "grad_norm": 0.270022451877594,
+      "learning_rate": 0.00027626126126126124,
+      "loss": 0.5184,
+      "step": 537
+    },
+    {
+      "epoch": 0.16128911373430765,
+      "grad_norm": 0.2689591646194458,
+      "learning_rate": 0.0002762162162162162,
+      "loss": 0.4966,
+      "step": 538
+    },
+    {
+      "epoch": 0.1615889076260071,
+      "grad_norm": 0.23465842008590698,
+      "learning_rate": 0.00027617117117117117,
+      "loss": 0.4865,
+      "step": 539
+    },
+    {
+      "epoch": 0.16188870151770657,
+      "grad_norm": 0.23281508684158325,
+      "learning_rate": 0.0002761261261261261,
+      "loss": 0.4859,
+      "step": 540
+    },
+    {
+      "epoch": 0.16218849540940602,
+      "grad_norm": 0.25370529294013977,
+      "learning_rate": 0.00027608108108108105,
+      "loss": 0.4609,
+      "step": 541
+    },
+    {
+      "epoch": 0.16248828930110548,
+      "grad_norm": 0.2646511495113373,
+      "learning_rate": 0.00027603603603603604,
+      "loss": 0.4967,
+      "step": 542
+    },
+    {
+      "epoch": 0.16278808319280494,
+      "grad_norm": 0.22836188971996307,
+      "learning_rate": 0.000275990990990991,
+      "loss": 0.4982,
+      "step": 543
+    },
+    {
+      "epoch": 0.1630878770845044,
+      "grad_norm": 0.22948142886161804,
+      "learning_rate": 0.0002759459459459459,
+      "loss": 0.471,
+      "step": 544
+    },
+    {
+      "epoch": 0.16338767097620385,
+      "grad_norm": 0.2623734474182129,
+      "learning_rate": 0.0002759009009009009,
+      "loss": 0.4938,
+      "step": 545
+    },
+    {
+      "epoch": 0.1636874648679033,
+      "grad_norm": 0.2337695211172104,
+      "learning_rate": 0.00027585585585585584,
+      "loss": 0.4584,
+      "step": 546
+    },
+    {
+      "epoch": 0.16398725875960277,
+      "grad_norm": 0.2507021129131317,
+      "learning_rate": 0.0002758108108108108,
+      "loss": 0.5002,
+      "step": 547
+    },
+    {
+      "epoch": 0.16428705265130222,
+      "grad_norm": 0.23930178582668304,
+      "learning_rate": 0.00027576576576576577,
+      "loss": 0.4724,
+      "step": 548
+    },
+    {
+      "epoch": 0.16458684654300168,
+      "grad_norm": 0.24984320998191833,
+      "learning_rate": 0.0002757207207207207,
+      "loss": 0.4768,
+      "step": 549
+    },
+    {
+      "epoch": 0.16488664043470114,
+      "grad_norm": 0.2434365600347519,
+      "learning_rate": 0.00027567567567567564,
+      "loss": 0.4667,
+      "step": 550
+    },
+    {
+      "epoch": 0.1651864343264006,
+      "grad_norm": 0.22952896356582642,
+      "learning_rate": 0.00027563063063063063,
+      "loss": 0.4624,
+      "step": 551
+    },
+    {
+      "epoch": 0.16548622821810005,
+      "grad_norm": 0.2372165471315384,
+      "learning_rate": 0.00027558558558558557,
+      "loss": 0.477,
+      "step": 552
+    },
+    {
+      "epoch": 0.1657860221097995,
+      "grad_norm": 0.24741259217262268,
+      "learning_rate": 0.0002755405405405405,
+      "loss": 0.5077,
+      "step": 553
+    },
+    {
+      "epoch": 0.16608581600149896,
+      "grad_norm": 0.2387109249830246,
+      "learning_rate": 0.0002754954954954955,
+      "loss": 0.4817,
+      "step": 554
+    },
+    {
+      "epoch": 0.16638560989319842,
+      "grad_norm": 0.24962367117404938,
+      "learning_rate": 0.00027545045045045043,
+      "loss": 0.4775,
+      "step": 555
+    },
+    {
+      "epoch": 0.16668540378489788,
+      "grad_norm": 0.2375505119562149,
+      "learning_rate": 0.00027540540540540537,
+      "loss": 0.4843,
+      "step": 556
+    },
+    {
+      "epoch": 0.16698519767659734,
+      "grad_norm": 0.24189910292625427,
+      "learning_rate": 0.00027536036036036036,
+      "loss": 0.4952,
+      "step": 557
+    },
+    {
+      "epoch": 0.1672849915682968,
+      "grad_norm": 0.2314407229423523,
+      "learning_rate": 0.0002753153153153153,
+      "loss": 0.4676,
+      "step": 558
+    },
+    {
+      "epoch": 0.16758478545999625,
+      "grad_norm": 0.24112465977668762,
+      "learning_rate": 0.00027527027027027023,
+      "loss": 0.479,
+      "step": 559
+    },
+    {
+      "epoch": 0.1678845793516957,
+      "grad_norm": 0.22687260806560516,
+      "learning_rate": 0.0002752252252252252,
+      "loss": 0.4651,
+      "step": 560
+    },
+    {
+      "epoch": 0.16818437324339516,
+      "grad_norm": 0.23146574199199677,
+      "learning_rate": 0.00027518018018018016,
+      "loss": 0.4724,
+      "step": 561
+    },
+    {
+      "epoch": 0.16848416713509462,
+      "grad_norm": 0.23164650797843933,
+      "learning_rate": 0.0002751351351351351,
+      "loss": 0.4595,
+      "step": 562
+    },
+    {
+      "epoch": 0.16878396102679408,
+      "grad_norm": 0.2290349006652832,
+      "learning_rate": 0.0002750900900900901,
+      "loss": 0.469,
+      "step": 563
+    },
+    {
+      "epoch": 0.16908375491849353,
+      "grad_norm": 0.22324185073375702,
+      "learning_rate": 0.000275045045045045,
+      "loss": 0.4616,
+      "step": 564
+    },
+    {
+      "epoch": 0.169383548810193,
+      "grad_norm": 0.2320687472820282,
+      "learning_rate": 0.00027499999999999996,
+      "loss": 0.4706,
+      "step": 565
+    },
+    {
+      "epoch": 0.16968334270189245,
+      "grad_norm": 0.2461112141609192,
+      "learning_rate": 0.00027495495495495495,
+      "loss": 0.4886,
+      "step": 566
+    },
+    {
+      "epoch": 0.1699831365935919,
+      "grad_norm": 0.22541654109954834,
+      "learning_rate": 0.0002749099099099099,
+      "loss": 0.4676,
+      "step": 567
+    },
+    {
+      "epoch": 0.17028293048529136,
+      "grad_norm": 0.24664641916751862,
+      "learning_rate": 0.0002748648648648648,
+      "loss": 0.509,
+      "step": 568
+    },
+    {
+      "epoch": 0.17058272437699082,
+      "grad_norm": 0.23051698505878448,
+      "learning_rate": 0.0002748198198198198,
+      "loss": 0.4742,
+      "step": 569
+    },
+    {
+      "epoch": 0.17088251826869028,
+      "grad_norm": 0.21268148720264435,
+      "learning_rate": 0.00027477477477477475,
+      "loss": 0.4536,
+      "step": 570
+    },
+    {
+      "epoch": 0.17118231216038973,
+      "grad_norm": 0.25143638253211975,
+      "learning_rate": 0.0002747297297297297,
+      "loss": 0.5233,
+      "step": 571
+    },
+    {
+      "epoch": 0.1714821060520892,
+      "grad_norm": 0.21673695743083954,
+      "learning_rate": 0.0002746846846846847,
+      "loss": 0.445,
+      "step": 572
+    },
+    {
+      "epoch": 0.17178189994378865,
+      "grad_norm": 0.24307781457901,
+      "learning_rate": 0.0002746396396396396,
+      "loss": 0.493,
+      "step": 573
+    },
+    {
+      "epoch": 0.1720816938354881,
+      "grad_norm": 0.24256987869739532,
+      "learning_rate": 0.00027459459459459456,
+      "loss": 0.5249,
+      "step": 574
+    },
+    {
+      "epoch": 0.17238148772718756,
+      "grad_norm": 0.23426513373851776,
+      "learning_rate": 0.00027454954954954955,
+      "loss": 0.4956,
+      "step": 575
+    },
+    {
+      "epoch": 0.17268128161888702,
+      "grad_norm": 0.23137056827545166,
+      "learning_rate": 0.0002745045045045045,
+      "loss": 0.4909,
+      "step": 576
+    },
+    {
+      "epoch": 0.17298107551058647,
+      "grad_norm": 0.22946982085704803,
+      "learning_rate": 0.0002744594594594594,
+      "loss": 0.4733,
+      "step": 577
+    },
+    {
+      "epoch": 0.17328086940228593,
+      "grad_norm": 0.23843489587306976,
+      "learning_rate": 0.00027441441441441436,
+      "loss": 0.4944,
+      "step": 578
+    },
+    {
+      "epoch": 0.1735806632939854,
+      "grad_norm": 0.21571891009807587,
+      "learning_rate": 0.00027436936936936935,
+      "loss": 0.4488,
+      "step": 579
+    },
+    {
+      "epoch": 0.17388045718568484,
+      "grad_norm": 0.25007542967796326,
+      "learning_rate": 0.0002743243243243243,
+      "loss": 0.488,
+      "step": 580
+    },
+    {
+      "epoch": 0.1741802510773843,
+      "grad_norm": 0.24017852544784546,
+      "learning_rate": 0.0002742792792792792,
+      "loss": 0.4969,
+      "step": 581
+    },
+    {
+      "epoch": 0.17448004496908376,
+      "grad_norm": 0.23361638188362122,
+      "learning_rate": 0.0002742342342342342,
+      "loss": 0.5348,
+      "step": 582
+    },
+    {
+      "epoch": 0.17477983886078322,
+      "grad_norm": 0.22652795910835266,
+      "learning_rate": 0.00027418918918918915,
+      "loss": 0.4776,
+      "step": 583
+    },
+    {
+      "epoch": 0.17507963275248267,
+      "grad_norm": 0.23973309993743896,
+      "learning_rate": 0.0002741441441441441,
+      "loss": 0.4803,
+      "step": 584
+    },
+    {
+      "epoch": 0.17537942664418213,
+      "grad_norm": 0.23487752676010132,
+      "learning_rate": 0.0002740990990990991,
+      "loss": 0.482,
+      "step": 585
+    },
+    {
+      "epoch": 0.1756792205358816,
+      "grad_norm": 0.2464274764060974,
+      "learning_rate": 0.000274054054054054,
+      "loss": 0.5167,
+      "step": 586
+    },
+    {
+      "epoch": 0.17597901442758104,
+      "grad_norm": 0.2280922681093216,
+      "learning_rate": 0.00027400900900900895,
+      "loss": 0.4941,
+      "step": 587
+    },
+    {
+      "epoch": 0.1762788083192805,
+      "grad_norm": 0.24017813801765442,
+      "learning_rate": 0.00027396396396396394,
+      "loss": 0.4721,
+      "step": 588
+    },
+    {
+      "epoch": 0.17657860221097996,
+      "grad_norm": 0.24262118339538574,
+      "learning_rate": 0.0002739189189189189,
+      "loss": 0.5053,
+      "step": 589
+    },
+    {
+      "epoch": 0.1768783961026794,
+      "grad_norm": 0.24060070514678955,
+      "learning_rate": 0.0002738738738738738,
+      "loss": 0.4932,
+      "step": 590
+    },
+    {
+      "epoch": 0.17717818999437887,
+      "grad_norm": 0.2486894428730011,
+      "learning_rate": 0.0002738288288288288,
+      "loss": 0.4963,
+      "step": 591
+    },
+    {
+      "epoch": 0.17747798388607833,
+      "grad_norm": 0.22934255003929138,
+      "learning_rate": 0.00027378378378378374,
+      "loss": 0.4911,
+      "step": 592
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.23473136126995087,
+      "learning_rate": 0.0002737387387387387,
+      "loss": 0.4967,
+      "step": 593
+    },
+    {
+      "epoch": 0.17807757166947724,
+      "grad_norm": 0.24307146668434143,
+      "learning_rate": 0.00027369369369369367,
+      "loss": 0.5051,
+      "step": 594
+    },
+    {
+      "epoch": 0.1783773655611767,
+      "grad_norm": 0.25658494234085083,
+      "learning_rate": 0.0002736486486486486,
+      "loss": 0.4959,
+      "step": 595
+    },
+    {
+      "epoch": 0.17867715945287616,
+      "grad_norm": 0.23326924443244934,
+      "learning_rate": 0.0002736036036036036,
+      "loss": 0.5024,
+      "step": 596
+    },
+    {
+      "epoch": 0.1789769533445756,
+      "grad_norm": 0.2539668083190918,
+      "learning_rate": 0.00027355855855855854,
+      "loss": 0.5059,
+      "step": 597
+    },
+    {
+      "epoch": 0.17927674723627507,
+      "grad_norm": 0.24097499251365662,
+      "learning_rate": 0.00027351351351351347,
+      "loss": 0.4888,
+      "step": 598
+    },
+    {
+      "epoch": 0.17957654112797453,
+      "grad_norm": 0.24816173315048218,
+      "learning_rate": 0.00027346846846846846,
+      "loss": 0.4943,
+      "step": 599
+    },
+    {
+      "epoch": 0.17987633501967398,
+      "grad_norm": 0.25391021370887756,
+      "learning_rate": 0.0002734234234234234,
+      "loss": 0.4909,
+      "step": 600
+    },
+    {
+      "epoch": 0.18017612891137344,
+      "grad_norm": 0.25449302792549133,
+      "learning_rate": 0.00027337837837837834,
+      "loss": 0.499,
+      "step": 601
+    },
+    {
+      "epoch": 0.1804759228030729,
+      "grad_norm": 0.2581718862056732,
+      "learning_rate": 0.00027333333333333333,
+      "loss": 0.5093,
+      "step": 602
+    },
+    {
+      "epoch": 0.18077571669477235,
+      "grad_norm": 0.2509480118751526,
+      "learning_rate": 0.00027328828828828826,
+      "loss": 0.4974,
+      "step": 603
+    },
+    {
+      "epoch": 0.1810755105864718,
+      "grad_norm": 0.23391370475292206,
+      "learning_rate": 0.0002732432432432432,
+      "loss": 0.5176,
+      "step": 604
+    },
+    {
+      "epoch": 0.18137530447817127,
+      "grad_norm": 0.2365102618932724,
+      "learning_rate": 0.0002731981981981982,
+      "loss": 0.5021,
+      "step": 605
+    },
+    {
+      "epoch": 0.18167509836987072,
+      "grad_norm": 0.23687691986560822,
+      "learning_rate": 0.00027315315315315313,
+      "loss": 0.4728,
+      "step": 606
+    },
+    {
+      "epoch": 0.18197489226157018,
+      "grad_norm": 0.24404986202716827,
+      "learning_rate": 0.00027310810810810807,
+      "loss": 0.4842,
+      "step": 607
+    },
+    {
+      "epoch": 0.18227468615326964,
+      "grad_norm": 0.2473643720149994,
+      "learning_rate": 0.00027306306306306306,
+      "loss": 0.4823,
+      "step": 608
+    },
+    {
+      "epoch": 0.1825744800449691,
+      "grad_norm": 0.22777344286441803,
+      "learning_rate": 0.000273018018018018,
+      "loss": 0.4995,
+      "step": 609
+    },
+    {
+      "epoch": 0.18287427393666855,
+      "grad_norm": 0.22545696794986725,
+      "learning_rate": 0.000272972972972973,
+      "loss": 0.4634,
+      "step": 610
+    },
+    {
+      "epoch": 0.183174067828368,
+      "grad_norm": 0.2380336970090866,
+      "learning_rate": 0.0002729279279279279,
+      "loss": 0.4831,
+      "step": 611
+    },
+    {
+      "epoch": 0.18347386172006747,
+      "grad_norm": 0.22387194633483887,
+      "learning_rate": 0.00027288288288288286,
+      "loss": 0.4567,
+      "step": 612
+    },
+    {
+      "epoch": 0.18377365561176692,
+      "grad_norm": 0.2482718676328659,
+      "learning_rate": 0.00027283783783783785,
+      "loss": 0.4974,
+      "step": 613
+    },
+    {
+      "epoch": 0.18407344950346638,
+      "grad_norm": 0.2622338533401489,
+      "learning_rate": 0.0002727927927927928,
+      "loss": 0.5004,
+      "step": 614
+    },
+    {
+      "epoch": 0.18437324339516584,
+      "grad_norm": 0.22443141043186188,
+      "learning_rate": 0.0002727477477477477,
+      "loss": 0.4717,
+      "step": 615
+    },
+    {
+      "epoch": 0.1846730372868653,
+      "grad_norm": 0.23443573713302612,
+      "learning_rate": 0.0002727027027027027,
+      "loss": 0.5023,
+      "step": 616
+    },
+    {
+      "epoch": 0.18497283117856475,
+      "grad_norm": 0.2201087921857834,
+      "learning_rate": 0.00027265765765765765,
+      "loss": 0.4812,
+      "step": 617
+    },
+    {
+      "epoch": 0.18527262507026418,
+      "grad_norm": 0.24011440575122833,
+      "learning_rate": 0.0002726126126126126,
+      "loss": 0.4865,
+      "step": 618
+    },
+    {
+      "epoch": 0.18557241896196364,
+      "grad_norm": 0.2697189450263977,
+      "learning_rate": 0.0002725675675675676,
+      "loss": 0.5256,
+      "step": 619
+    },
+    {
+      "epoch": 0.1858722128536631,
+      "grad_norm": 0.22377456724643707,
+      "learning_rate": 0.0002725225225225225,
+      "loss": 0.4701,
+      "step": 620
+    },
+    {
+      "epoch": 0.18617200674536255,
+      "grad_norm": 0.2551979720592499,
+      "learning_rate": 0.00027247747747747745,
+      "loss": 0.4878,
+      "step": 621
+    },
+    {
+      "epoch": 0.186471800637062,
+      "grad_norm": 0.2368023544549942,
+      "learning_rate": 0.00027243243243243244,
+      "loss": 0.4768,
+      "step": 622
+    },
+    {
+      "epoch": 0.18677159452876146,
+      "grad_norm": 0.23817569017410278,
+      "learning_rate": 0.0002723873873873874,
+      "loss": 0.4914,
+      "step": 623
+    },
+    {
+      "epoch": 0.18707138842046092,
+      "grad_norm": 0.23484331369400024,
+      "learning_rate": 0.0002723423423423423,
+      "loss": 0.4896,
+      "step": 624
+    },
+    {
+      "epoch": 0.18737118231216038,
+      "grad_norm": 0.2473037838935852,
+      "learning_rate": 0.0002722972972972973,
+      "loss": 0.4841,
+      "step": 625
+    },
+    {
+      "epoch": 0.18767097620385984,
+      "grad_norm": 0.2387157678604126,
+      "learning_rate": 0.00027225225225225224,
+      "loss": 0.4913,
+      "step": 626
+    },
+    {
+      "epoch": 0.1879707700955593,
+      "grad_norm": 0.2485678642988205,
+      "learning_rate": 0.0002722072072072072,
+      "loss": 0.5155,
+      "step": 627
+    },
+    {
+      "epoch": 0.18827056398725875,
+      "grad_norm": 0.22908784449100494,
+      "learning_rate": 0.0002721621621621621,
+      "loss": 0.4874,
+      "step": 628
+    },
+    {
+      "epoch": 0.1885703578789582,
+      "grad_norm": 0.22057555615901947,
+      "learning_rate": 0.0002721171171171171,
+      "loss": 0.4846,
+      "step": 629
+    },
+    {
+      "epoch": 0.18887015177065766,
+      "grad_norm": 0.23972582817077637,
+      "learning_rate": 0.00027207207207207204,
+      "loss": 0.4785,
+      "step": 630
+    },
+    {
+      "epoch": 0.18916994566235712,
+      "grad_norm": 0.2453726977109909,
+      "learning_rate": 0.000272027027027027,
+      "loss": 0.4855,
+      "step": 631
+    },
+    {
+      "epoch": 0.18946973955405658,
+      "grad_norm": 0.23710183799266815,
+      "learning_rate": 0.00027198198198198197,
+      "loss": 0.4979,
+      "step": 632
+    },
+    {
+      "epoch": 0.18976953344575603,
+      "grad_norm": 0.24524274468421936,
+      "learning_rate": 0.0002719369369369369,
+      "loss": 0.4806,
+      "step": 633
+    },
+    {
+      "epoch": 0.1900693273374555,
+      "grad_norm": 0.22835515439510345,
+      "learning_rate": 0.00027189189189189185,
+      "loss": 0.4746,
+      "step": 634
+    },
+    {
+      "epoch": 0.19036912122915495,
+      "grad_norm": 0.23380154371261597,
+      "learning_rate": 0.00027184684684684684,
+      "loss": 0.4931,
+      "step": 635
+    },
+    {
+      "epoch": 0.1906689151208544,
+      "grad_norm": 0.22659477591514587,
+      "learning_rate": 0.0002718018018018018,
+      "loss": 0.4443,
+      "step": 636
+    },
+    {
+      "epoch": 0.19096870901255386,
+      "grad_norm": 0.22725367546081543,
+      "learning_rate": 0.0002717567567567567,
+      "loss": 0.4888,
+      "step": 637
+    },
+    {
+      "epoch": 0.19126850290425332,
+      "grad_norm": 0.233082115650177,
+      "learning_rate": 0.0002717117117117117,
+      "loss": 0.4752,
+      "step": 638
+    },
+    {
+      "epoch": 0.19156829679595277,
+      "grad_norm": 0.22560617327690125,
+      "learning_rate": 0.00027166666666666664,
+      "loss": 0.4585,
+      "step": 639
+    },
+    {
+      "epoch": 0.19186809068765223,
+      "grad_norm": 0.22963936626911163,
+      "learning_rate": 0.0002716216216216216,
+      "loss": 0.4774,
+      "step": 640
+    },
+    {
+      "epoch": 0.1921678845793517,
+      "grad_norm": 0.2543715238571167,
+      "learning_rate": 0.00027157657657657657,
+      "loss": 0.4896,
+      "step": 641
+    },
+    {
+      "epoch": 0.19246767847105115,
+      "grad_norm": 0.24594075977802277,
+      "learning_rate": 0.0002715315315315315,
+      "loss": 0.4753,
+      "step": 642
+    },
+    {
+      "epoch": 0.1927674723627506,
+      "grad_norm": 0.2333337366580963,
+      "learning_rate": 0.00027148648648648644,
+      "loss": 0.4759,
+      "step": 643
+    },
+    {
+      "epoch": 0.19306726625445006,
+      "grad_norm": 0.23100800812244415,
+      "learning_rate": 0.00027144144144144143,
+      "loss": 0.4536,
+      "step": 644
+    },
+    {
+      "epoch": 0.19336706014614952,
+      "grad_norm": 0.26426073908805847,
+      "learning_rate": 0.00027139639639639637,
+      "loss": 0.4882,
+      "step": 645
+    },
+    {
+      "epoch": 0.19366685403784897,
+      "grad_norm": 0.22670197486877441,
+      "learning_rate": 0.0002713513513513513,
+      "loss": 0.473,
+      "step": 646
+    },
+    {
+      "epoch": 0.19396664792954843,
+      "grad_norm": 0.23645341396331787,
+      "learning_rate": 0.0002713063063063063,
+      "loss": 0.4964,
+      "step": 647
+    },
+    {
+      "epoch": 0.1942664418212479,
+      "grad_norm": 0.2535463869571686,
+      "learning_rate": 0.00027126126126126123,
+      "loss": 0.5001,
+      "step": 648
+    },
+    {
+      "epoch": 0.19456623571294734,
+      "grad_norm": 0.25858014822006226,
+      "learning_rate": 0.00027121621621621617,
+      "loss": 0.5007,
+      "step": 649
+    },
+    {
+      "epoch": 0.1948660296046468,
+      "grad_norm": 0.2457359880208969,
+      "learning_rate": 0.0002711711711711711,
+      "loss": 0.4849,
+      "step": 650
+    },
+    {
+      "epoch": 0.19516582349634626,
+      "grad_norm": 0.25070154666900635,
+      "learning_rate": 0.0002711261261261261,
+      "loss": 0.4888,
+      "step": 651
+    },
+    {
+      "epoch": 0.19546561738804571,
+      "grad_norm": 0.22972378134727478,
+      "learning_rate": 0.00027108108108108103,
+      "loss": 0.4719,
+      "step": 652
+    },
+    {
+      "epoch": 0.19576541127974517,
+      "grad_norm": 0.23261764645576477,
+      "learning_rate": 0.000271036036036036,
+      "loss": 0.4633,
+      "step": 653
+    },
+    {
+      "epoch": 0.19606520517144463,
+      "grad_norm": 0.25028276443481445,
+      "learning_rate": 0.00027099099099099096,
+      "loss": 0.5209,
+      "step": 654
+    },
+    {
+      "epoch": 0.19636499906314409,
+      "grad_norm": 0.23486949503421783,
+      "learning_rate": 0.0002709459459459459,
+      "loss": 0.4757,
+      "step": 655
+    },
+    {
+      "epoch": 0.19666479295484354,
+      "grad_norm": 0.2393907606601715,
+      "learning_rate": 0.0002709009009009009,
+      "loss": 0.464,
+      "step": 656
+    },
+    {
+      "epoch": 0.196964586846543,
+      "grad_norm": 0.24317681789398193,
+      "learning_rate": 0.0002708558558558558,
+      "loss": 0.4843,
+      "step": 657
+    },
+    {
+      "epoch": 0.19726438073824246,
+      "grad_norm": 0.23742252588272095,
+      "learning_rate": 0.00027081081081081076,
+      "loss": 0.5025,
+      "step": 658
+    },
+    {
+      "epoch": 0.1975641746299419,
+      "grad_norm": 0.23426878452301025,
+      "learning_rate": 0.00027076576576576575,
+      "loss": 0.4776,
+      "step": 659
+    },
+    {
+      "epoch": 0.19786396852164137,
+      "grad_norm": 0.250949501991272,
+      "learning_rate": 0.0002707207207207207,
+      "loss": 0.4968,
+      "step": 660
+    },
+    {
+      "epoch": 0.19816376241334083,
+      "grad_norm": 0.23000746965408325,
+      "learning_rate": 0.0002706756756756756,
+      "loss": 0.4895,
+      "step": 661
+    },
+    {
+      "epoch": 0.19846355630504028,
+      "grad_norm": 0.26243215799331665,
+      "learning_rate": 0.0002706306306306306,
+      "loss": 0.5076,
+      "step": 662
+    },
+    {
+      "epoch": 0.19876335019673974,
+      "grad_norm": 0.22115159034729004,
+      "learning_rate": 0.00027058558558558555,
+      "loss": 0.4823,
+      "step": 663
+    },
+    {
+      "epoch": 0.1990631440884392,
+      "grad_norm": 0.22618655860424042,
+      "learning_rate": 0.0002705405405405405,
+      "loss": 0.4994,
+      "step": 664
+    },
+    {
+      "epoch": 0.19936293798013865,
+      "grad_norm": 0.22989815473556519,
+      "learning_rate": 0.0002704954954954955,
+      "loss": 0.4701,
+      "step": 665
+    },
+    {
+      "epoch": 0.1996627318718381,
+      "grad_norm": 0.24214977025985718,
+      "learning_rate": 0.0002704504504504504,
+      "loss": 0.4806,
+      "step": 666
+    },
+    {
+      "epoch": 0.19996252576353757,
+      "grad_norm": 0.21489010751247406,
+      "learning_rate": 0.0002704054054054054,
+      "loss": 0.4511,
+      "step": 667
+    },
+    {
+      "epoch": 0.20026231965523703,
+      "grad_norm": 0.2397059053182602,
+      "learning_rate": 0.00027036036036036035,
+      "loss": 0.4427,
+      "step": 668
+    },
+    {
+      "epoch": 0.20056211354693648,
+      "grad_norm": 0.2419203370809555,
+      "learning_rate": 0.0002703153153153153,
+      "loss": 0.4993,
+      "step": 669
+    },
+    {
+      "epoch": 0.20086190743863594,
+      "grad_norm": 0.24709810316562653,
+      "learning_rate": 0.0002702702702702703,
+      "loss": 0.5195,
+      "step": 670
+    },
+    {
+      "epoch": 0.2011617013303354,
+      "grad_norm": 0.25068792700767517,
+      "learning_rate": 0.0002702252252252252,
+      "loss": 0.508,
+      "step": 671
+    },
+    {
+      "epoch": 0.20146149522203485,
+      "grad_norm": 0.210756316781044,
+      "learning_rate": 0.00027018018018018015,
+      "loss": 0.4413,
+      "step": 672
+    },
+    {
+      "epoch": 0.2017612891137343,
+      "grad_norm": 0.2557854652404785,
+      "learning_rate": 0.00027013513513513514,
+      "loss": 0.4971,
+      "step": 673
+    },
+    {
+      "epoch": 0.20206108300543377,
+      "grad_norm": 0.23103776574134827,
+      "learning_rate": 0.0002700900900900901,
+      "loss": 0.4683,
+      "step": 674
+    },
+    {
+      "epoch": 0.20236087689713322,
+      "grad_norm": 0.21560733020305634,
+      "learning_rate": 0.000270045045045045,
+      "loss": 0.4443,
+      "step": 675
+    },
+    {
+      "epoch": 0.20266067078883268,
+      "grad_norm": 0.2477121204137802,
+      "learning_rate": 0.00027,
+      "loss": 0.5131,
+      "step": 676
+    },
+    {
+      "epoch": 0.20296046468053214,
+      "grad_norm": 0.24966078996658325,
+      "learning_rate": 0.00026995495495495494,
+      "loss": 0.4856,
+      "step": 677
+    },
+    {
+      "epoch": 0.2032602585722316,
+      "grad_norm": 0.23841539025306702,
+      "learning_rate": 0.0002699099099099099,
+      "loss": 0.4658,
+      "step": 678
+    },
+    {
+      "epoch": 0.20356005246393105,
+      "grad_norm": 0.2685762047767639,
+      "learning_rate": 0.00026986486486486487,
+      "loss": 0.5096,
+      "step": 679
+    },
+    {
+      "epoch": 0.2038598463556305,
+      "grad_norm": 0.25834083557128906,
+      "learning_rate": 0.0002698198198198198,
+      "loss": 0.5036,
+      "step": 680
+    },
+    {
+      "epoch": 0.20415964024732997,
+      "grad_norm": 0.2324528843164444,
+      "learning_rate": 0.00026977477477477474,
+      "loss": 0.4618,
+      "step": 681
+    },
+    {
+      "epoch": 0.20445943413902942,
+      "grad_norm": 0.22903920710086823,
+      "learning_rate": 0.00026972972972972973,
+      "loss": 0.4662,
+      "step": 682
+    },
+    {
+      "epoch": 0.20475922803072888,
+      "grad_norm": 0.24908147752285004,
+      "learning_rate": 0.00026968468468468467,
+      "loss": 0.4684,
+      "step": 683
+    },
+    {
+      "epoch": 0.20505902192242834,
+      "grad_norm": 0.2278299629688263,
+      "learning_rate": 0.0002696396396396396,
+      "loss": 0.4447,
+      "step": 684
+    },
+    {
+      "epoch": 0.2053588158141278,
+      "grad_norm": 0.2315731793642044,
+      "learning_rate": 0.0002695945945945946,
+      "loss": 0.4875,
+      "step": 685
+    },
+    {
+      "epoch": 0.20565860970582725,
+      "grad_norm": 0.23152673244476318,
+      "learning_rate": 0.00026954954954954953,
+      "loss": 0.4796,
+      "step": 686
+    },
+    {
+      "epoch": 0.2059584035975267,
+      "grad_norm": 0.23902982473373413,
+      "learning_rate": 0.00026950450450450447,
+      "loss": 0.5169,
+      "step": 687
+    },
+    {
+      "epoch": 0.20625819748922616,
+      "grad_norm": 0.23636193573474884,
+      "learning_rate": 0.00026945945945945946,
+      "loss": 0.493,
+      "step": 688
+    },
+    {
+      "epoch": 0.20655799138092562,
+      "grad_norm": 0.21632736921310425,
+      "learning_rate": 0.0002694144144144144,
+      "loss": 0.4594,
+      "step": 689
+    },
+    {
+      "epoch": 0.20685778527262508,
+      "grad_norm": 0.2258147895336151,
+      "learning_rate": 0.00026936936936936934,
+      "loss": 0.4625,
+      "step": 690
+    },
+    {
+      "epoch": 0.20715757916432453,
+      "grad_norm": 0.21552099287509918,
+      "learning_rate": 0.0002693243243243243,
+      "loss": 0.4481,
+      "step": 691
+    },
+    {
+      "epoch": 0.207457373056024,
+      "grad_norm": 0.23030760884284973,
+      "learning_rate": 0.00026927927927927926,
+      "loss": 0.4644,
+      "step": 692
+    },
+    {
+      "epoch": 0.20775716694772345,
+      "grad_norm": 0.23163190484046936,
+      "learning_rate": 0.0002692342342342342,
+      "loss": 0.4483,
+      "step": 693
+    },
+    {
+      "epoch": 0.2080569608394229,
+      "grad_norm": 0.2412249743938446,
+      "learning_rate": 0.0002691891891891892,
+      "loss": 0.4886,
+      "step": 694
+    },
+    {
+      "epoch": 0.20835675473112236,
+      "grad_norm": 0.23279330134391785,
+      "learning_rate": 0.00026914414414414413,
+      "loss": 0.4733,
+      "step": 695
+    },
+    {
+      "epoch": 0.20865654862282182,
+      "grad_norm": 0.2269987165927887,
+      "learning_rate": 0.00026909909909909906,
+      "loss": 0.4866,
+      "step": 696
+    },
+    {
+      "epoch": 0.20895634251452128,
+      "grad_norm": 0.23355835676193237,
+      "learning_rate": 0.00026905405405405406,
+      "loss": 0.4918,
+      "step": 697
+    },
+    {
+      "epoch": 0.20925613640622073,
+      "grad_norm": 0.26988187432289124,
+      "learning_rate": 0.000269009009009009,
+      "loss": 0.4953,
+      "step": 698
+    },
+    {
+      "epoch": 0.2095559302979202,
+      "grad_norm": 0.22978806495666504,
+      "learning_rate": 0.00026896396396396393,
+      "loss": 0.4622,
+      "step": 699
+    },
+    {
+      "epoch": 0.20985572418961965,
+      "grad_norm": 0.2823212146759033,
+      "learning_rate": 0.00026891891891891887,
+      "loss": 0.5105,
+      "step": 700
+    },
+    {
+      "epoch": 0.2101555180813191,
+      "grad_norm": 0.23818424344062805,
+      "learning_rate": 0.00026887387387387386,
+      "loss": 0.479,
+      "step": 701
+    },
+    {
+      "epoch": 0.21045531197301856,
+      "grad_norm": 0.23730318248271942,
+      "learning_rate": 0.0002688288288288288,
+      "loss": 0.4991,
+      "step": 702
+    },
+    {
+      "epoch": 0.21075510586471802,
+      "grad_norm": 0.23275551199913025,
+      "learning_rate": 0.00026878378378378373,
+      "loss": 0.466,
+      "step": 703
+    },
+    {
+      "epoch": 0.21105489975641747,
+      "grad_norm": 0.2296077013015747,
+      "learning_rate": 0.0002687387387387387,
+      "loss": 0.4964,
+      "step": 704
+    },
+    {
+      "epoch": 0.21135469364811693,
+      "grad_norm": 0.24341174960136414,
+      "learning_rate": 0.00026869369369369366,
+      "loss": 0.505,
+      "step": 705
+    },
+    {
+      "epoch": 0.2116544875398164,
+      "grad_norm": 0.22542104125022888,
+      "learning_rate": 0.0002686486486486486,
+      "loss": 0.4533,
+      "step": 706
+    },
+    {
+      "epoch": 0.21195428143151585,
+      "grad_norm": 0.22414691746234894,
+      "learning_rate": 0.0002686036036036036,
+      "loss": 0.4504,
+      "step": 707
+    },
+    {
+      "epoch": 0.2122540753232153,
+      "grad_norm": 0.2119213044643402,
+      "learning_rate": 0.0002685585585585585,
+      "loss": 0.4457,
+      "step": 708
+    },
+    {
+      "epoch": 0.21255386921491476,
+      "grad_norm": 0.21771720051765442,
+      "learning_rate": 0.00026851351351351346,
+      "loss": 0.4411,
+      "step": 709
+    },
+    {
+      "epoch": 0.21285366310661422,
+      "grad_norm": 0.2484733760356903,
+      "learning_rate": 0.00026846846846846845,
+      "loss": 0.5016,
+      "step": 710
+    },
+    {
+      "epoch": 0.21315345699831367,
+      "grad_norm": 0.23043473064899445,
+      "learning_rate": 0.0002684234234234234,
+      "loss": 0.4717,
+      "step": 711
+    },
+    {
+      "epoch": 0.21345325089001313,
+      "grad_norm": 0.2769162952899933,
+      "learning_rate": 0.0002683783783783783,
+      "loss": 0.5135,
+      "step": 712
+    },
+    {
+      "epoch": 0.21375304478171256,
+      "grad_norm": 0.22561033070087433,
+      "learning_rate": 0.0002683333333333333,
+      "loss": 0.4928,
+      "step": 713
+    },
+    {
+      "epoch": 0.21405283867341202,
+      "grad_norm": 0.2365075945854187,
+      "learning_rate": 0.00026828828828828825,
+      "loss": 0.4843,
+      "step": 714
+    },
+    {
+      "epoch": 0.21435263256511147,
+      "grad_norm": 0.23628349602222443,
+      "learning_rate": 0.0002682432432432432,
+      "loss": 0.4782,
+      "step": 715
+    },
+    {
+      "epoch": 0.21465242645681093,
+      "grad_norm": 0.22449685633182526,
+      "learning_rate": 0.0002681981981981982,
+      "loss": 0.4533,
+      "step": 716
+    },
+    {
+      "epoch": 0.2149522203485104,
+      "grad_norm": 0.23336151242256165,
+      "learning_rate": 0.0002681531531531531,
+      "loss": 0.4818,
+      "step": 717
+    },
+    {
+      "epoch": 0.21525201424020984,
+      "grad_norm": 0.2387206107378006,
+      "learning_rate": 0.00026810810810810805,
+      "loss": 0.4537,
+      "step": 718
+    },
+    {
+      "epoch": 0.2155518081319093,
+      "grad_norm": 0.23359011113643646,
+      "learning_rate": 0.00026806306306306304,
+      "loss": 0.4895,
+      "step": 719
+    },
+    {
+      "epoch": 0.21585160202360876,
+      "grad_norm": 0.240494042634964,
+      "learning_rate": 0.000268018018018018,
+      "loss": 0.4878,
+      "step": 720
+    },
+    {
+      "epoch": 0.21615139591530821,
+      "grad_norm": 0.23335424065589905,
+      "learning_rate": 0.0002679729729729729,
+      "loss": 0.4747,
+      "step": 721
+    },
+    {
+      "epoch": 0.21645118980700767,
+      "grad_norm": 0.2620643079280853,
+      "learning_rate": 0.0002679279279279279,
+      "loss": 0.4968,
+      "step": 722
+    },
+    {
+      "epoch": 0.21675098369870713,
+      "grad_norm": 0.2350034862756729,
+      "learning_rate": 0.00026788288288288284,
+      "loss": 0.4801,
+      "step": 723
+    },
+    {
+      "epoch": 0.21705077759040659,
+      "grad_norm": 0.2358752340078354,
+      "learning_rate": 0.00026783783783783784,
+      "loss": 0.5265,
+      "step": 724
+    },
+    {
+      "epoch": 0.21735057148210604,
+      "grad_norm": 0.2392471730709076,
+      "learning_rate": 0.00026779279279279277,
+      "loss": 0.4667,
+      "step": 725
+    },
+    {
+      "epoch": 0.2176503653738055,
+      "grad_norm": 0.23733973503112793,
+      "learning_rate": 0.0002677477477477477,
+      "loss": 0.4712,
+      "step": 726
+    },
+    {
+      "epoch": 0.21795015926550496,
+      "grad_norm": 0.2224283516407013,
+      "learning_rate": 0.0002677027027027027,
+      "loss": 0.4519,
+      "step": 727
+    },
+    {
+      "epoch": 0.2182499531572044,
+      "grad_norm": 0.22749215364456177,
+      "learning_rate": 0.00026765765765765764,
+      "loss": 0.4656,
+      "step": 728
+    },
+    {
+      "epoch": 0.21854974704890387,
+      "grad_norm": 0.29321786761283875,
+      "learning_rate": 0.0002676126126126126,
+      "loss": 0.5085,
+      "step": 729
+    },
+    {
+      "epoch": 0.21884954094060333,
+      "grad_norm": 0.23674741387367249,
+      "learning_rate": 0.00026756756756756756,
+      "loss": 0.4958,
+      "step": 730
+    },
+    {
+      "epoch": 0.21914933483230278,
+      "grad_norm": 0.21558566391468048,
+      "learning_rate": 0.0002675225225225225,
+      "loss": 0.4679,
+      "step": 731
+    },
+    {
+      "epoch": 0.21944912872400224,
+      "grad_norm": 0.2383924126625061,
+      "learning_rate": 0.00026747747747747744,
+      "loss": 0.4827,
+      "step": 732
+    },
+    {
+      "epoch": 0.2197489226157017,
+      "grad_norm": 0.23788924515247345,
+      "learning_rate": 0.00026743243243243243,
+      "loss": 0.486,
+      "step": 733
+    },
+    {
+      "epoch": 0.22004871650740115,
+      "grad_norm": 0.23550404608249664,
+      "learning_rate": 0.00026738738738738737,
+      "loss": 0.4603,
+      "step": 734
+    },
+    {
+      "epoch": 0.2203485103991006,
+      "grad_norm": 0.2342066466808319,
+      "learning_rate": 0.00026734234234234236,
+      "loss": 0.4591,
+      "step": 735
+    },
+    {
+      "epoch": 0.22064830429080007,
+      "grad_norm": 0.25759053230285645,
+      "learning_rate": 0.0002672972972972973,
+      "loss": 0.483,
+      "step": 736
+    },
+    {
+      "epoch": 0.22094809818249952,
+      "grad_norm": 0.22325725853443146,
+      "learning_rate": 0.00026725225225225223,
+      "loss": 0.4609,
+      "step": 737
+    },
+    {
+      "epoch": 0.22124789207419898,
+      "grad_norm": 0.22235055267810822,
+      "learning_rate": 0.0002672072072072072,
+      "loss": 0.4512,
+      "step": 738
+    },
+    {
+      "epoch": 0.22154768596589844,
+      "grad_norm": 0.23441246151924133,
+      "learning_rate": 0.00026716216216216216,
+      "loss": 0.4517,
+      "step": 739
+    },
+    {
+      "epoch": 0.2218474798575979,
+      "grad_norm": 0.2520740330219269,
+      "learning_rate": 0.0002671171171171171,
+      "loss": 0.4712,
+      "step": 740
+    },
+    {
+      "epoch": 0.22214727374929735,
+      "grad_norm": 0.22782452404499054,
+      "learning_rate": 0.0002670720720720721,
+      "loss": 0.4723,
+      "step": 741
+    },
+    {
+      "epoch": 0.2224470676409968,
+      "grad_norm": 0.2406499981880188,
+      "learning_rate": 0.000267027027027027,
+      "loss": 0.4909,
+      "step": 742
+    },
+    {
+      "epoch": 0.22274686153269627,
+      "grad_norm": 0.21733756363391876,
+      "learning_rate": 0.00026698198198198196,
+      "loss": 0.4402,
+      "step": 743
+    },
+    {
+      "epoch": 0.22304665542439572,
+      "grad_norm": 0.2329728901386261,
+      "learning_rate": 0.00026693693693693695,
+      "loss": 0.4659,
+      "step": 744
+    },
+    {
+      "epoch": 0.22334644931609518,
+      "grad_norm": 0.23359104990959167,
+      "learning_rate": 0.0002668918918918919,
+      "loss": 0.4848,
+      "step": 745
+    },
+    {
+      "epoch": 0.22364624320779464,
+      "grad_norm": 0.23723845183849335,
+      "learning_rate": 0.0002668468468468468,
+      "loss": 0.4674,
+      "step": 746
+    },
+    {
+      "epoch": 0.2239460370994941,
+      "grad_norm": 0.2128835916519165,
+      "learning_rate": 0.00026680180180180176,
+      "loss": 0.4617,
+      "step": 747
+    },
+    {
+      "epoch": 0.22424583099119355,
+      "grad_norm": 0.2343822568655014,
+      "learning_rate": 0.00026675675675675675,
+      "loss": 0.4624,
+      "step": 748
+    },
+    {
+      "epoch": 0.224545624882893,
+      "grad_norm": 0.24932916462421417,
+      "learning_rate": 0.0002667117117117117,
+      "loss": 0.476,
+      "step": 749
+    },
+    {
+      "epoch": 0.22484541877459246,
+      "grad_norm": 0.24181415140628815,
+      "learning_rate": 0.0002666666666666666,
+      "loss": 0.4681,
+      "step": 750
+    },
+    {
+      "epoch": 0.22514521266629192,
+      "grad_norm": 0.23665620386600494,
+      "learning_rate": 0.0002666216216216216,
+      "loss": 0.4858,
+      "step": 751
+    },
+    {
+      "epoch": 0.22544500655799138,
+      "grad_norm": 0.24904295802116394,
+      "learning_rate": 0.00026657657657657655,
+      "loss": 0.4957,
+      "step": 752
+    },
+    {
+      "epoch": 0.22574480044969084,
+      "grad_norm": 0.2285979986190796,
+      "learning_rate": 0.0002665315315315315,
+      "loss": 0.4515,
+      "step": 753
+    },
+    {
+      "epoch": 0.2260445943413903,
+      "grad_norm": 0.2505464553833008,
+      "learning_rate": 0.0002664864864864865,
+      "loss": 0.4884,
+      "step": 754
+    },
+    {
+      "epoch": 0.22634438823308975,
+      "grad_norm": 0.22328858077526093,
+      "learning_rate": 0.0002664414414414414,
+      "loss": 0.4463,
+      "step": 755
+    },
+    {
+      "epoch": 0.2266441821247892,
+      "grad_norm": 0.2543044984340668,
+      "learning_rate": 0.00026639639639639635,
+      "loss": 0.493,
+      "step": 756
+    },
+    {
+      "epoch": 0.22694397601648866,
+      "grad_norm": 0.2348204255104065,
+      "learning_rate": 0.00026635135135135135,
+      "loss": 0.4511,
+      "step": 757
+    },
+    {
+      "epoch": 0.22724376990818812,
+      "grad_norm": 0.25663718581199646,
+      "learning_rate": 0.0002663063063063063,
+      "loss": 0.5006,
+      "step": 758
+    },
+    {
+      "epoch": 0.22754356379988758,
+      "grad_norm": 0.24245639145374298,
+      "learning_rate": 0.0002662612612612612,
+      "loss": 0.4687,
+      "step": 759
+    },
+    {
+      "epoch": 0.22784335769158703,
+      "grad_norm": 0.2461511641740799,
+      "learning_rate": 0.0002662162162162162,
+      "loss": 0.5136,
+      "step": 760
+    },
+    {
+      "epoch": 0.2281431515832865,
+      "grad_norm": 0.22325679659843445,
+      "learning_rate": 0.00026617117117117115,
+      "loss": 0.472,
+      "step": 761
+    },
+    {
+      "epoch": 0.22844294547498595,
+      "grad_norm": 0.2652730345726013,
+      "learning_rate": 0.0002661261261261261,
+      "loss": 0.4732,
+      "step": 762
+    },
+    {
+      "epoch": 0.2287427393666854,
+      "grad_norm": 0.24870134890079498,
+      "learning_rate": 0.0002660810810810811,
+      "loss": 0.467,
+      "step": 763
+    },
+    {
+      "epoch": 0.22904253325838486,
+      "grad_norm": 0.23315280675888062,
+      "learning_rate": 0.000266036036036036,
+      "loss": 0.4994,
+      "step": 764
+    },
+    {
+      "epoch": 0.22934232715008432,
+      "grad_norm": 0.23781219124794006,
+      "learning_rate": 0.00026599099099099095,
+      "loss": 0.4923,
+      "step": 765
+    },
+    {
+      "epoch": 0.22964212104178378,
+      "grad_norm": 0.2272898256778717,
+      "learning_rate": 0.00026594594594594594,
+      "loss": 0.4683,
+      "step": 766
+    },
+    {
+      "epoch": 0.22994191493348323,
+      "grad_norm": 0.2321631759405136,
+      "learning_rate": 0.0002659009009009009,
+      "loss": 0.5018,
+      "step": 767
+    },
+    {
+      "epoch": 0.2302417088251827,
+      "grad_norm": 0.22698219120502472,
+      "learning_rate": 0.0002658558558558558,
+      "loss": 0.479,
+      "step": 768
+    },
+    {
+      "epoch": 0.23054150271688215,
+      "grad_norm": 0.23421627283096313,
+      "learning_rate": 0.0002658108108108108,
+      "loss": 0.4613,
+      "step": 769
+    },
+    {
+      "epoch": 0.2308412966085816,
+      "grad_norm": 0.21950644254684448,
+      "learning_rate": 0.00026576576576576574,
+      "loss": 0.4497,
+      "step": 770
+    },
+    {
+      "epoch": 0.23114109050028106,
+      "grad_norm": 0.2207535058259964,
+      "learning_rate": 0.0002657207207207207,
+      "loss": 0.4777,
+      "step": 771
+    },
+    {
+      "epoch": 0.23144088439198052,
+      "grad_norm": 0.22216112911701202,
+      "learning_rate": 0.0002656756756756756,
+      "loss": 0.4682,
+      "step": 772
+    },
+    {
+      "epoch": 0.23174067828367997,
+      "grad_norm": 0.2619054317474365,
+      "learning_rate": 0.0002656306306306306,
+      "loss": 0.4791,
+      "step": 773
+    },
+    {
+      "epoch": 0.23204047217537943,
+      "grad_norm": 0.2443225234746933,
+      "learning_rate": 0.00026558558558558554,
+      "loss": 0.4777,
+      "step": 774
+    },
+    {
+      "epoch": 0.2323402660670789,
+      "grad_norm": 0.21427664160728455,
+      "learning_rate": 0.0002655405405405405,
+      "loss": 0.4407,
+      "step": 775
+    },
+    {
+      "epoch": 0.23264005995877834,
+      "grad_norm": 0.21477638185024261,
+      "learning_rate": 0.00026549549549549547,
+      "loss": 0.4333,
+      "step": 776
+    },
+    {
+      "epoch": 0.2329398538504778,
+      "grad_norm": 0.23390546441078186,
+      "learning_rate": 0.0002654504504504504,
+      "loss": 0.4839,
+      "step": 777
+    },
+    {
+      "epoch": 0.23323964774217726,
+      "grad_norm": 0.2529938220977783,
+      "learning_rate": 0.00026540540540540534,
+      "loss": 0.4701,
+      "step": 778
+    },
+    {
+      "epoch": 0.23353944163387672,
+      "grad_norm": 0.24290771782398224,
+      "learning_rate": 0.00026536036036036033,
+      "loss": 0.492,
+      "step": 779
+    },
+    {
+      "epoch": 0.23383923552557617,
+      "grad_norm": 0.2573592960834503,
+      "learning_rate": 0.00026531531531531527,
+      "loss": 0.5105,
+      "step": 780
+    },
+    {
+      "epoch": 0.23413902941727563,
+      "grad_norm": 0.25404611229896545,
+      "learning_rate": 0.00026527027027027026,
+      "loss": 0.5054,
+      "step": 781
+    },
+    {
+      "epoch": 0.2344388233089751,
+      "grad_norm": 0.2394997775554657,
+      "learning_rate": 0.0002652252252252252,
+      "loss": 0.4786,
+      "step": 782
+    },
+    {
+      "epoch": 0.23473861720067454,
+      "grad_norm": 0.2353266179561615,
+      "learning_rate": 0.00026518018018018013,
+      "loss": 0.4613,
+      "step": 783
+    },
+    {
+      "epoch": 0.235038411092374,
+      "grad_norm": 0.22989432513713837,
+      "learning_rate": 0.0002651351351351351,
+      "loss": 0.4505,
+      "step": 784
+    },
+    {
+      "epoch": 0.23533820498407346,
+      "grad_norm": 0.21917951107025146,
+      "learning_rate": 0.00026509009009009006,
+      "loss": 0.438,
+      "step": 785
+    },
+    {
+      "epoch": 0.2356379988757729,
+      "grad_norm": 0.23011858761310577,
+      "learning_rate": 0.000265045045045045,
+      "loss": 0.4477,
+      "step": 786
+    },
+    {
+      "epoch": 0.23593779276747237,
+      "grad_norm": 0.22732798755168915,
+      "learning_rate": 0.000265,
+      "loss": 0.454,
+      "step": 787
+    },
+    {
+      "epoch": 0.23623758665917183,
+      "grad_norm": 0.22975054383277893,
+      "learning_rate": 0.0002649549549549549,
+      "loss": 0.4419,
+      "step": 788
+    },
+    {
+      "epoch": 0.23653738055087128,
+      "grad_norm": 0.25520968437194824,
+      "learning_rate": 0.00026490990990990986,
+      "loss": 0.4987,
+      "step": 789
+    },
+    {
+      "epoch": 0.23683717444257074,
+      "grad_norm": 0.2375541776418686,
+      "learning_rate": 0.00026486486486486485,
+      "loss": 0.473,
+      "step": 790
+    },
+    {
+      "epoch": 0.2371369683342702,
+      "grad_norm": 0.2304588407278061,
+      "learning_rate": 0.0002648198198198198,
+      "loss": 0.4734,
+      "step": 791
+    },
+    {
+      "epoch": 0.23743676222596966,
+      "grad_norm": 0.22878654301166534,
+      "learning_rate": 0.0002647747747747748,
+      "loss": 0.4684,
+      "step": 792
+    },
+    {
+      "epoch": 0.2377365561176691,
+      "grad_norm": 0.25825339555740356,
+      "learning_rate": 0.0002647297297297297,
+      "loss": 0.5006,
+      "step": 793
+    },
+    {
+      "epoch": 0.23803635000936857,
+      "grad_norm": 0.2332850843667984,
+      "learning_rate": 0.00026468468468468466,
+      "loss": 0.4603,
+      "step": 794
+    },
+    {
+      "epoch": 0.23833614390106803,
+      "grad_norm": 0.23115694522857666,
+      "learning_rate": 0.00026463963963963965,
+      "loss": 0.4879,
+      "step": 795
+    },
+    {
+      "epoch": 0.23863593779276748,
+      "grad_norm": 0.2409309297800064,
+      "learning_rate": 0.0002645945945945946,
+      "loss": 0.4662,
+      "step": 796
+    },
+    {
+      "epoch": 0.23893573168446694,
+      "grad_norm": 0.23094283044338226,
+      "learning_rate": 0.0002645495495495495,
+      "loss": 0.4477,
+      "step": 797
+    },
+    {
+      "epoch": 0.2392355255761664,
+      "grad_norm": 0.2324245125055313,
+      "learning_rate": 0.0002645045045045045,
+      "loss": 0.4539,
+      "step": 798
+    },
+    {
+      "epoch": 0.23953531946786585,
+      "grad_norm": 0.23273488879203796,
+      "learning_rate": 0.00026445945945945945,
+      "loss": 0.4926,
+      "step": 799
+    },
+    {
+      "epoch": 0.2398351133595653,
+      "grad_norm": 0.2254081666469574,
+      "learning_rate": 0.0002644144144144144,
+      "loss": 0.4487,
+      "step": 800
+    },
+    {
+      "epoch": 0.24013490725126477,
+      "grad_norm": 0.22009852528572083,
+      "learning_rate": 0.0002643693693693694,
+      "loss": 0.4373,
+      "step": 801
+    },
+    {
+      "epoch": 0.24043470114296422,
+      "grad_norm": 0.24840936064720154,
+      "learning_rate": 0.0002643243243243243,
+      "loss": 0.4803,
+      "step": 802
+    },
+    {
+      "epoch": 0.24073449503466368,
+      "grad_norm": 0.2305980920791626,
+      "learning_rate": 0.00026427927927927925,
+      "loss": 0.4727,
+      "step": 803
+    },
+    {
+      "epoch": 0.24103428892636314,
+      "grad_norm": 0.23277850449085236,
+      "learning_rate": 0.00026423423423423424,
+      "loss": 0.4775,
+      "step": 804
+    },
+    {
+      "epoch": 0.2413340828180626,
+      "grad_norm": 0.24016259610652924,
+      "learning_rate": 0.0002641891891891892,
+      "loss": 0.5002,
+      "step": 805
+    },
+    {
+      "epoch": 0.24163387670976205,
+      "grad_norm": 0.239017054438591,
+      "learning_rate": 0.0002641441441441441,
+      "loss": 0.4586,
+      "step": 806
+    },
+    {
+      "epoch": 0.2419336706014615,
+      "grad_norm": 0.23575717210769653,
+      "learning_rate": 0.0002640990990990991,
+      "loss": 0.481,
+      "step": 807
+    },
+    {
+      "epoch": 0.24223346449316094,
+      "grad_norm": 0.23028531670570374,
+      "learning_rate": 0.00026405405405405404,
+      "loss": 0.4546,
+      "step": 808
+    },
+    {
+      "epoch": 0.2425332583848604,
+      "grad_norm": 0.23798401653766632,
+      "learning_rate": 0.000264009009009009,
+      "loss": 0.4806,
+      "step": 809
+    },
+    {
+      "epoch": 0.24283305227655985,
+      "grad_norm": 0.23191827535629272,
+      "learning_rate": 0.00026396396396396397,
+      "loss": 0.4509,
+      "step": 810
+    },
+    {
+      "epoch": 0.2431328461682593,
+      "grad_norm": 0.2182149440050125,
+      "learning_rate": 0.0002639189189189189,
+      "loss": 0.4448,
+      "step": 811
+    },
+    {
+      "epoch": 0.24343264005995877,
+      "grad_norm": 0.2463945746421814,
+      "learning_rate": 0.00026387387387387384,
+      "loss": 0.5088,
+      "step": 812
+    },
+    {
+      "epoch": 0.24373243395165822,
+      "grad_norm": 0.2388424575328827,
+      "learning_rate": 0.00026382882882882883,
+      "loss": 0.4931,
+      "step": 813
+    },
+    {
+      "epoch": 0.24403222784335768,
+      "grad_norm": 0.2515762746334076,
+      "learning_rate": 0.00026378378378378377,
+      "loss": 0.4742,
+      "step": 814
+    },
+    {
+      "epoch": 0.24433202173505714,
+      "grad_norm": 0.23625001311302185,
+      "learning_rate": 0.0002637387387387387,
+      "loss": 0.4937,
+      "step": 815
+    },
+    {
+      "epoch": 0.2446318156267566,
+      "grad_norm": 0.2393738478422165,
+      "learning_rate": 0.0002636936936936937,
+      "loss": 0.4588,
+      "step": 816
+    },
+    {
+      "epoch": 0.24493160951845605,
+      "grad_norm": 0.23316219449043274,
+      "learning_rate": 0.00026364864864864864,
+      "loss": 0.4577,
+      "step": 817
+    },
+    {
+      "epoch": 0.2452314034101555,
+      "grad_norm": 0.2306746244430542,
+      "learning_rate": 0.00026360360360360357,
+      "loss": 0.4454,
+      "step": 818
+    },
+    {
+      "epoch": 0.24553119730185496,
+      "grad_norm": 0.26293689012527466,
+      "learning_rate": 0.0002635585585585585,
+      "loss": 0.4723,
+      "step": 819
+    },
+    {
+      "epoch": 0.24583099119355442,
+      "grad_norm": 0.23483715951442719,
+      "learning_rate": 0.0002635135135135135,
+      "loss": 0.4704,
+      "step": 820
+    },
+    {
+      "epoch": 0.24613078508525388,
+      "grad_norm": 0.2556680738925934,
+      "learning_rate": 0.00026346846846846844,
+      "loss": 0.5077,
+      "step": 821
+    },
+    {
+      "epoch": 0.24643057897695334,
+      "grad_norm": 0.25275811553001404,
+      "learning_rate": 0.0002634234234234234,
+      "loss": 0.483,
+      "step": 822
+    },
+    {
+      "epoch": 0.2467303728686528,
+      "grad_norm": 0.22292107343673706,
+      "learning_rate": 0.00026337837837837836,
+      "loss": 0.4684,
+      "step": 823
+    },
+    {
+      "epoch": 0.24703016676035225,
+      "grad_norm": 0.23125959932804108,
+      "learning_rate": 0.0002633333333333333,
+      "loss": 0.4618,
+      "step": 824
+    },
+    {
+      "epoch": 0.2473299606520517,
+      "grad_norm": 0.2515474259853363,
+      "learning_rate": 0.00026328828828828824,
+      "loss": 0.5111,
+      "step": 825
+    },
+    {
+      "epoch": 0.24762975454375116,
+      "grad_norm": 0.23193036019802094,
+      "learning_rate": 0.00026324324324324323,
+      "loss": 0.451,
+      "step": 826
+    },
+    {
+      "epoch": 0.24792954843545062,
+      "grad_norm": 0.22238105535507202,
+      "learning_rate": 0.00026319819819819817,
+      "loss": 0.4472,
+      "step": 827
+    },
+    {
+      "epoch": 0.24822934232715008,
+      "grad_norm": 0.23125764727592468,
+      "learning_rate": 0.0002631531531531531,
+      "loss": 0.4711,
+      "step": 828
+    },
+    {
+      "epoch": 0.24852913621884953,
+      "grad_norm": 0.23620037734508514,
+      "learning_rate": 0.0002631081081081081,
+      "loss": 0.4723,
+      "step": 829
+    },
+    {
+      "epoch": 0.248828930110549,
+      "grad_norm": 0.22470439970493317,
+      "learning_rate": 0.00026306306306306303,
+      "loss": 0.4814,
+      "step": 830
+    },
+    {
+      "epoch": 0.24912872400224845,
+      "grad_norm": 0.23267348110675812,
+      "learning_rate": 0.00026301801801801797,
+      "loss": 0.4669,
+      "step": 831
+    },
+    {
+      "epoch": 0.2494285178939479,
+      "grad_norm": 0.23558740317821503,
+      "learning_rate": 0.00026297297297297296,
+      "loss": 0.4407,
+      "step": 832
+    },
+    {
+      "epoch": 0.24972831178564736,
+      "grad_norm": 0.2202112227678299,
+      "learning_rate": 0.0002629279279279279,
+      "loss": 0.4546,
+      "step": 833
+    },
+    {
+      "epoch": 0.25002810567734685,
+      "grad_norm": 0.2349451333284378,
+      "learning_rate": 0.00026288288288288283,
+      "loss": 0.4741,
+      "step": 834
+    },
+    {
+      "epoch": 0.2503278995690463,
+      "grad_norm": 0.2210862636566162,
+      "learning_rate": 0.0002628378378378378,
+      "loss": 0.4624,
+      "step": 835
+    },
+    {
+      "epoch": 0.25062769346074576,
+      "grad_norm": 0.25249290466308594,
+      "learning_rate": 0.00026279279279279276,
+      "loss": 0.5213,
+      "step": 836
+    },
+    {
+      "epoch": 0.2509274873524452,
+      "grad_norm": 0.2458237111568451,
+      "learning_rate": 0.0002627477477477477,
+      "loss": 0.4937,
+      "step": 837
+    },
+    {
+      "epoch": 0.2512272812441447,
+      "grad_norm": 0.22827856242656708,
+      "learning_rate": 0.0002627027027027027,
+      "loss": 0.4286,
+      "step": 838
+    },
+    {
+      "epoch": 0.2515270751358441,
+      "grad_norm": 0.22871458530426025,
+      "learning_rate": 0.0002626576576576576,
+      "loss": 0.4181,
+      "step": 839
+    },
+    {
+      "epoch": 0.2518268690275436,
+      "grad_norm": 0.24196332693099976,
+      "learning_rate": 0.00026261261261261256,
+      "loss": 0.4704,
+      "step": 840
+    },
+    {
+      "epoch": 0.252126662919243,
+      "grad_norm": 0.24222321808338165,
+      "learning_rate": 0.00026256756756756755,
+      "loss": 0.474,
+      "step": 841
+    },
+    {
+      "epoch": 0.2524264568109425,
+      "grad_norm": 0.2258533090353012,
+      "learning_rate": 0.0002625225225225225,
+      "loss": 0.4644,
+      "step": 842
+    },
+    {
+      "epoch": 0.25272625070264193,
+      "grad_norm": 0.2234419882297516,
+      "learning_rate": 0.0002624774774774774,
+      "loss": 0.4506,
+      "step": 843
+    },
+    {
+      "epoch": 0.2530260445943414,
+      "grad_norm": 0.24231363832950592,
+      "learning_rate": 0.0002624324324324324,
+      "loss": 0.4975,
+      "step": 844
+    },
+    {
+      "epoch": 0.25332583848604084,
+      "grad_norm": 0.2430192083120346,
+      "learning_rate": 0.00026238738738738735,
+      "loss": 0.462,
+      "step": 845
+    },
+    {
+      "epoch": 0.25362563237774033,
+      "grad_norm": 0.23717942833900452,
+      "learning_rate": 0.0002623423423423423,
+      "loss": 0.48,
+      "step": 846
+    },
+    {
+      "epoch": 0.25392542626943976,
+      "grad_norm": 0.23983192443847656,
+      "learning_rate": 0.0002622972972972973,
+      "loss": 0.491,
+      "step": 847
+    },
+    {
+      "epoch": 0.25422522016113924,
+      "grad_norm": 0.24544605612754822,
+      "learning_rate": 0.0002622522522522522,
+      "loss": 0.4525,
+      "step": 848
+    },
+    {
+      "epoch": 0.25452501405283867,
+      "grad_norm": 0.25106650590896606,
+      "learning_rate": 0.0002622072072072072,
+      "loss": 0.4716,
+      "step": 849
+    },
+    {
+      "epoch": 0.25482480794453816,
+      "grad_norm": 0.2644721269607544,
+      "learning_rate": 0.00026216216216216215,
+      "loss": 0.4888,
+      "step": 850
+    },
+    {
+      "epoch": 0.2551246018362376,
+      "grad_norm": 0.2338344007730484,
+      "learning_rate": 0.0002621171171171171,
+      "loss": 0.4779,
+      "step": 851
+    },
+    {
+      "epoch": 0.25542439572793707,
+      "grad_norm": 0.2368081659078598,
+      "learning_rate": 0.00026207207207207207,
+      "loss": 0.458,
+      "step": 852
+    },
+    {
+      "epoch": 0.2557241896196365,
+      "grad_norm": 0.2628321051597595,
+      "learning_rate": 0.000262027027027027,
+      "loss": 0.4786,
+      "step": 853
+    },
+    {
+      "epoch": 0.256023983511336,
+      "grad_norm": 0.23109877109527588,
+      "learning_rate": 0.00026198198198198195,
+      "loss": 0.4396,
+      "step": 854
+    },
+    {
+      "epoch": 0.2563237774030354,
+      "grad_norm": 0.23273521661758423,
+      "learning_rate": 0.00026193693693693694,
+      "loss": 0.4579,
+      "step": 855
+    },
+    {
+      "epoch": 0.25662357129473484,
+      "grad_norm": 0.229389026761055,
+      "learning_rate": 0.0002618918918918919,
+      "loss": 0.4515,
+      "step": 856
+    },
+    {
+      "epoch": 0.2569233651864343,
+      "grad_norm": 0.24866041541099548,
+      "learning_rate": 0.0002618468468468468,
+      "loss": 0.4402,
+      "step": 857
+    },
+    {
+      "epoch": 0.25722315907813376,
+      "grad_norm": 0.24374257028102875,
+      "learning_rate": 0.0002618018018018018,
+      "loss": 0.5025,
+      "step": 858
+    },
+    {
+      "epoch": 0.25752295296983324,
+      "grad_norm": 0.2753133177757263,
+      "learning_rate": 0.00026175675675675674,
+      "loss": 0.5058,
+      "step": 859
+    },
+    {
+      "epoch": 0.25782274686153267,
+      "grad_norm": 0.236386239528656,
+      "learning_rate": 0.0002617117117117117,
+      "loss": 0.4741,
+      "step": 860
+    },
+    {
+      "epoch": 0.25812254075323215,
+      "grad_norm": 0.21907605230808258,
+      "learning_rate": 0.00026166666666666667,
+      "loss": 0.463,
+      "step": 861
+    },
+    {
+      "epoch": 0.2584223346449316,
+      "grad_norm": 0.25744542479515076,
+      "learning_rate": 0.0002616216216216216,
+      "loss": 0.4585,
+      "step": 862
+    },
+    {
+      "epoch": 0.25872212853663107,
+      "grad_norm": 0.25060373544692993,
+      "learning_rate": 0.0002615765765765766,
+      "loss": 0.476,
+      "step": 863
+    },
+    {
+      "epoch": 0.2590219224283305,
+      "grad_norm": 0.21545180678367615,
+      "learning_rate": 0.00026153153153153153,
+      "loss": 0.4379,
+      "step": 864
+    },
+    {
+      "epoch": 0.25932171632003,
+      "grad_norm": 0.2536545991897583,
+      "learning_rate": 0.00026148648648648647,
+      "loss": 0.4965,
+      "step": 865
+    },
+    {
+      "epoch": 0.2596215102117294,
+      "grad_norm": 0.22960424423217773,
+      "learning_rate": 0.00026144144144144146,
+      "loss": 0.4441,
+      "step": 866
+    },
+    {
+      "epoch": 0.2599213041034289,
+      "grad_norm": 0.22601282596588135,
+      "learning_rate": 0.0002613963963963964,
+      "loss": 0.4462,
+      "step": 867
+    },
+    {
+      "epoch": 0.2602210979951283,
+      "grad_norm": 0.23997683823108673,
+      "learning_rate": 0.00026135135135135133,
+      "loss": 0.4496,
+      "step": 868
+    },
+    {
+      "epoch": 0.2605208918868278,
+      "grad_norm": 0.24241064488887787,
+      "learning_rate": 0.00026130630630630627,
+      "loss": 0.4538,
+      "step": 869
+    },
+    {
+      "epoch": 0.26082068577852724,
+      "grad_norm": 0.2302720993757248,
+      "learning_rate": 0.00026126126126126126,
+      "loss": 0.4547,
+      "step": 870
+    },
+    {
+      "epoch": 0.2611204796702267,
+      "grad_norm": 0.2532520294189453,
+      "learning_rate": 0.0002612162162162162,
+      "loss": 0.4919,
+      "step": 871
+    },
+    {
+      "epoch": 0.26142027356192615,
+      "grad_norm": 0.2543450891971588,
+      "learning_rate": 0.00026117117117117113,
+      "loss": 0.4638,
+      "step": 872
+    },
+    {
+      "epoch": 0.26172006745362564,
+      "grad_norm": 0.24088874459266663,
+      "learning_rate": 0.0002611261261261261,
+      "loss": 0.4603,
+      "step": 873
+    },
+    {
+      "epoch": 0.26201986134532507,
+      "grad_norm": 0.22305645048618317,
+      "learning_rate": 0.00026108108108108106,
+      "loss": 0.4394,
+      "step": 874
+    },
+    {
+      "epoch": 0.26231965523702455,
+      "grad_norm": 0.261001318693161,
+      "learning_rate": 0.000261036036036036,
+      "loss": 0.5244,
+      "step": 875
+    },
+    {
+      "epoch": 0.262619449128724,
+      "grad_norm": 0.2550908029079437,
+      "learning_rate": 0.000260990990990991,
+      "loss": 0.4716,
+      "step": 876
+    },
+    {
+      "epoch": 0.26291924302042347,
+      "grad_norm": 0.2264460027217865,
+      "learning_rate": 0.0002609459459459459,
+      "loss": 0.4527,
+      "step": 877
+    },
+    {
+      "epoch": 0.2632190369121229,
+      "grad_norm": 0.2598486542701721,
+      "learning_rate": 0.00026090090090090086,
+      "loss": 0.51,
+      "step": 878
+    },
+    {
+      "epoch": 0.2635188308038224,
+      "grad_norm": 0.2528247833251953,
+      "learning_rate": 0.00026085585585585585,
+      "loss": 0.4619,
+      "step": 879
+    },
+    {
+      "epoch": 0.2638186246955218,
+      "grad_norm": 0.22703434526920319,
+      "learning_rate": 0.0002608108108108108,
+      "loss": 0.4519,
+      "step": 880
+    },
+    {
+      "epoch": 0.2641184185872213,
+      "grad_norm": 0.24291987717151642,
+      "learning_rate": 0.0002607657657657657,
+      "loss": 0.4588,
+      "step": 881
+    },
+    {
+      "epoch": 0.2644182124789207,
+      "grad_norm": 0.265899121761322,
+      "learning_rate": 0.0002607207207207207,
+      "loss": 0.4741,
+      "step": 882
+    },
+    {
+      "epoch": 0.2647180063706202,
+      "grad_norm": 0.24852798879146576,
+      "learning_rate": 0.00026067567567567565,
+      "loss": 0.4772,
+      "step": 883
+    },
+    {
+      "epoch": 0.26501780026231964,
+      "grad_norm": 0.24373799562454224,
+      "learning_rate": 0.0002606306306306306,
+      "loss": 0.475,
+      "step": 884
+    },
+    {
+      "epoch": 0.2653175941540191,
+      "grad_norm": 0.24994871020317078,
+      "learning_rate": 0.0002605855855855856,
+      "loss": 0.5094,
+      "step": 885
+    },
+    {
+      "epoch": 0.26561738804571855,
+      "grad_norm": 0.23686103522777557,
+      "learning_rate": 0.0002605405405405405,
+      "loss": 0.4583,
+      "step": 886
+    },
+    {
+      "epoch": 0.26591718193741803,
+      "grad_norm": 0.2280004322528839,
+      "learning_rate": 0.00026049549549549546,
+      "loss": 0.4474,
+      "step": 887
+    },
+    {
+      "epoch": 0.26621697582911746,
+      "grad_norm": 0.25110939145088196,
+      "learning_rate": 0.00026045045045045045,
+      "loss": 0.4722,
+      "step": 888
+    },
+    {
+      "epoch": 0.26651676972081695,
+      "grad_norm": 0.25370022654533386,
+      "learning_rate": 0.0002604054054054054,
+      "loss": 0.4651,
+      "step": 889
+    },
+    {
+      "epoch": 0.2668165636125164,
+      "grad_norm": 0.24179215729236603,
+      "learning_rate": 0.0002603603603603603,
+      "loss": 0.4727,
+      "step": 890
+    },
+    {
+      "epoch": 0.26711635750421586,
+      "grad_norm": 0.2525777518749237,
+      "learning_rate": 0.00026031531531531526,
+      "loss": 0.4681,
+      "step": 891
+    },
+    {
+      "epoch": 0.2674161513959153,
+      "grad_norm": 0.21325957775115967,
+      "learning_rate": 0.00026027027027027025,
+      "loss": 0.4374,
+      "step": 892
+    },
+    {
+      "epoch": 0.2677159452876148,
+      "grad_norm": 0.2358642816543579,
+      "learning_rate": 0.0002602252252252252,
+      "loss": 0.4492,
+      "step": 893
+    },
+    {
+      "epoch": 0.2680157391793142,
+      "grad_norm": 0.2625977694988251,
+      "learning_rate": 0.0002601801801801801,
+      "loss": 0.522,
+      "step": 894
+    },
+    {
+      "epoch": 0.2683155330710137,
+      "grad_norm": 0.22606413066387177,
+      "learning_rate": 0.0002601351351351351,
+      "loss": 0.4539,
+      "step": 895
+    },
+    {
+      "epoch": 0.2686153269627131,
+      "grad_norm": 0.24337491393089294,
+      "learning_rate": 0.00026009009009009005,
+      "loss": 0.4988,
+      "step": 896
+    },
+    {
+      "epoch": 0.2689151208544126,
+      "grad_norm": 0.23522725701332092,
+      "learning_rate": 0.000260045045045045,
+      "loss": 0.4665,
+      "step": 897
+    },
+    {
+      "epoch": 0.26921491474611203,
+      "grad_norm": 0.25222131609916687,
+      "learning_rate": 0.00026,
+      "loss": 0.4715,
+      "step": 898
+    },
+    {
+      "epoch": 0.2695147086378115,
+      "grad_norm": 0.22760646045207977,
+      "learning_rate": 0.0002599549549549549,
+      "loss": 0.4633,
+      "step": 899
+    },
+    {
+      "epoch": 0.26981450252951095,
+      "grad_norm": 0.2398597002029419,
+      "learning_rate": 0.00025990990990990985,
+      "loss": 0.4682,
+      "step": 900
+    },
+    {
+      "epoch": 0.27011429642121043,
+      "grad_norm": 0.24494816362857819,
+      "learning_rate": 0.00025986486486486484,
+      "loss": 0.4833,
+      "step": 901
+    },
+    {
+      "epoch": 0.27041409031290986,
+      "grad_norm": 0.23173199594020844,
+      "learning_rate": 0.0002598198198198198,
+      "loss": 0.4583,
+      "step": 902
+    },
+    {
+      "epoch": 0.27071388420460935,
+      "grad_norm": 0.242969810962677,
+      "learning_rate": 0.0002597747747747747,
+      "loss": 0.4748,
+      "step": 903
+    },
+    {
+      "epoch": 0.2710136780963088,
+      "grad_norm": 0.2286025583744049,
+      "learning_rate": 0.0002597297297297297,
+      "loss": 0.4802,
+      "step": 904
+    },
+    {
+      "epoch": 0.27131347198800826,
+      "grad_norm": 0.241167351603508,
+      "learning_rate": 0.00025968468468468464,
+      "loss": 0.4947,
+      "step": 905
+    },
+    {
+      "epoch": 0.2716132658797077,
+      "grad_norm": 0.2599638104438782,
+      "learning_rate": 0.00025963963963963963,
+      "loss": 0.5028,
+      "step": 906
+    },
+    {
+      "epoch": 0.2719130597714072,
+      "grad_norm": 0.22766104340553284,
+      "learning_rate": 0.00025959459459459457,
+      "loss": 0.4586,
+      "step": 907
+    },
+    {
+      "epoch": 0.2722128536631066,
+      "grad_norm": 0.24524454772472382,
+      "learning_rate": 0.0002595495495495495,
+      "loss": 0.4918,
+      "step": 908
+    },
+    {
+      "epoch": 0.2725126475548061,
+      "grad_norm": 0.24995583295822144,
+      "learning_rate": 0.0002595045045045045,
+      "loss": 0.4851,
+      "step": 909
+    },
+    {
+      "epoch": 0.2728124414465055,
+      "grad_norm": 0.24542704224586487,
+      "learning_rate": 0.00025945945945945944,
+      "loss": 0.4891,
+      "step": 910
+    },
+    {
+      "epoch": 0.273112235338205,
+      "grad_norm": 0.2262720763683319,
+      "learning_rate": 0.00025941441441441437,
+      "loss": 0.457,
+      "step": 911
+    },
+    {
+      "epoch": 0.27341202922990443,
+      "grad_norm": 0.2677282989025116,
+      "learning_rate": 0.00025936936936936936,
+      "loss": 0.4781,
+      "step": 912
+    },
+    {
+      "epoch": 0.2737118231216039,
+      "grad_norm": 0.22617483139038086,
+      "learning_rate": 0.0002593243243243243,
+      "loss": 0.4666,
+      "step": 913
+    },
+    {
+      "epoch": 0.27401161701330334,
+      "grad_norm": 0.24579745531082153,
+      "learning_rate": 0.00025927927927927924,
+      "loss": 0.4991,
+      "step": 914
+    },
+    {
+      "epoch": 0.27431141090500283,
+      "grad_norm": 0.22964158654212952,
+      "learning_rate": 0.00025923423423423423,
+      "loss": 0.4757,
+      "step": 915
+    },
+    {
+      "epoch": 0.27461120479670226,
+      "grad_norm": 0.24435275793075562,
+      "learning_rate": 0.00025918918918918916,
+      "loss": 0.4636,
+      "step": 916
+    },
+    {
+      "epoch": 0.27491099868840174,
+      "grad_norm": 0.23039105534553528,
+      "learning_rate": 0.0002591441441441441,
+      "loss": 0.4591,
+      "step": 917
+    },
+    {
+      "epoch": 0.27521079258010117,
+      "grad_norm": 0.24856770038604736,
+      "learning_rate": 0.0002590990990990991,
+      "loss": 0.4866,
+      "step": 918
+    },
+    {
+      "epoch": 0.27551058647180066,
+      "grad_norm": 0.22115269303321838,
+      "learning_rate": 0.00025905405405405403,
+      "loss": 0.4322,
+      "step": 919
+    },
+    {
+      "epoch": 0.2758103803635001,
+      "grad_norm": 0.2645402252674103,
+      "learning_rate": 0.000259009009009009,
+      "loss": 0.4891,
+      "step": 920
+    },
+    {
+      "epoch": 0.27611017425519957,
+      "grad_norm": 0.24427354335784912,
+      "learning_rate": 0.00025896396396396396,
+      "loss": 0.4636,
+      "step": 921
+    },
+    {
+      "epoch": 0.276409968146899,
+      "grad_norm": 0.23059400916099548,
+      "learning_rate": 0.0002589189189189189,
+      "loss": 0.4502,
+      "step": 922
+    },
+    {
+      "epoch": 0.2767097620385985,
+      "grad_norm": 0.21996812522411346,
+      "learning_rate": 0.0002588738738738739,
+      "loss": 0.4461,
+      "step": 923
+    },
+    {
+      "epoch": 0.2770095559302979,
+      "grad_norm": 0.24204552173614502,
+      "learning_rate": 0.0002588288288288288,
+      "loss": 0.4678,
+      "step": 924
+    },
+    {
+      "epoch": 0.2773093498219974,
+      "grad_norm": 0.26428595185279846,
+      "learning_rate": 0.00025878378378378376,
+      "loss": 0.476,
+      "step": 925
+    },
+    {
+      "epoch": 0.2776091437136968,
+      "grad_norm": 0.2542773187160492,
+      "learning_rate": 0.00025873873873873875,
+      "loss": 0.464,
+      "step": 926
+    },
+    {
+      "epoch": 0.2779089376053963,
+      "grad_norm": 0.2621975839138031,
+      "learning_rate": 0.0002586936936936937,
+      "loss": 0.4735,
+      "step": 927
+    },
+    {
+      "epoch": 0.27820873149709574,
+      "grad_norm": 0.24359507858753204,
+      "learning_rate": 0.0002586486486486486,
+      "loss": 0.4768,
+      "step": 928
+    },
+    {
+      "epoch": 0.2785085253887952,
+      "grad_norm": 0.24825096130371094,
+      "learning_rate": 0.0002586036036036036,
+      "loss": 0.471,
+      "step": 929
+    },
+    {
+      "epoch": 0.27880831928049465,
+      "grad_norm": 0.2950778007507324,
+      "learning_rate": 0.00025855855855855855,
+      "loss": 0.4625,
+      "step": 930
+    },
+    {
+      "epoch": 0.27910811317219414,
+      "grad_norm": 0.23210273683071136,
+      "learning_rate": 0.0002585135135135135,
+      "loss": 0.4325,
+      "step": 931
+    },
+    {
+      "epoch": 0.27940790706389357,
+      "grad_norm": 0.26627883315086365,
+      "learning_rate": 0.0002584684684684685,
+      "loss": 0.4903,
+      "step": 932
+    },
+    {
+      "epoch": 0.27970770095559305,
+      "grad_norm": 0.2619936466217041,
+      "learning_rate": 0.0002584234234234234,
+      "loss": 0.4777,
+      "step": 933
+    },
+    {
+      "epoch": 0.2800074948472925,
+      "grad_norm": 0.23771123588085175,
+      "learning_rate": 0.00025837837837837835,
+      "loss": 0.4319,
+      "step": 934
+    },
+    {
+      "epoch": 0.28030728873899197,
+      "grad_norm": 0.2495034784078598,
+      "learning_rate": 0.00025833333333333334,
+      "loss": 0.4503,
+      "step": 935
+    },
+    {
+      "epoch": 0.2806070826306914,
+      "grad_norm": 0.26627448201179504,
+      "learning_rate": 0.0002582882882882883,
+      "loss": 0.4501,
+      "step": 936
+    },
+    {
+      "epoch": 0.2809068765223909,
+      "grad_norm": 0.22482304275035858,
+      "learning_rate": 0.0002582432432432432,
+      "loss": 0.4615,
+      "step": 937
+    },
+    {
+      "epoch": 0.2812066704140903,
+      "grad_norm": 0.23891392350196838,
+      "learning_rate": 0.0002581981981981982,
+      "loss": 0.4732,
+      "step": 938
+    },
+    {
+      "epoch": 0.2815064643057898,
+      "grad_norm": 0.2233395278453827,
+      "learning_rate": 0.00025815315315315314,
+      "loss": 0.4521,
+      "step": 939
+    },
+    {
+      "epoch": 0.2818062581974892,
+      "grad_norm": 0.22510269284248352,
+      "learning_rate": 0.0002581081081081081,
+      "loss": 0.4307,
+      "step": 940
+    },
+    {
+      "epoch": 0.2821060520891887,
+      "grad_norm": 0.24909009039402008,
+      "learning_rate": 0.000258063063063063,
+      "loss": 0.4574,
+      "step": 941
+    },
+    {
+      "epoch": 0.28240584598088814,
+      "grad_norm": 0.2461954653263092,
+      "learning_rate": 0.000258018018018018,
+      "loss": 0.4782,
+      "step": 942
+    },
+    {
+      "epoch": 0.2827056398725876,
+      "grad_norm": 0.23391996324062347,
+      "learning_rate": 0.00025797297297297294,
+      "loss": 0.4692,
+      "step": 943
+    },
+    {
+      "epoch": 0.28300543376428705,
+      "grad_norm": 0.2419288158416748,
+      "learning_rate": 0.0002579279279279279,
+      "loss": 0.4742,
+      "step": 944
+    },
+    {
+      "epoch": 0.28330522765598654,
+      "grad_norm": 0.24654226005077362,
+      "learning_rate": 0.00025788288288288287,
+      "loss": 0.4793,
+      "step": 945
+    },
+    {
+      "epoch": 0.28360502154768596,
+      "grad_norm": 0.23362454771995544,
+      "learning_rate": 0.0002578378378378378,
+      "loss": 0.4398,
+      "step": 946
+    },
+    {
+      "epoch": 0.28390481543938545,
+      "grad_norm": 0.23269514739513397,
+      "learning_rate": 0.00025779279279279275,
+      "loss": 0.4642,
+      "step": 947
+    },
+    {
+      "epoch": 0.2842046093310849,
+      "grad_norm": 0.22531503438949585,
+      "learning_rate": 0.00025774774774774774,
+      "loss": 0.436,
+      "step": 948
+    },
+    {
+      "epoch": 0.28450440322278436,
+      "grad_norm": 0.24250438809394836,
+      "learning_rate": 0.0002577027027027027,
+      "loss": 0.4708,
+      "step": 949
+    },
+    {
+      "epoch": 0.2848041971144838,
+      "grad_norm": 0.2370329648256302,
+      "learning_rate": 0.0002576576576576576,
+      "loss": 0.4737,
+      "step": 950
+    },
+    {
+      "epoch": 0.2851039910061832,
+      "grad_norm": 0.2553395628929138,
+      "learning_rate": 0.0002576126126126126,
+      "loss": 0.4927,
+      "step": 951
+    },
+    {
+      "epoch": 0.2854037848978827,
+      "grad_norm": 0.24398140609264374,
+      "learning_rate": 0.00025756756756756754,
+      "loss": 0.4471,
+      "step": 952
+    },
+    {
+      "epoch": 0.28570357878958214,
+      "grad_norm": 0.2420070916414261,
+      "learning_rate": 0.0002575225225225225,
+      "loss": 0.4573,
+      "step": 953
+    },
+    {
+      "epoch": 0.2860033726812816,
+      "grad_norm": 0.22280406951904297,
+      "learning_rate": 0.00025747747747747747,
+      "loss": 0.4637,
+      "step": 954
+    },
+    {
+      "epoch": 0.28630316657298105,
+      "grad_norm": 0.268107146024704,
+      "learning_rate": 0.0002574324324324324,
+      "loss": 0.4751,
+      "step": 955
+    },
+    {
+      "epoch": 0.28660296046468053,
+      "grad_norm": 0.224797785282135,
+      "learning_rate": 0.00025738738738738734,
+      "loss": 0.4567,
+      "step": 956
+    },
+    {
+      "epoch": 0.28690275435637996,
+      "grad_norm": 0.2350010722875595,
+      "learning_rate": 0.00025734234234234233,
+      "loss": 0.4669,
+      "step": 957
+    },
+    {
+      "epoch": 0.28720254824807945,
+      "grad_norm": 0.23346953094005585,
+      "learning_rate": 0.00025729729729729727,
+      "loss": 0.4711,
+      "step": 958
+    },
+    {
+      "epoch": 0.2875023421397789,
+      "grad_norm": 0.26031309366226196,
+      "learning_rate": 0.0002572522522522522,
+      "loss": 0.4921,
+      "step": 959
+    },
+    {
+      "epoch": 0.28780213603147836,
+      "grad_norm": 0.21255329251289368,
+      "learning_rate": 0.0002572072072072072,
+      "loss": 0.444,
+      "step": 960
+    },
+    {
+      "epoch": 0.2881019299231778,
+      "grad_norm": 0.24799884855747223,
+      "learning_rate": 0.00025716216216216213,
+      "loss": 0.469,
+      "step": 961
+    },
+    {
+      "epoch": 0.2884017238148773,
+      "grad_norm": 0.2208838164806366,
+      "learning_rate": 0.00025711711711711707,
+      "loss": 0.442,
+      "step": 962
+    },
+    {
+      "epoch": 0.2887015177065767,
+      "grad_norm": 0.2880913317203522,
+      "learning_rate": 0.00025707207207207206,
+      "loss": 0.4424,
+      "step": 963
+    },
+    {
+      "epoch": 0.2890013115982762,
+      "grad_norm": 0.26574239134788513,
+      "learning_rate": 0.000257027027027027,
+      "loss": 0.452,
+      "step": 964
+    },
+    {
+      "epoch": 0.2893011054899756,
+      "grad_norm": 0.23267340660095215,
+      "learning_rate": 0.00025698198198198193,
+      "loss": 0.455,
+      "step": 965
+    },
+    {
+      "epoch": 0.2896008993816751,
+      "grad_norm": 0.26304900646209717,
+      "learning_rate": 0.0002569369369369369,
+      "loss": 0.5211,
+      "step": 966
+    },
+    {
+      "epoch": 0.28990069327337453,
+      "grad_norm": 0.2575905919075012,
+      "learning_rate": 0.00025689189189189186,
+      "loss": 0.483,
+      "step": 967
+    },
+    {
+      "epoch": 0.290200487165074,
+      "grad_norm": 0.22459660470485687,
+      "learning_rate": 0.0002568468468468468,
+      "loss": 0.4636,
+      "step": 968
+    },
+    {
+      "epoch": 0.29050028105677345,
+      "grad_norm": 0.220341295003891,
+      "learning_rate": 0.0002568018018018018,
+      "loss": 0.4615,
+      "step": 969
+    },
+    {
+      "epoch": 0.29080007494847293,
+      "grad_norm": 0.239531472325325,
+      "learning_rate": 0.0002567567567567567,
+      "loss": 0.4446,
+      "step": 970
+    },
+    {
+      "epoch": 0.29109986884017236,
+      "grad_norm": 0.23338812589645386,
+      "learning_rate": 0.00025671171171171166,
+      "loss": 0.4704,
+      "step": 971
+    },
+    {
+      "epoch": 0.29139966273187184,
+      "grad_norm": 0.24035978317260742,
+      "learning_rate": 0.00025666666666666665,
+      "loss": 0.4717,
+      "step": 972
+    },
+    {
+      "epoch": 0.2916994566235713,
+      "grad_norm": 0.23094506561756134,
+      "learning_rate": 0.0002566216216216216,
+      "loss": 0.4493,
+      "step": 973
+    },
+    {
+      "epoch": 0.29199925051527076,
+      "grad_norm": 0.25101473927497864,
+      "learning_rate": 0.0002565765765765765,
+      "loss": 0.4979,
+      "step": 974
+    },
+    {
+      "epoch": 0.2922990444069702,
+      "grad_norm": 0.24172839522361755,
+      "learning_rate": 0.0002565315315315315,
+      "loss": 0.4666,
+      "step": 975
+    },
+    {
+      "epoch": 0.2925988382986697,
+      "grad_norm": 0.2213078737258911,
+      "learning_rate": 0.00025648648648648645,
+      "loss": 0.4335,
+      "step": 976
+    },
+    {
+      "epoch": 0.2928986321903691,
+      "grad_norm": 0.2230203002691269,
+      "learning_rate": 0.00025644144144144145,
+      "loss": 0.4183,
+      "step": 977
+    },
+    {
+      "epoch": 0.2931984260820686,
+      "grad_norm": 0.24735258519649506,
+      "learning_rate": 0.0002563963963963964,
+      "loss": 0.4612,
+      "step": 978
+    },
+    {
+      "epoch": 0.293498219973768,
+      "grad_norm": 0.24861575663089752,
+      "learning_rate": 0.0002563513513513513,
+      "loss": 0.4713,
+      "step": 979
+    },
+    {
+      "epoch": 0.2937980138654675,
+      "grad_norm": 0.2333897352218628,
+      "learning_rate": 0.0002563063063063063,
+      "loss": 0.4502,
+      "step": 980
+    },
+    {
+      "epoch": 0.29409780775716693,
+      "grad_norm": 0.23923064768314362,
+      "learning_rate": 0.00025626126126126125,
+      "loss": 0.4574,
+      "step": 981
+    },
+    {
+      "epoch": 0.2943976016488664,
+      "grad_norm": 0.24568355083465576,
+      "learning_rate": 0.0002562162162162162,
+      "loss": 0.4358,
+      "step": 982
+    },
+    {
+      "epoch": 0.29469739554056584,
+      "grad_norm": 0.24993112683296204,
+      "learning_rate": 0.0002561711711711712,
+      "loss": 0.4663,
+      "step": 983
+    },
+    {
+      "epoch": 0.2949971894322653,
+      "grad_norm": 0.2531440854072571,
+      "learning_rate": 0.0002561261261261261,
+      "loss": 0.4964,
+      "step": 984
+    },
+    {
+      "epoch": 0.29529698332396476,
+      "grad_norm": 0.26995500922203064,
+      "learning_rate": 0.00025608108108108105,
+      "loss": 0.4971,
+      "step": 985
+    },
+    {
+      "epoch": 0.29559677721566424,
+      "grad_norm": 0.23319192230701447,
+      "learning_rate": 0.00025603603603603604,
+      "loss": 0.4605,
+      "step": 986
+    },
+    {
+      "epoch": 0.29589657110736367,
+      "grad_norm": 0.2496713548898697,
+      "learning_rate": 0.000255990990990991,
+      "loss": 0.4468,
+      "step": 987
+    },
+    {
+      "epoch": 0.29619636499906316,
+      "grad_norm": 0.23224860429763794,
+      "learning_rate": 0.0002559459459459459,
+      "loss": 0.4519,
+      "step": 988
+    },
+    {
+      "epoch": 0.2964961588907626,
+      "grad_norm": 0.24383842945098877,
+      "learning_rate": 0.0002559009009009009,
+      "loss": 0.467,
+      "step": 989
+    },
+    {
+      "epoch": 0.29679595278246207,
+      "grad_norm": 0.2240372598171234,
+      "learning_rate": 0.00025585585585585584,
+      "loss": 0.4541,
+      "step": 990
+    },
+    {
+      "epoch": 0.2970957466741615,
+      "grad_norm": 0.23554278910160065,
+      "learning_rate": 0.0002558108108108108,
+      "loss": 0.4627,
+      "step": 991
+    },
+    {
+      "epoch": 0.297395540565861,
+      "grad_norm": 0.24655288457870483,
+      "learning_rate": 0.00025576576576576577,
+      "loss": 0.4697,
+      "step": 992
+    },
+    {
+      "epoch": 0.2976953344575604,
+      "grad_norm": 0.2397495061159134,
+      "learning_rate": 0.0002557207207207207,
+      "loss": 0.4821,
+      "step": 993
+    },
+    {
+      "epoch": 0.2979951283492599,
+      "grad_norm": 0.24440963566303253,
+      "learning_rate": 0.00025567567567567564,
+      "loss": 0.4293,
+      "step": 994
+    },
+    {
+      "epoch": 0.2982949222409593,
+      "grad_norm": 0.230440154671669,
+      "learning_rate": 0.00025563063063063063,
+      "loss": 0.4592,
+      "step": 995
+    },
+    {
+      "epoch": 0.2985947161326588,
+      "grad_norm": 0.22211557626724243,
+      "learning_rate": 0.00025558558558558557,
+      "loss": 0.4324,
+      "step": 996
+    },
+    {
+      "epoch": 0.29889451002435824,
+      "grad_norm": 0.22826789319515228,
+      "learning_rate": 0.0002555405405405405,
+      "loss": 0.447,
+      "step": 997
+    },
+    {
+      "epoch": 0.2991943039160577,
+      "grad_norm": 0.24060975015163422,
+      "learning_rate": 0.0002554954954954955,
+      "loss": 0.4954,
+      "step": 998
+    },
+    {
+      "epoch": 0.29949409780775715,
+      "grad_norm": 0.2227400243282318,
+      "learning_rate": 0.00025545045045045043,
+      "loss": 0.4503,
+      "step": 999
+    },
+    {
+      "epoch": 0.29979389169945664,
+      "grad_norm": 0.23061898350715637,
+      "learning_rate": 0.00025540540540540537,
+      "loss": 0.4356,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29979389169945664,
+      "eval_loss": 0.46561944484710693,
+      "eval_runtime": 566.7946,
+      "eval_samples_per_second": 3.809,
+      "eval_steps_per_second": 0.476,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30009368559115607,
+      "grad_norm": 0.25651443004608154,
+      "learning_rate": 0.00025536036036036036,
+      "loss": 0.4813,
+      "step": 1001
+    },
+    {
+      "epoch": 0.30039347948285555,
+      "grad_norm": 0.23068277537822723,
+      "learning_rate": 0.0002553153153153153,
+      "loss": 0.4563,
+      "step": 1002
+    },
+    {
+      "epoch": 0.300693273374555,
+      "grad_norm": 0.2346974015235901,
+      "learning_rate": 0.00025527027027027024,
+      "loss": 0.4471,
+      "step": 1003
+    },
+    {
+      "epoch": 0.30099306726625447,
+      "grad_norm": 0.22291089594364166,
+      "learning_rate": 0.0002552252252252252,
+      "loss": 0.4399,
+      "step": 1004
+    },
+    {
+      "epoch": 0.3012928611579539,
+      "grad_norm": 0.24533626437187195,
+      "learning_rate": 0.00025518018018018016,
+      "loss": 0.4949,
+      "step": 1005
+    },
+    {
+      "epoch": 0.3015926550496534,
+      "grad_norm": 0.2337205857038498,
+      "learning_rate": 0.0002551351351351351,
+      "loss": 0.4815,
+      "step": 1006
+    },
+    {
+      "epoch": 0.3018924489413528,
+      "grad_norm": 0.23926278948783875,
+      "learning_rate": 0.0002550900900900901,
+      "loss": 0.4742,
+      "step": 1007
+    },
+    {
+      "epoch": 0.3021922428330523,
+      "grad_norm": 0.23630262911319733,
+      "learning_rate": 0.00025504504504504503,
+      "loss": 0.4643,
+      "step": 1008
+    },
+    {
+      "epoch": 0.3024920367247517,
+      "grad_norm": 0.2534981071949005,
+      "learning_rate": 0.00025499999999999996,
+      "loss": 0.4806,
+      "step": 1009
+    },
+    {
+      "epoch": 0.3027918306164512,
+      "grad_norm": 0.22765369713306427,
+      "learning_rate": 0.00025495495495495496,
+      "loss": 0.4562,
+      "step": 1010
+    },
+    {
+      "epoch": 0.30309162450815064,
+      "grad_norm": 0.23825423419475555,
+      "learning_rate": 0.0002549099099099099,
+      "loss": 0.4642,
+      "step": 1011
+    },
+    {
+      "epoch": 0.3033914183998501,
+      "grad_norm": 0.2275952249765396,
+      "learning_rate": 0.00025486486486486483,
+      "loss": 0.4716,
+      "step": 1012
+    },
+    {
+      "epoch": 0.30369121229154955,
+      "grad_norm": 0.23309756815433502,
+      "learning_rate": 0.00025481981981981977,
+      "loss": 0.467,
+      "step": 1013
+    },
+    {
+      "epoch": 0.30399100618324904,
+      "grad_norm": 0.2582738995552063,
+      "learning_rate": 0.00025477477477477476,
+      "loss": 0.4756,
+      "step": 1014
+    },
+    {
+      "epoch": 0.30429080007494846,
+      "grad_norm": 0.21543192863464355,
+      "learning_rate": 0.0002547297297297297,
+      "loss": 0.4184,
+      "step": 1015
+    },
+    {
+      "epoch": 0.30459059396664795,
+      "grad_norm": 0.22537867724895477,
+      "learning_rate": 0.00025468468468468463,
+      "loss": 0.451,
+      "step": 1016
+    },
+    {
+      "epoch": 0.3048903878583474,
+      "grad_norm": 0.22783374786376953,
+      "learning_rate": 0.0002546396396396396,
+      "loss": 0.4595,
+      "step": 1017
+    },
+    {
+      "epoch": 0.30519018175004686,
+      "grad_norm": 0.2382606863975525,
+      "learning_rate": 0.00025459459459459456,
+      "loss": 0.4465,
+      "step": 1018
+    },
+    {
+      "epoch": 0.3054899756417463,
+      "grad_norm": 0.23583681881427765,
+      "learning_rate": 0.0002545495495495495,
+      "loss": 0.453,
+      "step": 1019
+    },
+    {
+      "epoch": 0.3057897695334458,
+      "grad_norm": 0.24536754190921783,
+      "learning_rate": 0.0002545045045045045,
+      "loss": 0.4829,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3060895634251452,
+      "grad_norm": 0.21961447596549988,
+      "learning_rate": 0.0002544594594594594,
+      "loss": 0.4555,
+      "step": 1021
+    },
+    {
+      "epoch": 0.3063893573168447,
+      "grad_norm": 0.22678659856319427,
+      "learning_rate": 0.00025441441441441436,
+      "loss": 0.4423,
+      "step": 1022
+    },
+    {
+      "epoch": 0.3066891512085441,
+      "grad_norm": 0.23871202766895294,
+      "learning_rate": 0.00025436936936936935,
+      "loss": 0.4657,
+      "step": 1023
+    },
+    {
+      "epoch": 0.3069889451002436,
+      "grad_norm": 0.23637500405311584,
+      "learning_rate": 0.0002543243243243243,
+      "loss": 0.4665,
+      "step": 1024
+    },
+    {
+      "epoch": 0.30728873899194303,
+      "grad_norm": 0.22538509964942932,
+      "learning_rate": 0.0002542792792792792,
+      "loss": 0.4349,
+      "step": 1025
+    },
+    {
+      "epoch": 0.3075885328836425,
+      "grad_norm": 0.2449694573879242,
+      "learning_rate": 0.0002542342342342342,
+      "loss": 0.465,
+      "step": 1026
+    },
+    {
+      "epoch": 0.30788832677534195,
+      "grad_norm": 0.22211913764476776,
+      "learning_rate": 0.00025418918918918915,
+      "loss": 0.4276,
+      "step": 1027
+    },
+    {
+      "epoch": 0.30818812066704143,
+      "grad_norm": 0.25039684772491455,
+      "learning_rate": 0.0002541441441441441,
+      "loss": 0.4799,
+      "step": 1028
+    },
+    {
+      "epoch": 0.30848791455874086,
+      "grad_norm": 0.22903893887996674,
+      "learning_rate": 0.0002540990990990991,
+      "loss": 0.4679,
+      "step": 1029
+    },
+    {
+      "epoch": 0.30878770845044035,
+      "grad_norm": 0.2479073703289032,
+      "learning_rate": 0.000254054054054054,
+      "loss": 0.4722,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3090875023421398,
+      "grad_norm": 0.22166845202445984,
+      "learning_rate": 0.00025400900900900895,
+      "loss": 0.4354,
+      "step": 1031
+    },
+    {
+      "epoch": 0.30938729623383926,
+      "grad_norm": 0.22735866904258728,
+      "learning_rate": 0.00025396396396396394,
+      "loss": 0.4344,
+      "step": 1032
+    },
+    {
+      "epoch": 0.3096870901255387,
+      "grad_norm": 0.2471131831407547,
+      "learning_rate": 0.0002539189189189189,
+      "loss": 0.4751,
+      "step": 1033
+    },
+    {
+      "epoch": 0.3099868840172382,
+      "grad_norm": 0.22858619689941406,
+      "learning_rate": 0.00025387387387387387,
+      "loss": 0.4589,
+      "step": 1034
+    },
+    {
+      "epoch": 0.3102866779089376,
+      "grad_norm": 0.2418094426393509,
+      "learning_rate": 0.0002538288288288288,
+      "loss": 0.4873,
+      "step": 1035
+    },
+    {
+      "epoch": 0.3105864718006371,
+      "grad_norm": 0.2313918024301529,
+      "learning_rate": 0.00025378378378378374,
+      "loss": 0.4819,
+      "step": 1036
+    },
+    {
+      "epoch": 0.3108862656923365,
+      "grad_norm": 0.22201773524284363,
+      "learning_rate": 0.00025373873873873874,
+      "loss": 0.4553,
+      "step": 1037
+    },
+    {
+      "epoch": 0.311186059584036,
+      "grad_norm": 0.2284671664237976,
+      "learning_rate": 0.00025369369369369367,
+      "loss": 0.4485,
+      "step": 1038
+    },
+    {
+      "epoch": 0.31148585347573543,
+      "grad_norm": 0.2529887855052948,
+      "learning_rate": 0.0002536486486486486,
+      "loss": 0.4674,
+      "step": 1039
+    },
+    {
+      "epoch": 0.3117856473674349,
+      "grad_norm": 0.23350189626216888,
+      "learning_rate": 0.0002536036036036036,
+      "loss": 0.4835,
+      "step": 1040
+    },
+    {
+      "epoch": 0.31208544125913434,
+      "grad_norm": 0.23258428275585175,
+      "learning_rate": 0.00025355855855855854,
+      "loss": 0.4458,
+      "step": 1041
+    },
+    {
+      "epoch": 0.31238523515083383,
+      "grad_norm": 0.23113112151622772,
+      "learning_rate": 0.0002535135135135135,
+      "loss": 0.4581,
+      "step": 1042
+    },
+    {
+      "epoch": 0.31268502904253326,
+      "grad_norm": 0.22711153328418732,
+      "learning_rate": 0.00025346846846846846,
+      "loss": 0.4485,
+      "step": 1043
+    },
+    {
+      "epoch": 0.31298482293423274,
+      "grad_norm": 0.23305024206638336,
+      "learning_rate": 0.0002534234234234234,
+      "loss": 0.4316,
+      "step": 1044
+    },
+    {
+      "epoch": 0.31328461682593217,
+      "grad_norm": 0.24723917245864868,
+      "learning_rate": 0.0002533783783783784,
+      "loss": 0.4512,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3135844107176316,
+      "grad_norm": 0.21640846133232117,
+      "learning_rate": 0.00025333333333333333,
+      "loss": 0.4485,
+      "step": 1046
+    },
+    {
+      "epoch": 0.3138842046093311,
+      "grad_norm": 0.25021156668663025,
+      "learning_rate": 0.00025328828828828827,
+      "loss": 0.4708,
+      "step": 1047
+    },
+    {
+      "epoch": 0.3141839985010305,
+      "grad_norm": 0.24005773663520813,
+      "learning_rate": 0.00025324324324324326,
+      "loss": 0.4698,
+      "step": 1048
+    },
+    {
+      "epoch": 0.31448379239273,
+      "grad_norm": 0.24885396659374237,
+      "learning_rate": 0.0002531981981981982,
+      "loss": 0.4899,
+      "step": 1049
+    },
+    {
+      "epoch": 0.31478358628442943,
+      "grad_norm": 0.2413524091243744,
+      "learning_rate": 0.00025315315315315313,
+      "loss": 0.4776,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3150833801761289,
+      "grad_norm": 0.25239062309265137,
+      "learning_rate": 0.0002531081081081081,
+      "loss": 0.4788,
+      "step": 1051
+    },
+    {
+      "epoch": 0.31538317406782834,
+      "grad_norm": 0.23389939963817596,
+      "learning_rate": 0.00025306306306306306,
+      "loss": 0.4299,
+      "step": 1052
+    },
+    {
+      "epoch": 0.3156829679595278,
+      "grad_norm": 0.2468218207359314,
+      "learning_rate": 0.000253018018018018,
+      "loss": 0.4759,
+      "step": 1053
+    },
+    {
+      "epoch": 0.31598276185122726,
+      "grad_norm": 0.2298142910003662,
+      "learning_rate": 0.000252972972972973,
+      "loss": 0.4658,
+      "step": 1054
+    },
+    {
+      "epoch": 0.31628255574292674,
+      "grad_norm": 0.22888216376304626,
+      "learning_rate": 0.0002529279279279279,
+      "loss": 0.4419,
+      "step": 1055
+    },
+    {
+      "epoch": 0.31658234963462617,
+      "grad_norm": 0.23855063319206238,
+      "learning_rate": 0.00025288288288288286,
+      "loss": 0.4642,
+      "step": 1056
+    },
+    {
+      "epoch": 0.31688214352632565,
+      "grad_norm": 0.24454447627067566,
+      "learning_rate": 0.00025283783783783785,
+      "loss": 0.4495,
+      "step": 1057
+    },
+    {
+      "epoch": 0.3171819374180251,
+      "grad_norm": 0.22794046998023987,
+      "learning_rate": 0.0002527927927927928,
+      "loss": 0.4474,
+      "step": 1058
+    },
+    {
+      "epoch": 0.31748173130972457,
+      "grad_norm": 0.248634934425354,
+      "learning_rate": 0.0002527477477477477,
+      "loss": 0.4694,
+      "step": 1059
+    },
+    {
+      "epoch": 0.317781525201424,
+      "grad_norm": 0.24363334476947784,
+      "learning_rate": 0.00025270270270270266,
+      "loss": 0.4549,
+      "step": 1060
+    },
+    {
+      "epoch": 0.3180813190931235,
+      "grad_norm": 0.23220765590667725,
+      "learning_rate": 0.00025265765765765765,
+      "loss": 0.4478,
+      "step": 1061
+    },
+    {
+      "epoch": 0.3183811129848229,
+      "grad_norm": 0.22161665558815002,
+      "learning_rate": 0.0002526126126126126,
+      "loss": 0.4549,
+      "step": 1062
+    },
+    {
+      "epoch": 0.3186809068765224,
+      "grad_norm": 0.24613521993160248,
+      "learning_rate": 0.0002525675675675675,
+      "loss": 0.4505,
+      "step": 1063
+    },
+    {
+      "epoch": 0.3189807007682218,
+      "grad_norm": 0.26228928565979004,
+      "learning_rate": 0.0002525225225225225,
+      "loss": 0.4878,
+      "step": 1064
+    },
+    {
+      "epoch": 0.3192804946599213,
+      "grad_norm": 0.2279721200466156,
+      "learning_rate": 0.00025247747747747745,
+      "loss": 0.4481,
+      "step": 1065
+    },
+    {
+      "epoch": 0.31958028855162074,
+      "grad_norm": 0.24583470821380615,
+      "learning_rate": 0.0002524324324324324,
+      "loss": 0.4643,
+      "step": 1066
+    },
+    {
+      "epoch": 0.3198800824433202,
+      "grad_norm": 0.24150992929935455,
+      "learning_rate": 0.0002523873873873874,
+      "loss": 0.4705,
+      "step": 1067
+    },
+    {
+      "epoch": 0.32017987633501965,
+      "grad_norm": 0.2419997900724411,
+      "learning_rate": 0.0002523423423423423,
+      "loss": 0.4648,
+      "step": 1068
+    },
+    {
+      "epoch": 0.32047967022671914,
+      "grad_norm": 0.26776254177093506,
+      "learning_rate": 0.00025229729729729725,
+      "loss": 0.5013,
+      "step": 1069
+    },
+    {
+      "epoch": 0.32077946411841857,
+      "grad_norm": 0.24678350985050201,
+      "learning_rate": 0.00025225225225225225,
+      "loss": 0.449,
+      "step": 1070
+    },
+    {
+      "epoch": 0.32107925801011805,
+      "grad_norm": 0.24101199209690094,
+      "learning_rate": 0.0002522072072072072,
+      "loss": 0.469,
+      "step": 1071
+    },
+    {
+      "epoch": 0.3213790519018175,
+      "grad_norm": 0.24230711162090302,
+      "learning_rate": 0.0002521621621621621,
+      "loss": 0.481,
+      "step": 1072
+    },
+    {
+      "epoch": 0.32167884579351697,
+      "grad_norm": 0.22988542914390564,
+      "learning_rate": 0.0002521171171171171,
+      "loss": 0.4442,
+      "step": 1073
+    },
+    {
+      "epoch": 0.3219786396852164,
+      "grad_norm": 0.23284588754177094,
+      "learning_rate": 0.00025207207207207205,
+      "loss": 0.4392,
+      "step": 1074
+    },
+    {
+      "epoch": 0.3222784335769159,
+      "grad_norm": 0.24554894864559174,
+      "learning_rate": 0.000252027027027027,
+      "loss": 0.4544,
+      "step": 1075
+    },
+    {
+      "epoch": 0.3225782274686153,
+      "grad_norm": 0.22776508331298828,
+      "learning_rate": 0.000251981981981982,
+      "loss": 0.4503,
+      "step": 1076
+    },
+    {
+      "epoch": 0.3228780213603148,
+      "grad_norm": 0.2508374750614166,
+      "learning_rate": 0.0002519369369369369,
+      "loss": 0.4779,
+      "step": 1077
+    },
+    {
+      "epoch": 0.3231778152520142,
+      "grad_norm": 0.22543244063854218,
+      "learning_rate": 0.00025189189189189185,
+      "loss": 0.4672,
+      "step": 1078
+    },
+    {
+      "epoch": 0.3234776091437137,
+      "grad_norm": 0.2409958839416504,
+      "learning_rate": 0.00025184684684684684,
+      "loss": 0.4631,
+      "step": 1079
+    },
+    {
+      "epoch": 0.32377740303541314,
+      "grad_norm": 0.2308938056230545,
+      "learning_rate": 0.0002518018018018018,
+      "loss": 0.4244,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3240771969271126,
+      "grad_norm": 0.2354745715856552,
+      "learning_rate": 0.0002517567567567567,
+      "loss": 0.4499,
+      "step": 1081
+    },
+    {
+      "epoch": 0.32437699081881205,
+      "grad_norm": 0.24564653635025024,
+      "learning_rate": 0.0002517117117117117,
+      "loss": 0.4655,
+      "step": 1082
+    },
+    {
+      "epoch": 0.32467678471051153,
+      "grad_norm": 0.2388393133878708,
+      "learning_rate": 0.00025166666666666664,
+      "loss": 0.4747,
+      "step": 1083
+    },
+    {
+      "epoch": 0.32497657860221096,
+      "grad_norm": 0.23941588401794434,
+      "learning_rate": 0.0002516216216216216,
+      "loss": 0.4646,
+      "step": 1084
+    },
+    {
+      "epoch": 0.32527637249391045,
+      "grad_norm": 0.24191126227378845,
+      "learning_rate": 0.0002515765765765765,
+      "loss": 0.4686,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3255761663856099,
+      "grad_norm": 0.2466372847557068,
+      "learning_rate": 0.0002515315315315315,
+      "loss": 0.4989,
+      "step": 1086
+    },
+    {
+      "epoch": 0.32587596027730936,
+      "grad_norm": 0.2441006302833557,
+      "learning_rate": 0.00025148648648648644,
+      "loss": 0.4562,
+      "step": 1087
+    },
+    {
+      "epoch": 0.3261757541690088,
+      "grad_norm": 0.26483842730522156,
+      "learning_rate": 0.0002514414414414414,
+      "loss": 0.4872,
+      "step": 1088
+    },
+    {
+      "epoch": 0.3264755480607083,
+      "grad_norm": 0.2481161653995514,
+      "learning_rate": 0.00025139639639639637,
+      "loss": 0.4486,
+      "step": 1089
+    },
+    {
+      "epoch": 0.3267753419524077,
+      "grad_norm": 0.23705454170703888,
+      "learning_rate": 0.0002513513513513513,
+      "loss": 0.4407,
+      "step": 1090
+    },
+    {
+      "epoch": 0.3270751358441072,
+      "grad_norm": 0.25678539276123047,
+      "learning_rate": 0.0002513063063063063,
+      "loss": 0.4899,
+      "step": 1091
+    },
+    {
+      "epoch": 0.3273749297358066,
+      "grad_norm": 0.22578591108322144,
+      "learning_rate": 0.00025126126126126123,
+      "loss": 0.4326,
+      "step": 1092
+    },
+    {
+      "epoch": 0.3276747236275061,
+      "grad_norm": 0.23661458492279053,
+      "learning_rate": 0.00025121621621621617,
+      "loss": 0.4649,
+      "step": 1093
+    },
+    {
+      "epoch": 0.32797451751920553,
+      "grad_norm": 0.2496035248041153,
+      "learning_rate": 0.00025117117117117116,
+      "loss": 0.4978,
+      "step": 1094
+    },
+    {
+      "epoch": 0.328274311410905,
+      "grad_norm": 0.225214421749115,
+      "learning_rate": 0.0002511261261261261,
+      "loss": 0.4608,
+      "step": 1095
+    },
+    {
+      "epoch": 0.32857410530260445,
+      "grad_norm": 0.24089965224266052,
+      "learning_rate": 0.00025108108108108103,
+      "loss": 0.4854,
+      "step": 1096
+    },
+    {
+      "epoch": 0.32887389919430393,
+      "grad_norm": 0.23737536370754242,
+      "learning_rate": 0.000251036036036036,
+      "loss": 0.4692,
+      "step": 1097
+    },
+    {
+      "epoch": 0.32917369308600336,
+      "grad_norm": 0.23569715023040771,
+      "learning_rate": 0.00025099099099099096,
+      "loss": 0.44,
+      "step": 1098
+    },
+    {
+      "epoch": 0.32947348697770285,
+      "grad_norm": 0.22477473318576813,
+      "learning_rate": 0.0002509459459459459,
+      "loss": 0.4168,
+      "step": 1099
+    },
+    {
+      "epoch": 0.3297732808694023,
+      "grad_norm": 0.25336310267448425,
+      "learning_rate": 0.0002509009009009009,
+      "loss": 0.4493,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33007307476110176,
+      "grad_norm": 0.2452186793088913,
+      "learning_rate": 0.00025085585585585583,
+      "loss": 0.484,
+      "step": 1101
+    },
+    {
+      "epoch": 0.3303728686528012,
+      "grad_norm": 0.23870813846588135,
+      "learning_rate": 0.0002508108108108108,
+      "loss": 0.4228,
+      "step": 1102
+    },
+    {
+      "epoch": 0.3306726625445007,
+      "grad_norm": 0.2262556552886963,
+      "learning_rate": 0.00025076576576576575,
+      "loss": 0.4525,
+      "step": 1103
+    },
+    {
+      "epoch": 0.3309724564362001,
+      "grad_norm": 0.2720157504081726,
+      "learning_rate": 0.0002507207207207207,
+      "loss": 0.4961,
+      "step": 1104
+    },
+    {
+      "epoch": 0.3312722503278996,
+      "grad_norm": 0.23624925315380096,
+      "learning_rate": 0.0002506756756756757,
+      "loss": 0.4524,
+      "step": 1105
+    },
+    {
+      "epoch": 0.331572044219599,
+      "grad_norm": 0.2634661793708801,
+      "learning_rate": 0.0002506306306306306,
+      "loss": 0.4603,
+      "step": 1106
+    },
+    {
+      "epoch": 0.3318718381112985,
+      "grad_norm": 0.24629558622837067,
+      "learning_rate": 0.00025058558558558556,
+      "loss": 0.453,
+      "step": 1107
+    },
+    {
+      "epoch": 0.33217163200299793,
+      "grad_norm": 0.2503926157951355,
+      "learning_rate": 0.00025054054054054055,
+      "loss": 0.4756,
+      "step": 1108
+    },
+    {
+      "epoch": 0.3324714258946974,
+      "grad_norm": 0.22624404728412628,
+      "learning_rate": 0.0002504954954954955,
+      "loss": 0.4391,
+      "step": 1109
+    },
+    {
+      "epoch": 0.33277121978639684,
+      "grad_norm": 0.22146141529083252,
+      "learning_rate": 0.0002504504504504504,
+      "loss": 0.4496,
+      "step": 1110
+    },
+    {
+      "epoch": 0.33307101367809633,
+      "grad_norm": 0.23911216855049133,
+      "learning_rate": 0.0002504054054054054,
+      "loss": 0.4584,
+      "step": 1111
+    },
+    {
+      "epoch": 0.33337080756979576,
+      "grad_norm": 0.23537759482860565,
+      "learning_rate": 0.00025036036036036035,
+      "loss": 0.4735,
+      "step": 1112
+    },
+    {
+      "epoch": 0.33367060146149524,
+      "grad_norm": 0.23113307356834412,
+      "learning_rate": 0.0002503153153153153,
+      "loss": 0.4393,
+      "step": 1113
+    },
+    {
+      "epoch": 0.33397039535319467,
+      "grad_norm": 0.24185238778591156,
+      "learning_rate": 0.0002502702702702703,
+      "loss": 0.4661,
+      "step": 1114
+    },
+    {
+      "epoch": 0.33427018924489416,
+      "grad_norm": 0.25618115067481995,
+      "learning_rate": 0.0002502252252252252,
+      "loss": 0.4777,
+      "step": 1115
+    },
+    {
+      "epoch": 0.3345699831365936,
+      "grad_norm": 0.24486567080020905,
+      "learning_rate": 0.00025018018018018015,
+      "loss": 0.4589,
+      "step": 1116
+    },
+    {
+      "epoch": 0.33486977702829307,
+      "grad_norm": 0.2608473300933838,
+      "learning_rate": 0.00025013513513513514,
+      "loss": 0.4716,
+      "step": 1117
+    },
+    {
+      "epoch": 0.3351695709199925,
+      "grad_norm": 0.2427588254213333,
+      "learning_rate": 0.0002500900900900901,
+      "loss": 0.4856,
+      "step": 1118
+    },
+    {
+      "epoch": 0.335469364811692,
+      "grad_norm": 0.2493797391653061,
+      "learning_rate": 0.000250045045045045,
+      "loss": 0.4925,
+      "step": 1119
+    },
+    {
+      "epoch": 0.3357691587033914,
+      "grad_norm": 0.2610112428665161,
+      "learning_rate": 0.00025,
+      "loss": 0.4685,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3360689525950909,
+      "grad_norm": 0.2435634434223175,
+      "learning_rate": 0.00024995495495495494,
+      "loss": 0.4678,
+      "step": 1121
+    },
+    {
+      "epoch": 0.3363687464867903,
+      "grad_norm": 0.24447932839393616,
+      "learning_rate": 0.0002499099099099099,
+      "loss": 0.4656,
+      "step": 1122
+    },
+    {
+      "epoch": 0.3366685403784898,
+      "grad_norm": 0.24005521833896637,
+      "learning_rate": 0.00024986486486486487,
+      "loss": 0.4712,
+      "step": 1123
+    },
+    {
+      "epoch": 0.33696833427018924,
+      "grad_norm": 0.2303832322359085,
+      "learning_rate": 0.0002498198198198198,
+      "loss": 0.4527,
+      "step": 1124
+    },
+    {
+      "epoch": 0.3372681281618887,
+      "grad_norm": 0.221012681722641,
+      "learning_rate": 0.00024977477477477474,
+      "loss": 0.4376,
+      "step": 1125
+    },
+    {
+      "epoch": 0.33756792205358815,
+      "grad_norm": 0.23421809077262878,
+      "learning_rate": 0.00024972972972972973,
+      "loss": 0.4449,
+      "step": 1126
+    },
+    {
+      "epoch": 0.33786771594528764,
+      "grad_norm": 0.23941418528556824,
+      "learning_rate": 0.00024968468468468467,
+      "loss": 0.4679,
+      "step": 1127
+    },
+    {
+      "epoch": 0.33816750983698707,
+      "grad_norm": 0.2479025423526764,
+      "learning_rate": 0.0002496396396396396,
+      "loss": 0.4872,
+      "step": 1128
+    },
+    {
+      "epoch": 0.33846730372868655,
+      "grad_norm": 0.24516451358795166,
+      "learning_rate": 0.0002495945945945946,
+      "loss": 0.4488,
+      "step": 1129
+    },
+    {
+      "epoch": 0.338767097620386,
+      "grad_norm": 0.2436760663986206,
+      "learning_rate": 0.00024954954954954954,
+      "loss": 0.4477,
+      "step": 1130
+    },
+    {
+      "epoch": 0.33906689151208547,
+      "grad_norm": 0.23894813656806946,
+      "learning_rate": 0.00024950450450450447,
+      "loss": 0.4295,
+      "step": 1131
+    },
+    {
+      "epoch": 0.3393666854037849,
+      "grad_norm": 0.24569731950759888,
+      "learning_rate": 0.00024945945945945946,
+      "loss": 0.46,
+      "step": 1132
+    },
+    {
+      "epoch": 0.3396664792954844,
+      "grad_norm": 0.24807578325271606,
+      "learning_rate": 0.0002494144144144144,
+      "loss": 0.4512,
+      "step": 1133
+    },
+    {
+      "epoch": 0.3399662731871838,
+      "grad_norm": 0.23641791939735413,
+      "learning_rate": 0.00024936936936936934,
+      "loss": 0.454,
+      "step": 1134
+    },
+    {
+      "epoch": 0.3402660670788833,
+      "grad_norm": 0.26076388359069824,
+      "learning_rate": 0.0002493243243243243,
+      "loss": 0.4771,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3405658609705827,
+      "grad_norm": 0.24686305224895477,
+      "learning_rate": 0.00024927927927927926,
+      "loss": 0.4783,
+      "step": 1136
+    },
+    {
+      "epoch": 0.3408656548622822,
+      "grad_norm": 0.2262791097164154,
+      "learning_rate": 0.0002492342342342342,
+      "loss": 0.4272,
+      "step": 1137
+    },
+    {
+      "epoch": 0.34116544875398164,
+      "grad_norm": 0.23418664932250977,
+      "learning_rate": 0.00024918918918918914,
+      "loss": 0.4666,
+      "step": 1138
+    },
+    {
+      "epoch": 0.3414652426456811,
+      "grad_norm": 0.23737958073616028,
+      "learning_rate": 0.00024914414414414413,
+      "loss": 0.4433,
+      "step": 1139
+    },
+    {
+      "epoch": 0.34176503653738055,
+      "grad_norm": 0.2579478919506073,
+      "learning_rate": 0.00024909909909909907,
+      "loss": 0.4702,
+      "step": 1140
+    },
+    {
+      "epoch": 0.34206483042908,
+      "grad_norm": 0.2627730667591095,
+      "learning_rate": 0.000249054054054054,
+      "loss": 0.4607,
+      "step": 1141
+    },
+    {
+      "epoch": 0.34236462432077946,
+      "grad_norm": 0.2283281534910202,
+      "learning_rate": 0.000249009009009009,
+      "loss": 0.4462,
+      "step": 1142
+    },
+    {
+      "epoch": 0.3426644182124789,
+      "grad_norm": 0.24083632230758667,
+      "learning_rate": 0.00024896396396396393,
+      "loss": 0.4663,
+      "step": 1143
+    },
+    {
+      "epoch": 0.3429642121041784,
+      "grad_norm": 0.24331289529800415,
+      "learning_rate": 0.00024891891891891887,
+      "loss": 0.4414,
+      "step": 1144
+    },
+    {
+      "epoch": 0.3432640059958778,
+      "grad_norm": 0.24059659242630005,
+      "learning_rate": 0.00024887387387387386,
+      "loss": 0.4728,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3435637998875773,
+      "grad_norm": 0.23175103962421417,
+      "learning_rate": 0.0002488288288288288,
+      "loss": 0.437,
+      "step": 1146
+    },
+    {
+      "epoch": 0.3438635937792767,
+      "grad_norm": 0.23247724771499634,
+      "learning_rate": 0.00024878378378378373,
+      "loss": 0.4454,
+      "step": 1147
+    },
+    {
+      "epoch": 0.3441633876709762,
+      "grad_norm": 0.22808894515037537,
+      "learning_rate": 0.0002487387387387387,
+      "loss": 0.4379,
+      "step": 1148
+    },
+    {
+      "epoch": 0.34446318156267564,
+      "grad_norm": 0.2515697777271271,
+      "learning_rate": 0.00024869369369369366,
+      "loss": 0.4938,
+      "step": 1149
+    },
+    {
+      "epoch": 0.3447629754543751,
+      "grad_norm": 0.22830873727798462,
+      "learning_rate": 0.0002486486486486486,
+      "loss": 0.4335,
+      "step": 1150
+    },
+    {
+      "epoch": 0.34506276934607455,
+      "grad_norm": 0.2352674901485443,
+      "learning_rate": 0.0002486036036036036,
+      "loss": 0.4661,
+      "step": 1151
+    },
+    {
+      "epoch": 0.34536256323777403,
+      "grad_norm": 0.233395054936409,
+      "learning_rate": 0.0002485585585585585,
+      "loss": 0.4429,
+      "step": 1152
+    },
+    {
+      "epoch": 0.34566235712947346,
+      "grad_norm": 0.2488911747932434,
+      "learning_rate": 0.00024851351351351346,
+      "loss": 0.4832,
+      "step": 1153
+    },
+    {
+      "epoch": 0.34596215102117295,
+      "grad_norm": 0.2402927577495575,
+      "learning_rate": 0.00024846846846846845,
+      "loss": 0.4633,
+      "step": 1154
+    },
+    {
+      "epoch": 0.3462619449128724,
+      "grad_norm": 0.23628897964954376,
+      "learning_rate": 0.0002484234234234234,
+      "loss": 0.4683,
+      "step": 1155
+    },
+    {
+      "epoch": 0.34656173880457186,
+      "grad_norm": 0.23966571688652039,
+      "learning_rate": 0.0002483783783783783,
+      "loss": 0.4716,
+      "step": 1156
+    },
+    {
+      "epoch": 0.3468615326962713,
+      "grad_norm": 0.23786477744579315,
+      "learning_rate": 0.0002483333333333333,
+      "loss": 0.4829,
+      "step": 1157
+    },
+    {
+      "epoch": 0.3471613265879708,
+      "grad_norm": 0.24706590175628662,
+      "learning_rate": 0.00024828828828828825,
+      "loss": 0.4741,
+      "step": 1158
+    },
+    {
+      "epoch": 0.3474611204796702,
+      "grad_norm": 0.25192269682884216,
+      "learning_rate": 0.00024824324324324324,
+      "loss": 0.4725,
+      "step": 1159
+    },
+    {
+      "epoch": 0.3477609143713697,
+      "grad_norm": 0.2672080099582672,
+      "learning_rate": 0.0002481981981981982,
+      "loss": 0.4677,
+      "step": 1160
+    },
+    {
+      "epoch": 0.3480607082630691,
+      "grad_norm": 0.23710590600967407,
+      "learning_rate": 0.0002481531531531531,
+      "loss": 0.4629,
+      "step": 1161
+    },
+    {
+      "epoch": 0.3483605021547686,
+      "grad_norm": 0.2306007295846939,
+      "learning_rate": 0.0002481081081081081,
+      "loss": 0.4425,
+      "step": 1162
+    },
+    {
+      "epoch": 0.34866029604646803,
+      "grad_norm": 0.2416974902153015,
+      "learning_rate": 0.00024806306306306305,
+      "loss": 0.4452,
+      "step": 1163
+    },
+    {
+      "epoch": 0.3489600899381675,
+      "grad_norm": 0.2495068907737732,
+      "learning_rate": 0.000248018018018018,
+      "loss": 0.4875,
+      "step": 1164
+    },
+    {
+      "epoch": 0.34925988382986695,
+      "grad_norm": 0.26994964480400085,
+      "learning_rate": 0.00024797297297297297,
+      "loss": 0.4957,
+      "step": 1165
+    },
+    {
+      "epoch": 0.34955967772156643,
+      "grad_norm": 0.2546437978744507,
+      "learning_rate": 0.0002479279279279279,
+      "loss": 0.498,
+      "step": 1166
+    },
+    {
+      "epoch": 0.34985947161326586,
+      "grad_norm": 0.2271340936422348,
+      "learning_rate": 0.00024788288288288285,
+      "loss": 0.4615,
+      "step": 1167
+    },
+    {
+      "epoch": 0.35015926550496534,
+      "grad_norm": 0.24870999157428741,
+      "learning_rate": 0.00024783783783783784,
+      "loss": 0.4688,
+      "step": 1168
+    },
+    {
+      "epoch": 0.3504590593966648,
+      "grad_norm": 0.23978911340236664,
+      "learning_rate": 0.0002477927927927928,
+      "loss": 0.4882,
+      "step": 1169
+    },
+    {
+      "epoch": 0.35075885328836426,
+      "grad_norm": 0.2773337662220001,
+      "learning_rate": 0.0002477477477477477,
+      "loss": 0.4695,
+      "step": 1170
+    },
+    {
+      "epoch": 0.3510586471800637,
+      "grad_norm": 0.24570350348949432,
+      "learning_rate": 0.0002477027027027027,
+      "loss": 0.4679,
+      "step": 1171
+    },
+    {
+      "epoch": 0.3513584410717632,
+      "grad_norm": 0.25563982129096985,
+      "learning_rate": 0.00024765765765765764,
+      "loss": 0.4731,
+      "step": 1172
+    },
+    {
+      "epoch": 0.3516582349634626,
+      "grad_norm": 0.23189115524291992,
+      "learning_rate": 0.00024761261261261263,
+      "loss": 0.4476,
+      "step": 1173
+    },
+    {
+      "epoch": 0.3519580288551621,
+      "grad_norm": 0.24074453115463257,
+      "learning_rate": 0.00024756756756756757,
+      "loss": 0.4419,
+      "step": 1174
+    },
+    {
+      "epoch": 0.3522578227468615,
+      "grad_norm": 0.2376662790775299,
+      "learning_rate": 0.0002475225225225225,
+      "loss": 0.4571,
+      "step": 1175
+    },
+    {
+      "epoch": 0.352557616638561,
+      "grad_norm": 0.2344047725200653,
+      "learning_rate": 0.0002474774774774775,
+      "loss": 0.4389,
+      "step": 1176
+    },
+    {
+      "epoch": 0.35285741053026043,
+      "grad_norm": 0.23310165107250214,
+      "learning_rate": 0.00024743243243243243,
+      "loss": 0.4583,
+      "step": 1177
+    },
+    {
+      "epoch": 0.3531572044219599,
+      "grad_norm": 0.21277011930942535,
+      "learning_rate": 0.00024738738738738737,
+      "loss": 0.4306,
+      "step": 1178
+    },
+    {
+      "epoch": 0.35345699831365934,
+      "grad_norm": 0.23581352829933167,
+      "learning_rate": 0.00024734234234234236,
+      "loss": 0.475,
+      "step": 1179
+    },
+    {
+      "epoch": 0.3537567922053588,
+      "grad_norm": 0.23194879293441772,
+      "learning_rate": 0.0002472972972972973,
+      "loss": 0.4518,
+      "step": 1180
+    },
+    {
+      "epoch": 0.35405658609705826,
+      "grad_norm": 0.22603453695774078,
+      "learning_rate": 0.00024725225225225223,
+      "loss": 0.4464,
+      "step": 1181
+    },
+    {
+      "epoch": 0.35435637998875774,
+      "grad_norm": 0.23987054824829102,
+      "learning_rate": 0.00024720720720720717,
+      "loss": 0.4586,
+      "step": 1182
+    },
+    {
+      "epoch": 0.35465617388045717,
+      "grad_norm": 0.22986359894275665,
+      "learning_rate": 0.00024716216216216216,
+      "loss": 0.4664,
+      "step": 1183
+    },
+    {
+      "epoch": 0.35495596777215666,
+      "grad_norm": 0.22636739909648895,
+      "learning_rate": 0.0002471171171171171,
+      "loss": 0.4236,
+      "step": 1184
+    },
+    {
+      "epoch": 0.3552557616638561,
+      "grad_norm": 0.2346397340297699,
+      "learning_rate": 0.00024707207207207203,
+      "loss": 0.4703,
+      "step": 1185
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.2564719617366791,
+      "learning_rate": 0.000247027027027027,
+      "loss": 0.4775,
+      "step": 1186
+    },
+    {
+      "epoch": 0.355855349447255,
+      "grad_norm": 0.22305525839328766,
+      "learning_rate": 0.00024698198198198196,
+      "loss": 0.4694,
+      "step": 1187
+    },
+    {
+      "epoch": 0.3561551433389545,
+      "grad_norm": 0.2369467169046402,
+      "learning_rate": 0.0002469369369369369,
+      "loss": 0.4498,
+      "step": 1188
+    },
+    {
+      "epoch": 0.3564549372306539,
+      "grad_norm": 0.25123798847198486,
+      "learning_rate": 0.0002468918918918919,
+      "loss": 0.4619,
+      "step": 1189
+    },
+    {
+      "epoch": 0.3567547311223534,
+      "grad_norm": 0.21925069391727448,
+      "learning_rate": 0.0002468468468468468,
+      "loss": 0.4498,
+      "step": 1190
+    },
+    {
+      "epoch": 0.3570545250140528,
+      "grad_norm": 0.2385261207818985,
+      "learning_rate": 0.00024680180180180176,
+      "loss": 0.4537,
+      "step": 1191
+    },
+    {
+      "epoch": 0.3573543189057523,
+      "grad_norm": 0.23894301056861877,
+      "learning_rate": 0.00024675675675675675,
+      "loss": 0.4665,
+      "step": 1192
+    },
+    {
+      "epoch": 0.35765411279745174,
+      "grad_norm": 0.23315206170082092,
+      "learning_rate": 0.0002467117117117117,
+      "loss": 0.4421,
+      "step": 1193
+    },
+    {
+      "epoch": 0.3579539066891512,
+      "grad_norm": 0.23406696319580078,
+      "learning_rate": 0.0002466666666666666,
+      "loss": 0.4377,
+      "step": 1194
+    },
+    {
+      "epoch": 0.35825370058085065,
+      "grad_norm": 0.25852885842323303,
+      "learning_rate": 0.0002466216216216216,
+      "loss": 0.4838,
+      "step": 1195
+    },
+    {
+      "epoch": 0.35855349447255014,
+      "grad_norm": 0.24008771777153015,
+      "learning_rate": 0.00024657657657657655,
+      "loss": 0.4733,
+      "step": 1196
+    },
+    {
+      "epoch": 0.35885328836424957,
+      "grad_norm": 0.228665292263031,
+      "learning_rate": 0.0002465315315315315,
+      "loss": 0.4753,
+      "step": 1197
+    },
+    {
+      "epoch": 0.35915308225594905,
+      "grad_norm": 0.2344791293144226,
+      "learning_rate": 0.0002464864864864865,
+      "loss": 0.4484,
+      "step": 1198
+    },
+    {
+      "epoch": 0.3594528761476485,
+      "grad_norm": 0.22843588888645172,
+      "learning_rate": 0.0002464414414414414,
+      "loss": 0.4349,
+      "step": 1199
+    },
+    {
+      "epoch": 0.35975267003934797,
+      "grad_norm": 0.23127557337284088,
+      "learning_rate": 0.00024639639639639636,
+      "loss": 0.4466,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3600524639310474,
+      "grad_norm": 0.2092585414648056,
+      "learning_rate": 0.00024635135135135135,
+      "loss": 0.4101,
+      "step": 1201
+    },
+    {
+      "epoch": 0.3603522578227469,
+      "grad_norm": 0.24416851997375488,
+      "learning_rate": 0.0002463063063063063,
+      "loss": 0.4507,
+      "step": 1202
+    },
+    {
+      "epoch": 0.3606520517144463,
+      "grad_norm": 0.2409181445837021,
+      "learning_rate": 0.0002462612612612612,
+      "loss": 0.4641,
+      "step": 1203
+    },
+    {
+      "epoch": 0.3609518456061458,
+      "grad_norm": 0.2390405684709549,
+      "learning_rate": 0.0002462162162162162,
+      "loss": 0.4398,
+      "step": 1204
+    },
+    {
+      "epoch": 0.3612516394978452,
+      "grad_norm": 0.25821688771247864,
+      "learning_rate": 0.00024617117117117115,
+      "loss": 0.4441,
+      "step": 1205
+    },
+    {
+      "epoch": 0.3615514333895447,
+      "grad_norm": 0.24180777370929718,
+      "learning_rate": 0.0002461261261261261,
+      "loss": 0.4852,
+      "step": 1206
+    },
+    {
+      "epoch": 0.36185122728124414,
+      "grad_norm": 0.2260608822107315,
+      "learning_rate": 0.000246081081081081,
+      "loss": 0.4214,
+      "step": 1207
+    },
+    {
+      "epoch": 0.3621510211729436,
+      "grad_norm": 0.2266250103712082,
+      "learning_rate": 0.000246036036036036,
+      "loss": 0.4104,
+      "step": 1208
+    },
+    {
+      "epoch": 0.36245081506464305,
+      "grad_norm": 0.247540682554245,
+      "learning_rate": 0.00024599099099099095,
+      "loss": 0.4563,
+      "step": 1209
+    },
+    {
+      "epoch": 0.36275060895634254,
+      "grad_norm": 0.22714072465896606,
+      "learning_rate": 0.0002459459459459459,
+      "loss": 0.4528,
+      "step": 1210
+    },
+    {
+      "epoch": 0.36305040284804196,
+      "grad_norm": 0.22302433848381042,
+      "learning_rate": 0.0002459009009009009,
+      "loss": 0.3982,
+      "step": 1211
+    },
+    {
+      "epoch": 0.36335019673974145,
+      "grad_norm": 0.2646171748638153,
+      "learning_rate": 0.0002458558558558558,
+      "loss": 0.4837,
+      "step": 1212
+    },
+    {
+      "epoch": 0.3636499906314409,
+      "grad_norm": 0.24546460807323456,
+      "learning_rate": 0.00024581081081081075,
+      "loss": 0.4716,
+      "step": 1213
+    },
+    {
+      "epoch": 0.36394978452314036,
+      "grad_norm": 0.2416929006576538,
+      "learning_rate": 0.00024576576576576574,
+      "loss": 0.4634,
+      "step": 1214
+    },
+    {
+      "epoch": 0.3642495784148398,
+      "grad_norm": 0.2360236495733261,
+      "learning_rate": 0.0002457207207207207,
+      "loss": 0.4409,
+      "step": 1215
+    },
+    {
+      "epoch": 0.3645493723065393,
+      "grad_norm": 0.24383249878883362,
+      "learning_rate": 0.00024567567567567567,
+      "loss": 0.4553,
+      "step": 1216
+    },
+    {
+      "epoch": 0.3648491661982387,
+      "grad_norm": 0.2516370117664337,
+      "learning_rate": 0.0002456306306306306,
+      "loss": 0.4553,
+      "step": 1217
+    },
+    {
+      "epoch": 0.3651489600899382,
+      "grad_norm": 0.2524015009403229,
+      "learning_rate": 0.00024558558558558554,
+      "loss": 0.4766,
+      "step": 1218
+    },
+    {
+      "epoch": 0.3654487539816376,
+      "grad_norm": 0.23386207222938538,
+      "learning_rate": 0.00024554054054054053,
+      "loss": 0.439,
+      "step": 1219
+    },
+    {
+      "epoch": 0.3657485478733371,
+      "grad_norm": 0.23544320464134216,
+      "learning_rate": 0.00024549549549549547,
+      "loss": 0.4414,
+      "step": 1220
+    },
+    {
+      "epoch": 0.36604834176503653,
+      "grad_norm": 0.24809177219867706,
+      "learning_rate": 0.0002454504504504504,
+      "loss": 0.4577,
+      "step": 1221
+    },
+    {
+      "epoch": 0.366348135656736,
+      "grad_norm": 0.24466872215270996,
+      "learning_rate": 0.0002454054054054054,
+      "loss": 0.4609,
+      "step": 1222
+    },
+    {
+      "epoch": 0.36664792954843545,
+      "grad_norm": 0.24159879982471466,
+      "learning_rate": 0.00024536036036036034,
+      "loss": 0.448,
+      "step": 1223
+    },
+    {
+      "epoch": 0.36694772344013493,
+      "grad_norm": 0.2456122189760208,
+      "learning_rate": 0.00024531531531531527,
+      "loss": 0.4563,
+      "step": 1224
+    },
+    {
+      "epoch": 0.36724751733183436,
+      "grad_norm": 0.23266494274139404,
+      "learning_rate": 0.00024527027027027026,
+      "loss": 0.457,
+      "step": 1225
+    },
+    {
+      "epoch": 0.36754731122353385,
+      "grad_norm": 0.24822424352169037,
+      "learning_rate": 0.0002452252252252252,
+      "loss": 0.4577,
+      "step": 1226
+    },
+    {
+      "epoch": 0.3678471051152333,
+      "grad_norm": 0.2528662383556366,
+      "learning_rate": 0.00024518018018018014,
+      "loss": 0.4717,
+      "step": 1227
+    },
+    {
+      "epoch": 0.36814689900693276,
+      "grad_norm": 0.22255463898181915,
+      "learning_rate": 0.00024513513513513513,
+      "loss": 0.4287,
+      "step": 1228
+    },
+    {
+      "epoch": 0.3684466928986322,
+      "grad_norm": 0.23769044876098633,
+      "learning_rate": 0.00024509009009009006,
+      "loss": 0.4348,
+      "step": 1229
+    },
+    {
+      "epoch": 0.3687464867903317,
+      "grad_norm": 0.23163922131061554,
+      "learning_rate": 0.00024504504504504506,
+      "loss": 0.4631,
+      "step": 1230
+    },
+    {
+      "epoch": 0.3690462806820311,
+      "grad_norm": 0.22742880880832672,
+      "learning_rate": 0.000245,
+      "loss": 0.4344,
+      "step": 1231
+    },
+    {
+      "epoch": 0.3693460745737306,
+      "grad_norm": 0.2314460575580597,
+      "learning_rate": 0.00024495495495495493,
+      "loss": 0.4507,
+      "step": 1232
+    },
+    {
+      "epoch": 0.36964586846543,
+      "grad_norm": 0.21651825308799744,
+      "learning_rate": 0.0002449099099099099,
+      "loss": 0.422,
+      "step": 1233
+    },
+    {
+      "epoch": 0.3699456623571295,
+      "grad_norm": 0.24304571747779846,
+      "learning_rate": 0.00024486486486486486,
+      "loss": 0.4322,
+      "step": 1234
+    },
+    {
+      "epoch": 0.37024545624882893,
+      "grad_norm": 0.24105508625507355,
+      "learning_rate": 0.0002448198198198198,
+      "loss": 0.4367,
+      "step": 1235
+    },
+    {
+      "epoch": 0.37054525014052836,
+      "grad_norm": 0.2495047152042389,
+      "learning_rate": 0.0002447747747747748,
+      "loss": 0.456,
+      "step": 1236
+    },
+    {
+      "epoch": 0.37084504403222784,
+      "grad_norm": 0.2545395791530609,
+      "learning_rate": 0.0002447297297297297,
+      "loss": 0.4608,
+      "step": 1237
+    },
+    {
+      "epoch": 0.3711448379239273,
+      "grad_norm": 0.2552499771118164,
+      "learning_rate": 0.00024468468468468466,
+      "loss": 0.4898,
+      "step": 1238
+    },
+    {
+      "epoch": 0.37144463181562676,
+      "grad_norm": 0.2469811737537384,
+      "learning_rate": 0.00024463963963963965,
+      "loss": 0.4427,
+      "step": 1239
+    },
+    {
+      "epoch": 0.3717444257073262,
+      "grad_norm": 0.23857302963733673,
+      "learning_rate": 0.0002445945945945946,
+      "loss": 0.4509,
+      "step": 1240
+    },
+    {
+      "epoch": 0.37204421959902567,
+      "grad_norm": 0.2521422803401947,
+      "learning_rate": 0.0002445495495495495,
+      "loss": 0.4425,
+      "step": 1241
+    },
+    {
+      "epoch": 0.3723440134907251,
+      "grad_norm": 0.24907280504703522,
+      "learning_rate": 0.0002445045045045045,
+      "loss": 0.4542,
+      "step": 1242
+    },
+    {
+      "epoch": 0.3726438073824246,
+      "grad_norm": 0.23783591389656067,
+      "learning_rate": 0.00024445945945945945,
+      "loss": 0.4831,
+      "step": 1243
+    },
+    {
+      "epoch": 0.372943601274124,
+      "grad_norm": 0.2376372069120407,
+      "learning_rate": 0.0002444144144144144,
+      "loss": 0.4514,
+      "step": 1244
+    },
+    {
+      "epoch": 0.3732433951658235,
+      "grad_norm": 0.2387792468070984,
+      "learning_rate": 0.0002443693693693694,
+      "loss": 0.4593,
+      "step": 1245
+    },
+    {
+      "epoch": 0.37354318905752293,
+      "grad_norm": 0.22432541847229004,
+      "learning_rate": 0.0002443243243243243,
+      "loss": 0.4462,
+      "step": 1246
+    },
+    {
+      "epoch": 0.3738429829492224,
+      "grad_norm": 0.24190527200698853,
+      "learning_rate": 0.00024427927927927925,
+      "loss": 0.4645,
+      "step": 1247
+    },
+    {
+      "epoch": 0.37414277684092184,
+      "grad_norm": 0.23738646507263184,
+      "learning_rate": 0.00024423423423423424,
+      "loss": 0.4594,
+      "step": 1248
+    },
+    {
+      "epoch": 0.3744425707326213,
+      "grad_norm": 0.24582220613956451,
+      "learning_rate": 0.0002441891891891892,
+      "loss": 0.4632,
+      "step": 1249
+    },
+    {
+      "epoch": 0.37474236462432076,
+      "grad_norm": 0.22717328369617462,
+      "learning_rate": 0.0002441441441441441,
+      "loss": 0.4372,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37504215851602024,
+      "grad_norm": 0.24414947628974915,
+      "learning_rate": 0.0002440990990990991,
+      "loss": 0.4458,
+      "step": 1251
+    },
+    {
+      "epoch": 0.37534195240771967,
+      "grad_norm": 0.23710165917873383,
+      "learning_rate": 0.00024405405405405404,
+      "loss": 0.4468,
+      "step": 1252
+    },
+    {
+      "epoch": 0.37564174629941915,
+      "grad_norm": 0.25462841987609863,
+      "learning_rate": 0.00024400900900900898,
+      "loss": 0.4616,
+      "step": 1253
+    },
+    {
+      "epoch": 0.3759415401911186,
+      "grad_norm": 0.22636806964874268,
+      "learning_rate": 0.00024396396396396392,
+      "loss": 0.4259,
+      "step": 1254
+    },
+    {
+      "epoch": 0.37624133408281807,
+      "grad_norm": 0.24978788197040558,
+      "learning_rate": 0.0002439189189189189,
+      "loss": 0.464,
+      "step": 1255
+    },
+    {
+      "epoch": 0.3765411279745175,
+      "grad_norm": 0.2312556803226471,
+      "learning_rate": 0.00024387387387387384,
+      "loss": 0.4531,
+      "step": 1256
+    },
+    {
+      "epoch": 0.376840921866217,
+      "grad_norm": 0.22814960777759552,
+      "learning_rate": 0.00024382882882882878,
+      "loss": 0.4608,
+      "step": 1257
+    },
+    {
+      "epoch": 0.3771407157579164,
+      "grad_norm": 0.25135213136672974,
+      "learning_rate": 0.00024378378378378377,
+      "loss": 0.4551,
+      "step": 1258
+    },
+    {
+      "epoch": 0.3774405096496159,
+      "grad_norm": 0.2209721952676773,
+      "learning_rate": 0.0002437387387387387,
+      "loss": 0.4679,
+      "step": 1259
+    },
+    {
+      "epoch": 0.3777403035413153,
+      "grad_norm": 0.2256690412759781,
+      "learning_rate": 0.00024369369369369365,
+      "loss": 0.4388,
+      "step": 1260
+    },
+    {
+      "epoch": 0.3780400974330148,
+      "grad_norm": 0.23604658246040344,
+      "learning_rate": 0.00024364864864864864,
+      "loss": 0.4393,
+      "step": 1261
+    },
+    {
+      "epoch": 0.37833989132471424,
+      "grad_norm": 0.22875599563121796,
+      "learning_rate": 0.00024360360360360357,
+      "loss": 0.4176,
+      "step": 1262
+    },
+    {
+      "epoch": 0.3786396852164137,
+      "grad_norm": 0.2428806722164154,
+      "learning_rate": 0.00024355855855855854,
+      "loss": 0.461,
+      "step": 1263
+    },
+    {
+      "epoch": 0.37893947910811315,
+      "grad_norm": 0.2470446228981018,
+      "learning_rate": 0.0002435135135135135,
+      "loss": 0.462,
+      "step": 1264
+    },
+    {
+      "epoch": 0.37923927299981264,
+      "grad_norm": 0.22954460978507996,
+      "learning_rate": 0.00024346846846846844,
+      "loss": 0.4529,
+      "step": 1265
+    },
+    {
+      "epoch": 0.37953906689151207,
+      "grad_norm": 0.23748372495174408,
+      "learning_rate": 0.0002434234234234234,
+      "loss": 0.4598,
+      "step": 1266
+    },
+    {
+      "epoch": 0.37983886078321155,
+      "grad_norm": 0.26300883293151855,
+      "learning_rate": 0.00024337837837837837,
+      "loss": 0.482,
+      "step": 1267
+    },
+    {
+      "epoch": 0.380138654674911,
+      "grad_norm": 0.2546245753765106,
+      "learning_rate": 0.0002433333333333333,
+      "loss": 0.4607,
+      "step": 1268
+    },
+    {
+      "epoch": 0.38043844856661047,
+      "grad_norm": 0.24974289536476135,
+      "learning_rate": 0.00024328828828828827,
+      "loss": 0.4713,
+      "step": 1269
+    },
+    {
+      "epoch": 0.3807382424583099,
+      "grad_norm": 0.2457670271396637,
+      "learning_rate": 0.00024324324324324323,
+      "loss": 0.4709,
+      "step": 1270
+    },
+    {
+      "epoch": 0.3810380363500094,
+      "grad_norm": 0.2360873520374298,
+      "learning_rate": 0.00024319819819819817,
+      "loss": 0.4653,
+      "step": 1271
+    },
+    {
+      "epoch": 0.3813378302417088,
+      "grad_norm": 0.24448256194591522,
+      "learning_rate": 0.00024315315315315313,
+      "loss": 0.4651,
+      "step": 1272
+    },
+    {
+      "epoch": 0.3816376241334083,
+      "grad_norm": 0.22578337788581848,
+      "learning_rate": 0.0002431081081081081,
+      "loss": 0.4163,
+      "step": 1273
+    },
+    {
+      "epoch": 0.3819374180251077,
+      "grad_norm": 0.24973514676094055,
+      "learning_rate": 0.00024306306306306306,
+      "loss": 0.4638,
+      "step": 1274
+    },
+    {
+      "epoch": 0.3822372119168072,
+      "grad_norm": 0.21938931941986084,
+      "learning_rate": 0.000243018018018018,
+      "loss": 0.4325,
+      "step": 1275
+    },
+    {
+      "epoch": 0.38253700580850664,
+      "grad_norm": 0.23947425186634064,
+      "learning_rate": 0.00024297297297297296,
+      "loss": 0.4408,
+      "step": 1276
+    },
+    {
+      "epoch": 0.3828367997002061,
+      "grad_norm": 0.23008406162261963,
+      "learning_rate": 0.00024292792792792792,
+      "loss": 0.4377,
+      "step": 1277
+    },
+    {
+      "epoch": 0.38313659359190555,
+      "grad_norm": 0.24068063497543335,
+      "learning_rate": 0.00024288288288288286,
+      "loss": 0.4606,
+      "step": 1278
+    },
+    {
+      "epoch": 0.38343638748360503,
+      "grad_norm": 0.2432139664888382,
+      "learning_rate": 0.0002428378378378378,
+      "loss": 0.4494,
+      "step": 1279
+    },
+    {
+      "epoch": 0.38373618137530446,
+      "grad_norm": 0.22731392085552216,
+      "learning_rate": 0.0002427927927927928,
+      "loss": 0.426,
+      "step": 1280
+    },
+    {
+      "epoch": 0.38403597526700395,
+      "grad_norm": 0.2352358102798462,
+      "learning_rate": 0.00024274774774774772,
+      "loss": 0.4537,
+      "step": 1281
+    },
+    {
+      "epoch": 0.3843357691587034,
+      "grad_norm": 0.23868781328201294,
+      "learning_rate": 0.00024270270270270266,
+      "loss": 0.4641,
+      "step": 1282
+    },
+    {
+      "epoch": 0.38463556305040286,
+      "grad_norm": 0.23498302698135376,
+      "learning_rate": 0.00024265765765765765,
+      "loss": 0.4563,
+      "step": 1283
+    },
+    {
+      "epoch": 0.3849353569421023,
+      "grad_norm": 0.24769316613674164,
+      "learning_rate": 0.0002426126126126126,
+      "loss": 0.4567,
+      "step": 1284
+    },
+    {
+      "epoch": 0.3852351508338018,
+      "grad_norm": 0.21658611297607422,
+      "learning_rate": 0.00024256756756756753,
+      "loss": 0.4437,
+      "step": 1285
+    },
+    {
+      "epoch": 0.3855349447255012,
+      "grad_norm": 0.2677985727787018,
+      "learning_rate": 0.00024252252252252252,
+      "loss": 0.45,
+      "step": 1286
+    },
+    {
+      "epoch": 0.3858347386172007,
+      "grad_norm": 0.23147153854370117,
+      "learning_rate": 0.00024247747747747745,
+      "loss": 0.4341,
+      "step": 1287
+    },
+    {
+      "epoch": 0.3861345325089001,
+      "grad_norm": 0.2465144395828247,
+      "learning_rate": 0.0002424324324324324,
+      "loss": 0.4629,
+      "step": 1288
+    },
+    {
+      "epoch": 0.3864343264005996,
+      "grad_norm": 0.23633845150470734,
+      "learning_rate": 0.00024238738738738738,
+      "loss": 0.4484,
+      "step": 1289
+    },
+    {
+      "epoch": 0.38673412029229903,
+      "grad_norm": 0.22743773460388184,
+      "learning_rate": 0.00024234234234234232,
+      "loss": 0.4451,
+      "step": 1290
+    },
+    {
+      "epoch": 0.3870339141839985,
+      "grad_norm": 0.233259379863739,
+      "learning_rate": 0.00024229729729729726,
+      "loss": 0.4546,
+      "step": 1291
+    },
+    {
+      "epoch": 0.38733370807569795,
+      "grad_norm": 0.24213840067386627,
+      "learning_rate": 0.00024225225225225225,
+      "loss": 0.4378,
+      "step": 1292
+    },
+    {
+      "epoch": 0.38763350196739743,
+      "grad_norm": 0.23812246322631836,
+      "learning_rate": 0.00024220720720720718,
+      "loss": 0.4613,
+      "step": 1293
+    },
+    {
+      "epoch": 0.38793329585909686,
+      "grad_norm": 0.25436896085739136,
+      "learning_rate": 0.00024216216216216212,
+      "loss": 0.4462,
+      "step": 1294
+    },
+    {
+      "epoch": 0.38823308975079635,
+      "grad_norm": 0.24321508407592773,
+      "learning_rate": 0.0002421171171171171,
+      "loss": 0.4674,
+      "step": 1295
+    },
+    {
+      "epoch": 0.3885328836424958,
+      "grad_norm": 0.2434927523136139,
+      "learning_rate": 0.00024207207207207205,
+      "loss": 0.4317,
+      "step": 1296
+    },
+    {
+      "epoch": 0.38883267753419526,
+      "grad_norm": 0.2564734220504761,
+      "learning_rate": 0.000242027027027027,
+      "loss": 0.4556,
+      "step": 1297
+    },
+    {
+      "epoch": 0.3891324714258947,
+      "grad_norm": 0.26102596521377563,
+      "learning_rate": 0.00024198198198198198,
+      "loss": 0.4866,
+      "step": 1298
+    },
+    {
+      "epoch": 0.3894322653175942,
+      "grad_norm": 0.26838192343711853,
+      "learning_rate": 0.0002419369369369369,
+      "loss": 0.4824,
+      "step": 1299
+    },
+    {
+      "epoch": 0.3897320592092936,
+      "grad_norm": 0.24552933871746063,
+      "learning_rate": 0.00024189189189189188,
+      "loss": 0.4581,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3900318531009931,
+      "grad_norm": 0.2453169822692871,
+      "learning_rate": 0.0002418468468468468,
+      "loss": 0.4605,
+      "step": 1301
+    },
+    {
+      "epoch": 0.3903316469926925,
+      "grad_norm": 0.24922487139701843,
+      "learning_rate": 0.00024180180180180178,
+      "loss": 0.4555,
+      "step": 1302
+    },
+    {
+      "epoch": 0.390631440884392,
+      "grad_norm": 0.2683355212211609,
+      "learning_rate": 0.00024175675675675674,
+      "loss": 0.4674,
+      "step": 1303
+    },
+    {
+      "epoch": 0.39093123477609143,
+      "grad_norm": 0.23587100207805634,
+      "learning_rate": 0.00024171171171171168,
+      "loss": 0.4564,
+      "step": 1304
+    },
+    {
+      "epoch": 0.3912310286677909,
+      "grad_norm": 0.2331109195947647,
+      "learning_rate": 0.00024166666666666664,
+      "loss": 0.4499,
+      "step": 1305
+    },
+    {
+      "epoch": 0.39153082255949034,
+      "grad_norm": 0.24744129180908203,
+      "learning_rate": 0.0002416216216216216,
+      "loss": 0.453,
+      "step": 1306
+    },
+    {
+      "epoch": 0.39183061645118983,
+      "grad_norm": 0.228163942694664,
+      "learning_rate": 0.00024157657657657654,
+      "loss": 0.4219,
+      "step": 1307
+    },
+    {
+      "epoch": 0.39213041034288926,
+      "grad_norm": 0.2409953773021698,
+      "learning_rate": 0.00024153153153153153,
+      "loss": 0.4322,
+      "step": 1308
+    },
+    {
+      "epoch": 0.39243020423458874,
+      "grad_norm": 0.22519168257713318,
+      "learning_rate": 0.00024148648648648647,
+      "loss": 0.4177,
+      "step": 1309
+    },
+    {
+      "epoch": 0.39272999812628817,
+      "grad_norm": 0.2466001957654953,
+      "learning_rate": 0.0002414414414414414,
+      "loss": 0.4747,
+      "step": 1310
+    },
+    {
+      "epoch": 0.39302979201798766,
+      "grad_norm": 0.247292622923851,
+      "learning_rate": 0.0002413963963963964,
+      "loss": 0.4663,
+      "step": 1311
+    },
+    {
+      "epoch": 0.3933295859096871,
+      "grad_norm": 0.23905499279499054,
+      "learning_rate": 0.00024135135135135133,
+      "loss": 0.4631,
+      "step": 1312
+    },
+    {
+      "epoch": 0.39362937980138657,
+      "grad_norm": 0.27826932072639465,
+      "learning_rate": 0.00024130630630630627,
+      "loss": 0.4863,
+      "step": 1313
+    },
+    {
+      "epoch": 0.393929173693086,
+      "grad_norm": 0.3269807994365692,
+      "learning_rate": 0.00024126126126126126,
+      "loss": 0.4751,
+      "step": 1314
+    },
+    {
+      "epoch": 0.3942289675847855,
+      "grad_norm": 0.2593337595462799,
+      "learning_rate": 0.0002412162162162162,
+      "loss": 0.4754,
+      "step": 1315
+    },
+    {
+      "epoch": 0.3945287614764849,
+      "grad_norm": 0.24511882662773132,
+      "learning_rate": 0.00024117117117117114,
+      "loss": 0.4704,
+      "step": 1316
+    },
+    {
+      "epoch": 0.3948285553681844,
+      "grad_norm": 0.2439701408147812,
+      "learning_rate": 0.00024112612612612613,
+      "loss": 0.4367,
+      "step": 1317
+    },
+    {
+      "epoch": 0.3951283492598838,
+      "grad_norm": 0.24651208519935608,
+      "learning_rate": 0.00024108108108108106,
+      "loss": 0.4635,
+      "step": 1318
+    },
+    {
+      "epoch": 0.3954281431515833,
+      "grad_norm": 0.23849599063396454,
+      "learning_rate": 0.000241036036036036,
+      "loss": 0.4377,
+      "step": 1319
+    },
+    {
+      "epoch": 0.39572793704328274,
+      "grad_norm": 0.23733258247375488,
+      "learning_rate": 0.000240990990990991,
+      "loss": 0.4401,
+      "step": 1320
+    },
+    {
+      "epoch": 0.3960277309349822,
+      "grad_norm": 0.2643173336982727,
+      "learning_rate": 0.00024094594594594593,
+      "loss": 0.4572,
+      "step": 1321
+    },
+    {
+      "epoch": 0.39632752482668165,
+      "grad_norm": 0.23576226830482483,
+      "learning_rate": 0.00024090090090090086,
+      "loss": 0.4693,
+      "step": 1322
+    },
+    {
+      "epoch": 0.39662731871838114,
+      "grad_norm": 0.2418884038925171,
+      "learning_rate": 0.00024085585585585586,
+      "loss": 0.4453,
+      "step": 1323
+    },
+    {
+      "epoch": 0.39692711261008057,
+      "grad_norm": 0.23336432874202728,
+      "learning_rate": 0.0002408108108108108,
+      "loss": 0.4663,
+      "step": 1324
+    },
+    {
+      "epoch": 0.39722690650178005,
+      "grad_norm": 0.2603462040424347,
+      "learning_rate": 0.00024076576576576573,
+      "loss": 0.4643,
+      "step": 1325
+    },
+    {
+      "epoch": 0.3975267003934795,
+      "grad_norm": 0.24288874864578247,
+      "learning_rate": 0.0002407207207207207,
+      "loss": 0.4442,
+      "step": 1326
+    },
+    {
+      "epoch": 0.39782649428517897,
+      "grad_norm": 0.24161702394485474,
+      "learning_rate": 0.00024067567567567566,
+      "loss": 0.4773,
+      "step": 1327
+    },
+    {
+      "epoch": 0.3981262881768784,
+      "grad_norm": 0.23539233207702637,
+      "learning_rate": 0.0002406306306306306,
+      "loss": 0.4416,
+      "step": 1328
+    },
+    {
+      "epoch": 0.3984260820685779,
+      "grad_norm": 0.26161664724349976,
+      "learning_rate": 0.00024058558558558556,
+      "loss": 0.4672,
+      "step": 1329
+    },
+    {
+      "epoch": 0.3987258759602773,
+      "grad_norm": 0.24159055948257446,
+      "learning_rate": 0.00024054054054054052,
+      "loss": 0.4563,
+      "step": 1330
+    },
+    {
+      "epoch": 0.39902566985197674,
+      "grad_norm": 0.24944022297859192,
+      "learning_rate": 0.00024049549549549548,
+      "loss": 0.4554,
+      "step": 1331
+    },
+    {
+      "epoch": 0.3993254637436762,
+      "grad_norm": 0.24242587387561798,
+      "learning_rate": 0.00024045045045045042,
+      "loss": 0.4343,
+      "step": 1332
+    },
+    {
+      "epoch": 0.39962525763537565,
+      "grad_norm": 0.2406960427761078,
+      "learning_rate": 0.00024040540540540539,
+      "loss": 0.4469,
+      "step": 1333
+    },
+    {
+      "epoch": 0.39992505152707514,
+      "grad_norm": 0.24025918543338776,
+      "learning_rate": 0.00024036036036036035,
+      "loss": 0.4056,
+      "step": 1334
+    },
+    {
+      "epoch": 0.40022484541877457,
+      "grad_norm": 0.267782062292099,
+      "learning_rate": 0.00024031531531531529,
+      "loss": 0.4319,
+      "step": 1335
+    },
+    {
+      "epoch": 0.40052463931047405,
+      "grad_norm": 0.2527433931827545,
+      "learning_rate": 0.00024027027027027025,
+      "loss": 0.4679,
+      "step": 1336
+    },
+    {
+      "epoch": 0.4008244332021735,
+      "grad_norm": 0.2520921528339386,
+      "learning_rate": 0.00024022522522522521,
+      "loss": 0.4802,
+      "step": 1337
+    },
+    {
+      "epoch": 0.40112422709387296,
+      "grad_norm": 0.24161456525325775,
+      "learning_rate": 0.00024018018018018015,
+      "loss": 0.4415,
+      "step": 1338
+    },
+    {
+      "epoch": 0.4014240209855724,
+      "grad_norm": 0.24513588845729828,
+      "learning_rate": 0.00024013513513513511,
+      "loss": 0.445,
+      "step": 1339
+    },
+    {
+      "epoch": 0.4017238148772719,
+      "grad_norm": 0.24400116503238678,
+      "learning_rate": 0.00024009009009009008,
+      "loss": 0.4283,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4020236087689713,
+      "grad_norm": 0.24796655774116516,
+      "learning_rate": 0.00024004504504504502,
+      "loss": 0.4434,
+      "step": 1341
+    },
+    {
+      "epoch": 0.4023234026606708,
+      "grad_norm": 0.2471655309200287,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.4378,
+      "step": 1342
+    },
+    {
+      "epoch": 0.4026231965523702,
+      "grad_norm": 0.2507822811603546,
+      "learning_rate": 0.00023995495495495494,
+      "loss": 0.4623,
+      "step": 1343
+    },
+    {
+      "epoch": 0.4029229904440697,
+      "grad_norm": 0.24304543435573578,
+      "learning_rate": 0.00023990990990990988,
+      "loss": 0.4355,
+      "step": 1344
+    },
+    {
+      "epoch": 0.40322278433576914,
+      "grad_norm": 0.240738183259964,
+      "learning_rate": 0.00023986486486486487,
+      "loss": 0.4454,
+      "step": 1345
+    },
+    {
+      "epoch": 0.4035225782274686,
+      "grad_norm": 0.2353314906358719,
+      "learning_rate": 0.0002398198198198198,
+      "loss": 0.4493,
+      "step": 1346
+    },
+    {
+      "epoch": 0.40382237211916805,
+      "grad_norm": 0.24467633664608002,
+      "learning_rate": 0.00023977477477477474,
+      "loss": 0.4771,
+      "step": 1347
+    },
+    {
+      "epoch": 0.40412216601086753,
+      "grad_norm": 0.22876618802547455,
+      "learning_rate": 0.00023972972972972974,
+      "loss": 0.4311,
+      "step": 1348
+    },
+    {
+      "epoch": 0.40442195990256696,
+      "grad_norm": 0.24602358043193817,
+      "learning_rate": 0.00023968468468468467,
+      "loss": 0.4676,
+      "step": 1349
+    },
+    {
+      "epoch": 0.40472175379426645,
+      "grad_norm": 0.2410927563905716,
+      "learning_rate": 0.0002396396396396396,
+      "loss": 0.4564,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4050215476859659,
+      "grad_norm": 0.22765080630779266,
+      "learning_rate": 0.00023959459459459455,
+      "loss": 0.447,
+      "step": 1351
+    },
+    {
+      "epoch": 0.40532134157766536,
+      "grad_norm": 0.24429920315742493,
+      "learning_rate": 0.00023954954954954954,
+      "loss": 0.4734,
+      "step": 1352
+    },
+    {
+      "epoch": 0.4056211354693648,
+      "grad_norm": 0.2417088747024536,
+      "learning_rate": 0.00023950450450450447,
+      "loss": 0.4465,
+      "step": 1353
+    },
+    {
+      "epoch": 0.4059209293610643,
+      "grad_norm": 0.25090181827545166,
+      "learning_rate": 0.00023945945945945944,
+      "loss": 0.4754,
+      "step": 1354
+    },
+    {
+      "epoch": 0.4062207232527637,
+      "grad_norm": 0.255610853433609,
+      "learning_rate": 0.0002394144144144144,
+      "loss": 0.4891,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4065205171444632,
+      "grad_norm": 0.22734206914901733,
+      "learning_rate": 0.00023936936936936934,
+      "loss": 0.4279,
+      "step": 1356
+    },
+    {
+      "epoch": 0.4068203110361626,
+      "grad_norm": 0.24400131404399872,
+      "learning_rate": 0.0002393243243243243,
+      "loss": 0.4493,
+      "step": 1357
+    },
+    {
+      "epoch": 0.4071201049278621,
+      "grad_norm": 0.2442186176776886,
+      "learning_rate": 0.00023927927927927927,
+      "loss": 0.4347,
+      "step": 1358
+    },
+    {
+      "epoch": 0.40741989881956153,
+      "grad_norm": 0.2278696894645691,
+      "learning_rate": 0.0002392342342342342,
+      "loss": 0.4346,
+      "step": 1359
+    },
+    {
+      "epoch": 0.407719692711261,
+      "grad_norm": 0.23576432466506958,
+      "learning_rate": 0.00023918918918918917,
+      "loss": 0.4279,
+      "step": 1360
+    },
+    {
+      "epoch": 0.40801948660296045,
+      "grad_norm": 0.25753775238990784,
+      "learning_rate": 0.00023914414414414413,
+      "loss": 0.465,
+      "step": 1361
+    },
+    {
+      "epoch": 0.40831928049465993,
+      "grad_norm": 0.232134148478508,
+      "learning_rate": 0.00023909909909909907,
+      "loss": 0.4467,
+      "step": 1362
+    },
+    {
+      "epoch": 0.40861907438635936,
+      "grad_norm": 0.25058963894844055,
+      "learning_rate": 0.00023905405405405403,
+      "loss": 0.4496,
+      "step": 1363
+    },
+    {
+      "epoch": 0.40891886827805884,
+      "grad_norm": 0.24595490097999573,
+      "learning_rate": 0.000239009009009009,
+      "loss": 0.4625,
+      "step": 1364
+    },
+    {
+      "epoch": 0.4092186621697583,
+      "grad_norm": 0.2425229847431183,
+      "learning_rate": 0.00023896396396396393,
+      "loss": 0.4757,
+      "step": 1365
+    },
+    {
+      "epoch": 0.40951845606145776,
+      "grad_norm": 0.26115790009498596,
+      "learning_rate": 0.0002389189189189189,
+      "loss": 0.4619,
+      "step": 1366
+    },
+    {
+      "epoch": 0.4098182499531572,
+      "grad_norm": 0.22355914115905762,
+      "learning_rate": 0.00023887387387387386,
+      "loss": 0.4234,
+      "step": 1367
+    },
+    {
+      "epoch": 0.4101180438448567,
+      "grad_norm": 0.22979210317134857,
+      "learning_rate": 0.00023882882882882882,
+      "loss": 0.4472,
+      "step": 1368
+    },
+    {
+      "epoch": 0.4104178377365561,
+      "grad_norm": 0.2713291645050049,
+      "learning_rate": 0.00023878378378378376,
+      "loss": 0.4819,
+      "step": 1369
+    },
+    {
+      "epoch": 0.4107176316282556,
+      "grad_norm": 0.22952406108379364,
+      "learning_rate": 0.00023873873873873872,
+      "loss": 0.4428,
+      "step": 1370
+    },
+    {
+      "epoch": 0.411017425519955,
+      "grad_norm": 0.24338556826114655,
+      "learning_rate": 0.0002386936936936937,
+      "loss": 0.4703,
+      "step": 1371
+    },
+    {
+      "epoch": 0.4113172194116545,
+      "grad_norm": 0.24610304832458496,
+      "learning_rate": 0.00023864864864864862,
+      "loss": 0.4583,
+      "step": 1372
+    },
+    {
+      "epoch": 0.41161701330335393,
+      "grad_norm": 0.23116329312324524,
+      "learning_rate": 0.0002386036036036036,
+      "loss": 0.4234,
+      "step": 1373
+    },
+    {
+      "epoch": 0.4119168071950534,
+      "grad_norm": 0.24718856811523438,
+      "learning_rate": 0.00023855855855855855,
+      "loss": 0.4507,
+      "step": 1374
+    },
+    {
+      "epoch": 0.41221660108675284,
+      "grad_norm": 0.25558093190193176,
+      "learning_rate": 0.0002385135135135135,
+      "loss": 0.4609,
+      "step": 1375
+    },
+    {
+      "epoch": 0.4125163949784523,
+      "grad_norm": 0.2391168475151062,
+      "learning_rate": 0.00023846846846846843,
+      "loss": 0.4334,
+      "step": 1376
+    },
+    {
+      "epoch": 0.41281618887015176,
+      "grad_norm": 0.242578387260437,
+      "learning_rate": 0.00023842342342342342,
+      "loss": 0.472,
+      "step": 1377
+    },
+    {
+      "epoch": 0.41311598276185124,
+      "grad_norm": 0.24465034902095795,
+      "learning_rate": 0.00023837837837837835,
+      "loss": 0.4584,
+      "step": 1378
+    },
+    {
+      "epoch": 0.41341577665355067,
+      "grad_norm": 0.2500922679901123,
+      "learning_rate": 0.0002383333333333333,
+      "loss": 0.468,
+      "step": 1379
+    },
+    {
+      "epoch": 0.41371557054525016,
+      "grad_norm": 0.23939989507198334,
+      "learning_rate": 0.00023828828828828828,
+      "loss": 0.443,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4140153644369496,
+      "grad_norm": 0.272876113653183,
+      "learning_rate": 0.00023824324324324322,
+      "loss": 0.4665,
+      "step": 1381
+    },
+    {
+      "epoch": 0.41431515832864907,
+      "grad_norm": 0.2664034068584442,
+      "learning_rate": 0.00023819819819819815,
+      "loss": 0.4855,
+      "step": 1382
+    },
+    {
+      "epoch": 0.4146149522203485,
+      "grad_norm": 0.23809301853179932,
+      "learning_rate": 0.00023815315315315315,
+      "loss": 0.4483,
+      "step": 1383
+    },
+    {
+      "epoch": 0.414914746112048,
+      "grad_norm": 0.23019112646579742,
+      "learning_rate": 0.00023810810810810808,
+      "loss": 0.4324,
+      "step": 1384
+    },
+    {
+      "epoch": 0.4152145400037474,
+      "grad_norm": 0.25093144178390503,
+      "learning_rate": 0.00023806306306306302,
+      "loss": 0.4732,
+      "step": 1385
+    },
+    {
+      "epoch": 0.4155143338954469,
+      "grad_norm": 0.23062798380851746,
+      "learning_rate": 0.000238018018018018,
+      "loss": 0.4489,
+      "step": 1386
+    },
+    {
+      "epoch": 0.4158141277871463,
+      "grad_norm": 0.23424816131591797,
+      "learning_rate": 0.00023797297297297295,
+      "loss": 0.4586,
+      "step": 1387
+    },
+    {
+      "epoch": 0.4161139216788458,
+      "grad_norm": 0.23515811562538147,
+      "learning_rate": 0.0002379279279279279,
+      "loss": 0.4478,
+      "step": 1388
+    },
+    {
+      "epoch": 0.41641371557054524,
+      "grad_norm": 0.23988017439842224,
+      "learning_rate": 0.00023788288288288287,
+      "loss": 0.4382,
+      "step": 1389
+    },
+    {
+      "epoch": 0.4167135094622447,
+      "grad_norm": 0.23148085176944733,
+      "learning_rate": 0.0002378378378378378,
+      "loss": 0.4476,
+      "step": 1390
+    },
+    {
+      "epoch": 0.41701330335394415,
+      "grad_norm": 0.22274036705493927,
+      "learning_rate": 0.00023779279279279278,
+      "loss": 0.4356,
+      "step": 1391
+    },
+    {
+      "epoch": 0.41731309724564364,
+      "grad_norm": 0.22446554899215698,
+      "learning_rate": 0.00023774774774774774,
+      "loss": 0.438,
+      "step": 1392
+    },
+    {
+      "epoch": 0.41761289113734307,
+      "grad_norm": 0.2218816876411438,
+      "learning_rate": 0.00023770270270270268,
+      "loss": 0.4137,
+      "step": 1393
+    },
+    {
+      "epoch": 0.41791268502904255,
+      "grad_norm": 0.23347264528274536,
+      "learning_rate": 0.00023765765765765764,
+      "loss": 0.4361,
+      "step": 1394
+    },
+    {
+      "epoch": 0.418212478920742,
+      "grad_norm": 0.26775455474853516,
+      "learning_rate": 0.0002376126126126126,
+      "loss": 0.5153,
+      "step": 1395
+    },
+    {
+      "epoch": 0.41851227281244147,
+      "grad_norm": 0.2503002882003784,
+      "learning_rate": 0.00023756756756756754,
+      "loss": 0.4705,
+      "step": 1396
+    },
+    {
+      "epoch": 0.4188120667041409,
+      "grad_norm": 0.2259148210287094,
+      "learning_rate": 0.0002375225225225225,
+      "loss": 0.4251,
+      "step": 1397
+    },
+    {
+      "epoch": 0.4191118605958404,
+      "grad_norm": 0.24358846247196198,
+      "learning_rate": 0.00023747747747747744,
+      "loss": 0.4483,
+      "step": 1398
+    },
+    {
+      "epoch": 0.4194116544875398,
+      "grad_norm": 0.24675339460372925,
+      "learning_rate": 0.0002374324324324324,
+      "loss": 0.4379,
+      "step": 1399
+    },
+    {
+      "epoch": 0.4197114483792393,
+      "grad_norm": 0.24181531369686127,
+      "learning_rate": 0.00023738738738738737,
+      "loss": 0.4498,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4200112422709387,
+      "grad_norm": 0.24917852878570557,
+      "learning_rate": 0.0002373423423423423,
+      "loss": 0.4436,
+      "step": 1401
+    },
+    {
+      "epoch": 0.4203110361626382,
+      "grad_norm": 0.24392594397068024,
+      "learning_rate": 0.0002372972972972973,
+      "loss": 0.4791,
+      "step": 1402
+    },
+    {
+      "epoch": 0.42061083005433764,
+      "grad_norm": 0.2411157488822937,
+      "learning_rate": 0.00023725225225225223,
+      "loss": 0.472,
+      "step": 1403
+    },
+    {
+      "epoch": 0.4209106239460371,
+      "grad_norm": 0.2558772563934326,
+      "learning_rate": 0.00023720720720720717,
+      "loss": 0.4914,
+      "step": 1404
+    },
+    {
+      "epoch": 0.42121041783773655,
+      "grad_norm": 0.2580840587615967,
+      "learning_rate": 0.00023716216216216216,
+      "loss": 0.4502,
+      "step": 1405
+    },
+    {
+      "epoch": 0.42151021172943604,
+      "grad_norm": 0.23793622851371765,
+      "learning_rate": 0.0002371171171171171,
+      "loss": 0.4599,
+      "step": 1406
+    },
+    {
+      "epoch": 0.42181000562113546,
+      "grad_norm": 0.23233507573604584,
+      "learning_rate": 0.00023707207207207203,
+      "loss": 0.4268,
+      "step": 1407
+    },
+    {
+      "epoch": 0.42210979951283495,
+      "grad_norm": 0.24053210020065308,
+      "learning_rate": 0.00023702702702702703,
+      "loss": 0.4233,
+      "step": 1408
+    },
+    {
+      "epoch": 0.4224095934045344,
+      "grad_norm": 0.2314370572566986,
+      "learning_rate": 0.00023698198198198196,
+      "loss": 0.427,
+      "step": 1409
+    },
+    {
+      "epoch": 0.42270938729623386,
+      "grad_norm": 0.2242741882801056,
+      "learning_rate": 0.0002369369369369369,
+      "loss": 0.4097,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4230091811879333,
+      "grad_norm": 0.24178822338581085,
+      "learning_rate": 0.0002368918918918919,
+      "loss": 0.4259,
+      "step": 1411
+    },
+    {
+      "epoch": 0.4233089750796328,
+      "grad_norm": 0.23346510529518127,
+      "learning_rate": 0.00023684684684684683,
+      "loss": 0.4442,
+      "step": 1412
+    },
+    {
+      "epoch": 0.4236087689713322,
+      "grad_norm": 0.23908735811710358,
+      "learning_rate": 0.00023680180180180176,
+      "loss": 0.432,
+      "step": 1413
+    },
+    {
+      "epoch": 0.4239085628630317,
+      "grad_norm": 0.24085824191570282,
+      "learning_rate": 0.00023675675675675675,
+      "loss": 0.4648,
+      "step": 1414
+    },
+    {
+      "epoch": 0.4242083567547311,
+      "grad_norm": 0.2325652688741684,
+      "learning_rate": 0.0002367117117117117,
+      "loss": 0.4499,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4245081506464306,
+      "grad_norm": 0.2413867712020874,
+      "learning_rate": 0.00023666666666666663,
+      "loss": 0.4407,
+      "step": 1416
+    },
+    {
+      "epoch": 0.42480794453813003,
+      "grad_norm": 0.2494458556175232,
+      "learning_rate": 0.00023662162162162162,
+      "loss": 0.4828,
+      "step": 1417
+    },
+    {
+      "epoch": 0.4251077384298295,
+      "grad_norm": 0.24031957983970642,
+      "learning_rate": 0.00023657657657657656,
+      "loss": 0.4453,
+      "step": 1418
+    },
+    {
+      "epoch": 0.42540753232152895,
+      "grad_norm": 0.2490081638097763,
+      "learning_rate": 0.0002365315315315315,
+      "loss": 0.4595,
+      "step": 1419
+    },
+    {
+      "epoch": 0.42570732621322843,
+      "grad_norm": 0.2513922452926636,
+      "learning_rate": 0.00023648648648648648,
+      "loss": 0.4507,
+      "step": 1420
+    },
+    {
+      "epoch": 0.42600712010492786,
+      "grad_norm": 0.23692888021469116,
+      "learning_rate": 0.00023644144144144142,
+      "loss": 0.4339,
+      "step": 1421
+    },
+    {
+      "epoch": 0.42630691399662735,
+      "grad_norm": 0.22867028415203094,
+      "learning_rate": 0.00023639639639639636,
+      "loss": 0.437,
+      "step": 1422
+    },
+    {
+      "epoch": 0.4266067078883268,
+      "grad_norm": 0.23194801807403564,
+      "learning_rate": 0.00023635135135135132,
+      "loss": 0.4558,
+      "step": 1423
+    },
+    {
+      "epoch": 0.42690650178002626,
+      "grad_norm": 0.23193570971488953,
+      "learning_rate": 0.00023630630630630628,
+      "loss": 0.4446,
+      "step": 1424
+    },
+    {
+      "epoch": 0.4272062956717257,
+      "grad_norm": 0.2337258905172348,
+      "learning_rate": 0.00023626126126126125,
+      "loss": 0.4337,
+      "step": 1425
+    },
+    {
+      "epoch": 0.4275060895634251,
+      "grad_norm": 0.23658838868141174,
+      "learning_rate": 0.00023621621621621619,
+      "loss": 0.4288,
+      "step": 1426
+    },
+    {
+      "epoch": 0.4278058834551246,
+      "grad_norm": 0.26249489188194275,
+      "learning_rate": 0.00023617117117117115,
+      "loss": 0.4673,
+      "step": 1427
+    },
+    {
+      "epoch": 0.42810567734682403,
+      "grad_norm": 0.2465837299823761,
+      "learning_rate": 0.0002361261261261261,
+      "loss": 0.4357,
+      "step": 1428
+    },
+    {
+      "epoch": 0.4284054712385235,
+      "grad_norm": 0.26788344979286194,
+      "learning_rate": 0.00023608108108108105,
+      "loss": 0.4465,
+      "step": 1429
+    },
+    {
+      "epoch": 0.42870526513022295,
+      "grad_norm": 0.2597041726112366,
+      "learning_rate": 0.00023603603603603601,
+      "loss": 0.4433,
+      "step": 1430
+    },
+    {
+      "epoch": 0.42900505902192243,
+      "grad_norm": 0.235942080616951,
+      "learning_rate": 0.00023599099099099098,
+      "loss": 0.4322,
+      "step": 1431
+    },
+    {
+      "epoch": 0.42930485291362186,
+      "grad_norm": 0.25687772035598755,
+      "learning_rate": 0.00023594594594594591,
+      "loss": 0.478,
+      "step": 1432
+    },
+    {
+      "epoch": 0.42960464680532134,
+      "grad_norm": 0.2209557294845581,
+      "learning_rate": 0.00023590090090090088,
+      "loss": 0.4254,
+      "step": 1433
+    },
+    {
+      "epoch": 0.4299044406970208,
+      "grad_norm": 0.2533595860004425,
+      "learning_rate": 0.00023585585585585584,
+      "loss": 0.4545,
+      "step": 1434
+    },
+    {
+      "epoch": 0.43020423458872026,
+      "grad_norm": 0.2461264431476593,
+      "learning_rate": 0.00023581081081081078,
+      "loss": 0.4642,
+      "step": 1435
+    },
+    {
+      "epoch": 0.4305040284804197,
+      "grad_norm": 0.23638790845870972,
+      "learning_rate": 0.00023576576576576577,
+      "loss": 0.4403,
+      "step": 1436
+    },
+    {
+      "epoch": 0.43080382237211917,
+      "grad_norm": 0.23931315541267395,
+      "learning_rate": 0.0002357207207207207,
+      "loss": 0.4356,
+      "step": 1437
+    },
+    {
+      "epoch": 0.4311036162638186,
+      "grad_norm": 0.26020053029060364,
+      "learning_rate": 0.00023567567567567564,
+      "loss": 0.4562,
+      "step": 1438
+    },
+    {
+      "epoch": 0.4314034101555181,
+      "grad_norm": 0.25653496384620667,
+      "learning_rate": 0.00023563063063063063,
+      "loss": 0.473,
+      "step": 1439
+    },
+    {
+      "epoch": 0.4317032040472175,
+      "grad_norm": 0.23115600645542145,
+      "learning_rate": 0.00023558558558558557,
+      "loss": 0.4259,
+      "step": 1440
+    },
+    {
+      "epoch": 0.432002997938917,
+      "grad_norm": 0.240982785820961,
+      "learning_rate": 0.0002355405405405405,
+      "loss": 0.4507,
+      "step": 1441
+    },
+    {
+      "epoch": 0.43230279183061643,
+      "grad_norm": 0.2819494307041168,
+      "learning_rate": 0.0002354954954954955,
+      "loss": 0.4539,
+      "step": 1442
+    },
+    {
+      "epoch": 0.4326025857223159,
+      "grad_norm": 0.24356286227703094,
+      "learning_rate": 0.00023545045045045044,
+      "loss": 0.4377,
+      "step": 1443
+    },
+    {
+      "epoch": 0.43290237961401534,
+      "grad_norm": 0.23919035494327545,
+      "learning_rate": 0.00023540540540540537,
+      "loss": 0.4141,
+      "step": 1444
+    },
+    {
+      "epoch": 0.4332021735057148,
+      "grad_norm": 0.27333101630210876,
+      "learning_rate": 0.00023536036036036036,
+      "loss": 0.4655,
+      "step": 1445
+    },
+    {
+      "epoch": 0.43350196739741426,
+      "grad_norm": 0.24128217995166779,
+      "learning_rate": 0.0002353153153153153,
+      "loss": 0.4483,
+      "step": 1446
+    },
+    {
+      "epoch": 0.43380176128911374,
+      "grad_norm": 0.2448810636997223,
+      "learning_rate": 0.00023527027027027024,
+      "loss": 0.4376,
+      "step": 1447
+    },
+    {
+      "epoch": 0.43410155518081317,
+      "grad_norm": 0.24084526300430298,
+      "learning_rate": 0.0002352252252252252,
+      "loss": 0.4228,
+      "step": 1448
+    },
+    {
+      "epoch": 0.43440134907251265,
+      "grad_norm": 0.2719487249851227,
+      "learning_rate": 0.00023518018018018016,
+      "loss": 0.4564,
+      "step": 1449
+    },
+    {
+      "epoch": 0.4347011429642121,
+      "grad_norm": 0.23745928704738617,
+      "learning_rate": 0.0002351351351351351,
+      "loss": 0.4283,
+      "step": 1450
+    },
+    {
+      "epoch": 0.43500093685591157,
+      "grad_norm": 0.22788289189338684,
+      "learning_rate": 0.00023509009009009007,
+      "loss": 0.4215,
+      "step": 1451
+    },
+    {
+      "epoch": 0.435300730747611,
+      "grad_norm": 0.2702344059944153,
+      "learning_rate": 0.00023504504504504503,
+      "loss": 0.4425,
+      "step": 1452
+    },
+    {
+      "epoch": 0.4356005246393105,
+      "grad_norm": 0.2503633499145508,
+      "learning_rate": 0.00023499999999999997,
+      "loss": 0.449,
+      "step": 1453
+    },
+    {
+      "epoch": 0.4359003185310099,
+      "grad_norm": 0.2610470950603485,
+      "learning_rate": 0.00023495495495495493,
+      "loss": 0.4708,
+      "step": 1454
+    },
+    {
+      "epoch": 0.4362001124227094,
+      "grad_norm": 0.2375205010175705,
+      "learning_rate": 0.0002349099099099099,
+      "loss": 0.4426,
+      "step": 1455
+    },
+    {
+      "epoch": 0.4364999063144088,
+      "grad_norm": 0.2531331777572632,
+      "learning_rate": 0.00023486486486486483,
+      "loss": 0.4516,
+      "step": 1456
+    },
+    {
+      "epoch": 0.4367997002061083,
+      "grad_norm": 0.23722784221172333,
+      "learning_rate": 0.0002348198198198198,
+      "loss": 0.4345,
+      "step": 1457
+    },
+    {
+      "epoch": 0.43709949409780774,
+      "grad_norm": 0.2411564141511917,
+      "learning_rate": 0.00023477477477477476,
+      "loss": 0.4227,
+      "step": 1458
+    },
+    {
+      "epoch": 0.4373992879895072,
+      "grad_norm": 0.22307904064655304,
+      "learning_rate": 0.00023472972972972972,
+      "loss": 0.4254,
+      "step": 1459
+    },
+    {
+      "epoch": 0.43769908188120665,
+      "grad_norm": 0.273569256067276,
+      "learning_rate": 0.00023468468468468466,
+      "loss": 0.4384,
+      "step": 1460
+    },
+    {
+      "epoch": 0.43799887577290614,
+      "grad_norm": 0.24101589620113373,
+      "learning_rate": 0.00023463963963963962,
+      "loss": 0.4694,
+      "step": 1461
+    },
+    {
+      "epoch": 0.43829866966460557,
+      "grad_norm": 0.2409631907939911,
+      "learning_rate": 0.0002345945945945946,
+      "loss": 0.4515,
+      "step": 1462
+    },
+    {
+      "epoch": 0.43859846355630505,
+      "grad_norm": 0.24057404696941376,
+      "learning_rate": 0.00023454954954954952,
+      "loss": 0.4516,
+      "step": 1463
+    },
+    {
+      "epoch": 0.4388982574480045,
+      "grad_norm": 0.24539843201637268,
+      "learning_rate": 0.0002345045045045045,
+      "loss": 0.4078,
+      "step": 1464
+    },
+    {
+      "epoch": 0.43919805133970397,
+      "grad_norm": 0.23763391375541687,
+      "learning_rate": 0.00023445945945945945,
+      "loss": 0.4538,
+      "step": 1465
+    },
+    {
+      "epoch": 0.4394978452314034,
+      "grad_norm": 0.25087833404541016,
+      "learning_rate": 0.0002344144144144144,
+      "loss": 0.4474,
+      "step": 1466
+    },
+    {
+      "epoch": 0.4397976391231029,
+      "grad_norm": 0.24220441281795502,
+      "learning_rate": 0.00023436936936936935,
+      "loss": 0.4527,
+      "step": 1467
+    },
+    {
+      "epoch": 0.4400974330148023,
+      "grad_norm": 0.23056988418102264,
+      "learning_rate": 0.00023432432432432432,
+      "loss": 0.4375,
+      "step": 1468
+    },
+    {
+      "epoch": 0.4403972269065018,
+      "grad_norm": 0.23940956592559814,
+      "learning_rate": 0.00023427927927927925,
+      "loss": 0.4561,
+      "step": 1469
+    },
+    {
+      "epoch": 0.4406970207982012,
+      "grad_norm": 0.23279373347759247,
+      "learning_rate": 0.0002342342342342342,
+      "loss": 0.4342,
+      "step": 1470
+    },
+    {
+      "epoch": 0.4409968146899007,
+      "grad_norm": 0.23729127645492554,
+      "learning_rate": 0.00023418918918918918,
+      "loss": 0.4196,
+      "step": 1471
+    },
+    {
+      "epoch": 0.44129660858160014,
+      "grad_norm": 0.2296978086233139,
+      "learning_rate": 0.00023414414414414412,
+      "loss": 0.4308,
+      "step": 1472
+    },
+    {
+      "epoch": 0.4415964024732996,
+      "grad_norm": 0.24595269560813904,
+      "learning_rate": 0.00023409909909909905,
+      "loss": 0.4607,
+      "step": 1473
+    },
+    {
+      "epoch": 0.44189619636499905,
+      "grad_norm": 0.2266230583190918,
+      "learning_rate": 0.00023405405405405404,
+      "loss": 0.4233,
+      "step": 1474
+    },
+    {
+      "epoch": 0.44219599025669853,
+      "grad_norm": 0.23812752962112427,
+      "learning_rate": 0.00023400900900900898,
+      "loss": 0.4341,
+      "step": 1475
+    },
+    {
+      "epoch": 0.44249578414839796,
+      "grad_norm": 0.2498784214258194,
+      "learning_rate": 0.00023396396396396392,
+      "loss": 0.4406,
+      "step": 1476
+    },
+    {
+      "epoch": 0.44279557804009745,
+      "grad_norm": 0.26205873489379883,
+      "learning_rate": 0.0002339189189189189,
+      "loss": 0.463,
+      "step": 1477
+    },
+    {
+      "epoch": 0.4430953719317969,
+      "grad_norm": 0.22054405510425568,
+      "learning_rate": 0.00023387387387387385,
+      "loss": 0.4142,
+      "step": 1478
+    },
+    {
+      "epoch": 0.44339516582349636,
+      "grad_norm": 0.2611534893512726,
+      "learning_rate": 0.00023382882882882878,
+      "loss": 0.4985,
+      "step": 1479
+    },
+    {
+      "epoch": 0.4436949597151958,
+      "grad_norm": 0.2515600323677063,
+      "learning_rate": 0.00023378378378378377,
+      "loss": 0.4567,
+      "step": 1480
+    },
+    {
+      "epoch": 0.4439947536068953,
+      "grad_norm": 0.23842091858386993,
+      "learning_rate": 0.0002337387387387387,
+      "loss": 0.4301,
+      "step": 1481
+    },
+    {
+      "epoch": 0.4442945474985947,
+      "grad_norm": 0.21718434989452362,
+      "learning_rate": 0.00023369369369369367,
+      "loss": 0.4444,
+      "step": 1482
+    },
+    {
+      "epoch": 0.4445943413902942,
+      "grad_norm": 0.2340681105852127,
+      "learning_rate": 0.00023364864864864864,
+      "loss": 0.4555,
+      "step": 1483
+    },
+    {
+      "epoch": 0.4448941352819936,
+      "grad_norm": 0.24483737349510193,
+      "learning_rate": 0.00023360360360360357,
+      "loss": 0.4419,
+      "step": 1484
+    },
+    {
+      "epoch": 0.4451939291736931,
+      "grad_norm": 0.24512210488319397,
+      "learning_rate": 0.00023355855855855854,
+      "loss": 0.427,
+      "step": 1485
+    },
+    {
+      "epoch": 0.44549372306539253,
+      "grad_norm": 0.21723505854606628,
+      "learning_rate": 0.0002335135135135135,
+      "loss": 0.4075,
+      "step": 1486
+    },
+    {
+      "epoch": 0.445793516957092,
+      "grad_norm": 0.23696058988571167,
+      "learning_rate": 0.00023346846846846844,
+      "loss": 0.4332,
+      "step": 1487
+    },
+    {
+      "epoch": 0.44609331084879145,
+      "grad_norm": 0.23997806012630463,
+      "learning_rate": 0.0002334234234234234,
+      "loss": 0.4438,
+      "step": 1488
+    },
+    {
+      "epoch": 0.44639310474049093,
+      "grad_norm": 0.2866729497909546,
+      "learning_rate": 0.00023337837837837837,
+      "loss": 0.4851,
+      "step": 1489
+    },
+    {
+      "epoch": 0.44669289863219036,
+      "grad_norm": 0.22988809645175934,
+      "learning_rate": 0.0002333333333333333,
+      "loss": 0.4185,
+      "step": 1490
+    },
+    {
+      "epoch": 0.44699269252388985,
+      "grad_norm": 0.24402973055839539,
+      "learning_rate": 0.00023328828828828827,
+      "loss": 0.4277,
+      "step": 1491
+    },
+    {
+      "epoch": 0.4472924864155893,
+      "grad_norm": 0.25479069352149963,
+      "learning_rate": 0.00023324324324324323,
+      "loss": 0.4593,
+      "step": 1492
+    },
+    {
+      "epoch": 0.44759228030728876,
+      "grad_norm": 0.27010777592658997,
+      "learning_rate": 0.0002331981981981982,
+      "loss": 0.4391,
+      "step": 1493
+    },
+    {
+      "epoch": 0.4478920741989882,
+      "grad_norm": 0.2443162202835083,
+      "learning_rate": 0.00023315315315315313,
+      "loss": 0.4485,
+      "step": 1494
+    },
+    {
+      "epoch": 0.4481918680906877,
+      "grad_norm": 0.23816895484924316,
+      "learning_rate": 0.00023310810810810807,
+      "loss": 0.4606,
+      "step": 1495
+    },
+    {
+      "epoch": 0.4484916619823871,
+      "grad_norm": 0.25230711698532104,
+      "learning_rate": 0.00023306306306306306,
+      "loss": 0.4479,
+      "step": 1496
+    },
+    {
+      "epoch": 0.4487914558740866,
+      "grad_norm": 0.23312939703464508,
+      "learning_rate": 0.000233018018018018,
+      "loss": 0.4157,
+      "step": 1497
+    },
+    {
+      "epoch": 0.449091249765786,
+      "grad_norm": 0.2355630099773407,
+      "learning_rate": 0.00023297297297297293,
+      "loss": 0.4428,
+      "step": 1498
+    },
+    {
+      "epoch": 0.4493910436574855,
+      "grad_norm": 0.21646787226200104,
+      "learning_rate": 0.00023292792792792792,
+      "loss": 0.4166,
+      "step": 1499
+    },
+    {
+      "epoch": 0.44969083754918493,
+      "grad_norm": 0.2547577917575836,
+      "learning_rate": 0.00023288288288288286,
+      "loss": 0.4517,
+      "step": 1500
+    },
+    {
+      "epoch": 0.44969083754918493,
+      "eval_loss": 0.4507916569709778,
+      "eval_runtime": 565.1356,
+      "eval_samples_per_second": 3.82,
+      "eval_steps_per_second": 0.478,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4499906314408844,
+      "grad_norm": 0.22076524794101715,
+      "learning_rate": 0.0002328378378378378,
+      "loss": 0.4395,
+      "step": 1501
+    },
+    {
+      "epoch": 0.45029042533258384,
+      "grad_norm": 0.23838326334953308,
+      "learning_rate": 0.0002327927927927928,
+      "loss": 0.456,
+      "step": 1502
+    },
+    {
+      "epoch": 0.45059021922428333,
+      "grad_norm": 0.23766019940376282,
+      "learning_rate": 0.00023274774774774773,
+      "loss": 0.4506,
+      "step": 1503
+    },
+    {
+      "epoch": 0.45089001311598276,
+      "grad_norm": 0.2391175776720047,
+      "learning_rate": 0.00023270270270270266,
+      "loss": 0.4484,
+      "step": 1504
+    },
+    {
+      "epoch": 0.45118980700768224,
+      "grad_norm": 0.24999289214611053,
+      "learning_rate": 0.00023265765765765765,
+      "loss": 0.4524,
+      "step": 1505
+    },
+    {
+      "epoch": 0.45148960089938167,
+      "grad_norm": 0.24920374155044556,
+      "learning_rate": 0.0002326126126126126,
+      "loss": 0.4626,
+      "step": 1506
+    },
+    {
+      "epoch": 0.45178939479108116,
+      "grad_norm": 0.24506935477256775,
+      "learning_rate": 0.00023256756756756753,
+      "loss": 0.438,
+      "step": 1507
+    },
+    {
+      "epoch": 0.4520891886827806,
+      "grad_norm": 0.2410869002342224,
+      "learning_rate": 0.00023252252252252252,
+      "loss": 0.4579,
+      "step": 1508
+    },
+    {
+      "epoch": 0.45238898257448007,
+      "grad_norm": 0.2394392192363739,
+      "learning_rate": 0.00023247747747747745,
+      "loss": 0.4701,
+      "step": 1509
+    },
+    {
+      "epoch": 0.4526887764661795,
+      "grad_norm": 0.24809177219867706,
+      "learning_rate": 0.0002324324324324324,
+      "loss": 0.4521,
+      "step": 1510
+    },
+    {
+      "epoch": 0.452988570357879,
+      "grad_norm": 0.24093541502952576,
+      "learning_rate": 0.00023238738738738738,
+      "loss": 0.4364,
+      "step": 1511
+    },
+    {
+      "epoch": 0.4532883642495784,
+      "grad_norm": 0.24750453233718872,
+      "learning_rate": 0.00023234234234234232,
+      "loss": 0.454,
+      "step": 1512
+    },
+    {
+      "epoch": 0.4535881581412779,
+      "grad_norm": 0.24669384956359863,
+      "learning_rate": 0.00023229729729729726,
+      "loss": 0.4687,
+      "step": 1513
+    },
+    {
+      "epoch": 0.4538879520329773,
+      "grad_norm": 0.258184015750885,
+      "learning_rate": 0.00023225225225225225,
+      "loss": 0.4741,
+      "step": 1514
+    },
+    {
+      "epoch": 0.4541877459246768,
+      "grad_norm": 0.23264341056346893,
+      "learning_rate": 0.00023220720720720718,
+      "loss": 0.4255,
+      "step": 1515
+    },
+    {
+      "epoch": 0.45448753981637624,
+      "grad_norm": 0.24050508439540863,
+      "learning_rate": 0.00023216216216216215,
+      "loss": 0.4465,
+      "step": 1516
+    },
+    {
+      "epoch": 0.4547873337080757,
+      "grad_norm": 0.23079554736614227,
+      "learning_rate": 0.0002321171171171171,
+      "loss": 0.4232,
+      "step": 1517
+    },
+    {
+      "epoch": 0.45508712759977515,
+      "grad_norm": 0.22280898690223694,
+      "learning_rate": 0.00023207207207207205,
+      "loss": 0.4233,
+      "step": 1518
+    },
+    {
+      "epoch": 0.45538692149147464,
+      "grad_norm": 0.24419550597667694,
+      "learning_rate": 0.000232027027027027,
+      "loss": 0.453,
+      "step": 1519
+    },
+    {
+      "epoch": 0.45568671538317407,
+      "grad_norm": 0.2578713595867157,
+      "learning_rate": 0.00023198198198198195,
+      "loss": 0.4701,
+      "step": 1520
+    },
+    {
+      "epoch": 0.4559865092748735,
+      "grad_norm": 0.24617789685726166,
+      "learning_rate": 0.0002319369369369369,
+      "loss": 0.4744,
+      "step": 1521
+    },
+    {
+      "epoch": 0.456286303166573,
+      "grad_norm": 0.2564181387424469,
+      "learning_rate": 0.00023189189189189188,
+      "loss": 0.4672,
+      "step": 1522
+    },
+    {
+      "epoch": 0.4565860970582724,
+      "grad_norm": 0.23741687834262848,
+      "learning_rate": 0.0002318468468468468,
+      "loss": 0.4337,
+      "step": 1523
+    },
+    {
+      "epoch": 0.4568858909499719,
+      "grad_norm": 0.2671225070953369,
+      "learning_rate": 0.00023180180180180178,
+      "loss": 0.4723,
+      "step": 1524
+    },
+    {
+      "epoch": 0.4571856848416713,
+      "grad_norm": 0.2585636377334595,
+      "learning_rate": 0.00023175675675675674,
+      "loss": 0.4585,
+      "step": 1525
+    },
+    {
+      "epoch": 0.4574854787333708,
+      "grad_norm": 0.25808751583099365,
+      "learning_rate": 0.00023171171171171168,
+      "loss": 0.4667,
+      "step": 1526
+    },
+    {
+      "epoch": 0.45778527262507024,
+      "grad_norm": 0.25702106952667236,
+      "learning_rate": 0.00023166666666666667,
+      "loss": 0.4218,
+      "step": 1527
+    },
+    {
+      "epoch": 0.4580850665167697,
+      "grad_norm": 0.2685486972332001,
+      "learning_rate": 0.0002316216216216216,
+      "loss": 0.4765,
+      "step": 1528
+    },
+    {
+      "epoch": 0.45838486040846915,
+      "grad_norm": 0.25075605511665344,
+      "learning_rate": 0.00023157657657657654,
+      "loss": 0.4795,
+      "step": 1529
+    },
+    {
+      "epoch": 0.45868465430016864,
+      "grad_norm": 0.25849252939224243,
+      "learning_rate": 0.00023153153153153153,
+      "loss": 0.4362,
+      "step": 1530
+    },
+    {
+      "epoch": 0.45898444819186807,
+      "grad_norm": 0.25761592388153076,
+      "learning_rate": 0.00023148648648648647,
+      "loss": 0.4791,
+      "step": 1531
+    },
+    {
+      "epoch": 0.45928424208356755,
+      "grad_norm": 0.228532075881958,
+      "learning_rate": 0.0002314414414414414,
+      "loss": 0.4258,
+      "step": 1532
+    },
+    {
+      "epoch": 0.459584035975267,
+      "grad_norm": 0.24463020265102386,
+      "learning_rate": 0.0002313963963963964,
+      "loss": 0.4199,
+      "step": 1533
+    },
+    {
+      "epoch": 0.45988382986696646,
+      "grad_norm": 0.26668593287467957,
+      "learning_rate": 0.00023135135135135133,
+      "loss": 0.4686,
+      "step": 1534
+    },
+    {
+      "epoch": 0.4601836237586659,
+      "grad_norm": 0.24953673779964447,
+      "learning_rate": 0.00023130630630630627,
+      "loss": 0.4433,
+      "step": 1535
+    },
+    {
+      "epoch": 0.4604834176503654,
+      "grad_norm": 0.2565534710884094,
+      "learning_rate": 0.00023126126126126126,
+      "loss": 0.4638,
+      "step": 1536
+    },
+    {
+      "epoch": 0.4607832115420648,
+      "grad_norm": 0.241172194480896,
+      "learning_rate": 0.0002312162162162162,
+      "loss": 0.4065,
+      "step": 1537
+    },
+    {
+      "epoch": 0.4610830054337643,
+      "grad_norm": 0.2695203125476837,
+      "learning_rate": 0.00023117117117117114,
+      "loss": 0.4782,
+      "step": 1538
+    },
+    {
+      "epoch": 0.4613827993254637,
+      "grad_norm": 0.25559231638908386,
+      "learning_rate": 0.00023112612612612613,
+      "loss": 0.425,
+      "step": 1539
+    },
+    {
+      "epoch": 0.4616825932171632,
+      "grad_norm": 0.2544387876987457,
+      "learning_rate": 0.00023108108108108106,
+      "loss": 0.451,
+      "step": 1540
+    },
+    {
+      "epoch": 0.46198238710886264,
+      "grad_norm": 0.27124300599098206,
+      "learning_rate": 0.000231036036036036,
+      "loss": 0.4492,
+      "step": 1541
+    },
+    {
+      "epoch": 0.4622821810005621,
+      "grad_norm": 0.2581422030925751,
+      "learning_rate": 0.000230990990990991,
+      "loss": 0.4317,
+      "step": 1542
+    },
+    {
+      "epoch": 0.46258197489226155,
+      "grad_norm": 0.2657614052295685,
+      "learning_rate": 0.00023094594594594593,
+      "loss": 0.4308,
+      "step": 1543
+    },
+    {
+      "epoch": 0.46288176878396103,
+      "grad_norm": 0.25568437576293945,
+      "learning_rate": 0.00023090090090090087,
+      "loss": 0.4808,
+      "step": 1544
+    },
+    {
+      "epoch": 0.46318156267566046,
+      "grad_norm": 0.26639649271965027,
+      "learning_rate": 0.00023085585585585583,
+      "loss": 0.4808,
+      "step": 1545
+    },
+    {
+      "epoch": 0.46348135656735995,
+      "grad_norm": 0.23767255246639252,
+      "learning_rate": 0.0002308108108108108,
+      "loss": 0.4382,
+      "step": 1546
+    },
+    {
+      "epoch": 0.4637811504590594,
+      "grad_norm": 0.22875267267227173,
+      "learning_rate": 0.00023076576576576573,
+      "loss": 0.4132,
+      "step": 1547
+    },
+    {
+      "epoch": 0.46408094435075886,
+      "grad_norm": 0.27917149662971497,
+      "learning_rate": 0.0002307207207207207,
+      "loss": 0.4899,
+      "step": 1548
+    },
+    {
+      "epoch": 0.4643807382424583,
+      "grad_norm": 0.24044126272201538,
+      "learning_rate": 0.00023067567567567566,
+      "loss": 0.4481,
+      "step": 1549
+    },
+    {
+      "epoch": 0.4646805321341578,
+      "grad_norm": 0.2577783465385437,
+      "learning_rate": 0.00023063063063063062,
+      "loss": 0.4403,
+      "step": 1550
+    },
+    {
+      "epoch": 0.4649803260258572,
+      "grad_norm": 0.2589547038078308,
+      "learning_rate": 0.00023058558558558556,
+      "loss": 0.438,
+      "step": 1551
+    },
+    {
+      "epoch": 0.4652801199175567,
+      "grad_norm": 0.22958579659461975,
+      "learning_rate": 0.00023054054054054052,
+      "loss": 0.4374,
+      "step": 1552
+    },
+    {
+      "epoch": 0.4655799138092561,
+      "grad_norm": 0.28952687978744507,
+      "learning_rate": 0.00023049549549549549,
+      "loss": 0.4463,
+      "step": 1553
+    },
+    {
+      "epoch": 0.4658797077009556,
+      "grad_norm": 0.2680447995662689,
+      "learning_rate": 0.00023045045045045042,
+      "loss": 0.4758,
+      "step": 1554
+    },
+    {
+      "epoch": 0.46617950159265503,
+      "grad_norm": 0.23771092295646667,
+      "learning_rate": 0.00023040540540540539,
+      "loss": 0.4322,
+      "step": 1555
+    },
+    {
+      "epoch": 0.4664792954843545,
+      "grad_norm": 0.2691210210323334,
+      "learning_rate": 0.00023036036036036035,
+      "loss": 0.4739,
+      "step": 1556
+    },
+    {
+      "epoch": 0.46677908937605395,
+      "grad_norm": 0.24547810852527618,
+      "learning_rate": 0.0002303153153153153,
+      "loss": 0.4507,
+      "step": 1557
+    },
+    {
+      "epoch": 0.46707888326775343,
+      "grad_norm": 0.2669890224933624,
+      "learning_rate": 0.00023027027027027025,
+      "loss": 0.4809,
+      "step": 1558
+    },
+    {
+      "epoch": 0.46737867715945286,
+      "grad_norm": 0.25527825951576233,
+      "learning_rate": 0.00023022522522522521,
+      "loss": 0.4558,
+      "step": 1559
+    },
+    {
+      "epoch": 0.46767847105115234,
+      "grad_norm": 0.23491926491260529,
+      "learning_rate": 0.00023018018018018015,
+      "loss": 0.4594,
+      "step": 1560
+    },
+    {
+      "epoch": 0.4679782649428518,
+      "grad_norm": 0.27634891867637634,
+      "learning_rate": 0.00023013513513513514,
+      "loss": 0.4503,
+      "step": 1561
+    },
+    {
+      "epoch": 0.46827805883455126,
+      "grad_norm": 0.2656886577606201,
+      "learning_rate": 0.00023009009009009008,
+      "loss": 0.4674,
+      "step": 1562
+    },
+    {
+      "epoch": 0.4685778527262507,
+      "grad_norm": 0.23933476209640503,
+      "learning_rate": 0.00023004504504504502,
+      "loss": 0.4124,
+      "step": 1563
+    },
+    {
+      "epoch": 0.4688776466179502,
+      "grad_norm": 0.2596864700317383,
+      "learning_rate": 0.00023,
+      "loss": 0.4288,
+      "step": 1564
+    },
+    {
+      "epoch": 0.4691774405096496,
+      "grad_norm": 0.25186148285865784,
+      "learning_rate": 0.00022995495495495494,
+      "loss": 0.4571,
+      "step": 1565
+    },
+    {
+      "epoch": 0.4694772344013491,
+      "grad_norm": 0.28007790446281433,
+      "learning_rate": 0.00022990990990990988,
+      "loss": 0.4785,
+      "step": 1566
+    },
+    {
+      "epoch": 0.4697770282930485,
+      "grad_norm": 0.26225724816322327,
+      "learning_rate": 0.00022986486486486482,
+      "loss": 0.4378,
+      "step": 1567
+    },
+    {
+      "epoch": 0.470076822184748,
+      "grad_norm": 0.24554051458835602,
+      "learning_rate": 0.0002298198198198198,
+      "loss": 0.4655,
+      "step": 1568
+    },
+    {
+      "epoch": 0.47037661607644743,
+      "grad_norm": 0.24976593255996704,
+      "learning_rate": 0.00022977477477477475,
+      "loss": 0.4356,
+      "step": 1569
+    },
+    {
+      "epoch": 0.4706764099681469,
+      "grad_norm": 0.23914846777915955,
+      "learning_rate": 0.00022972972972972968,
+      "loss": 0.4239,
+      "step": 1570
+    },
+    {
+      "epoch": 0.47097620385984634,
+      "grad_norm": 0.24698884785175323,
+      "learning_rate": 0.00022968468468468467,
+      "loss": 0.4607,
+      "step": 1571
+    },
+    {
+      "epoch": 0.4712759977515458,
+      "grad_norm": 0.24240310490131378,
+      "learning_rate": 0.0002296396396396396,
+      "loss": 0.4218,
+      "step": 1572
+    },
+    {
+      "epoch": 0.47157579164324526,
+      "grad_norm": 0.24838680028915405,
+      "learning_rate": 0.00022959459459459457,
+      "loss": 0.4387,
+      "step": 1573
+    },
+    {
+      "epoch": 0.47187558553494474,
+      "grad_norm": 0.25536447763442993,
+      "learning_rate": 0.00022954954954954954,
+      "loss": 0.4529,
+      "step": 1574
+    },
+    {
+      "epoch": 0.47217537942664417,
+      "grad_norm": 0.24535490572452545,
+      "learning_rate": 0.00022950450450450447,
+      "loss": 0.4505,
+      "step": 1575
+    },
+    {
+      "epoch": 0.47247517331834366,
+      "grad_norm": 0.258878618478775,
+      "learning_rate": 0.00022945945945945944,
+      "loss": 0.4794,
+      "step": 1576
+    },
+    {
+      "epoch": 0.4727749672100431,
+      "grad_norm": 0.23862193524837494,
+      "learning_rate": 0.0002294144144144144,
+      "loss": 0.4555,
+      "step": 1577
+    },
+    {
+      "epoch": 0.47307476110174257,
+      "grad_norm": 0.2369290292263031,
+      "learning_rate": 0.00022936936936936934,
+      "loss": 0.4111,
+      "step": 1578
+    },
+    {
+      "epoch": 0.473374554993442,
+      "grad_norm": 0.2591108977794647,
+      "learning_rate": 0.0002293243243243243,
+      "loss": 0.4738,
+      "step": 1579
+    },
+    {
+      "epoch": 0.4736743488851415,
+      "grad_norm": 0.2639445662498474,
+      "learning_rate": 0.00022927927927927927,
+      "loss": 0.489,
+      "step": 1580
+    },
+    {
+      "epoch": 0.4739741427768409,
+      "grad_norm": 0.2452382892370224,
+      "learning_rate": 0.0002292342342342342,
+      "loss": 0.4499,
+      "step": 1581
+    },
+    {
+      "epoch": 0.4742739366685404,
+      "grad_norm": 0.24414241313934326,
+      "learning_rate": 0.00022918918918918917,
+      "loss": 0.4246,
+      "step": 1582
+    },
+    {
+      "epoch": 0.4745737305602398,
+      "grad_norm": 0.24609197676181793,
+      "learning_rate": 0.00022914414414414413,
+      "loss": 0.4615,
+      "step": 1583
+    },
+    {
+      "epoch": 0.4748735244519393,
+      "grad_norm": 0.2610466480255127,
+      "learning_rate": 0.0002290990990990991,
+      "loss": 0.4597,
+      "step": 1584
+    },
+    {
+      "epoch": 0.47517331834363874,
+      "grad_norm": 0.24946355819702148,
+      "learning_rate": 0.00022905405405405403,
+      "loss": 0.4382,
+      "step": 1585
+    },
+    {
+      "epoch": 0.4754731122353382,
+      "grad_norm": 0.24156548082828522,
+      "learning_rate": 0.000229009009009009,
+      "loss": 0.4421,
+      "step": 1586
+    },
+    {
+      "epoch": 0.47577290612703765,
+      "grad_norm": 0.2650264799594879,
+      "learning_rate": 0.00022896396396396396,
+      "loss": 0.4735,
+      "step": 1587
+    },
+    {
+      "epoch": 0.47607270001873714,
+      "grad_norm": 0.2677678167819977,
+      "learning_rate": 0.0002289189189189189,
+      "loss": 0.4581,
+      "step": 1588
+    },
+    {
+      "epoch": 0.47637249391043657,
+      "grad_norm": 0.2421169877052307,
+      "learning_rate": 0.00022887387387387386,
+      "loss": 0.4342,
+      "step": 1589
+    },
+    {
+      "epoch": 0.47667228780213605,
+      "grad_norm": 0.2284284085035324,
+      "learning_rate": 0.00022882882882882882,
+      "loss": 0.434,
+      "step": 1590
+    },
+    {
+      "epoch": 0.4769720816938355,
+      "grad_norm": 0.235052689909935,
+      "learning_rate": 0.00022878378378378376,
+      "loss": 0.4439,
+      "step": 1591
+    },
+    {
+      "epoch": 0.47727187558553497,
+      "grad_norm": 0.24947918951511383,
+      "learning_rate": 0.0002287387387387387,
+      "loss": 0.4704,
+      "step": 1592
+    },
+    {
+      "epoch": 0.4775716694772344,
+      "grad_norm": 0.24523784220218658,
+      "learning_rate": 0.0002286936936936937,
+      "loss": 0.4688,
+      "step": 1593
+    },
+    {
+      "epoch": 0.4778714633689339,
+      "grad_norm": 0.2427687793970108,
+      "learning_rate": 0.00022864864864864862,
+      "loss": 0.4145,
+      "step": 1594
+    },
+    {
+      "epoch": 0.4781712572606333,
+      "grad_norm": 0.2589262127876282,
+      "learning_rate": 0.00022860360360360356,
+      "loss": 0.4602,
+      "step": 1595
+    },
+    {
+      "epoch": 0.4784710511523328,
+      "grad_norm": 0.22775280475616455,
+      "learning_rate": 0.00022855855855855855,
+      "loss": 0.4118,
+      "step": 1596
+    },
+    {
+      "epoch": 0.4787708450440322,
+      "grad_norm": 0.26483890414237976,
+      "learning_rate": 0.0002285135135135135,
+      "loss": 0.4342,
+      "step": 1597
+    },
+    {
+      "epoch": 0.4790706389357317,
+      "grad_norm": 0.2529483735561371,
+      "learning_rate": 0.00022846846846846843,
+      "loss": 0.4474,
+      "step": 1598
+    },
+    {
+      "epoch": 0.47937043282743114,
+      "grad_norm": 0.24874089658260345,
+      "learning_rate": 0.00022842342342342342,
+      "loss": 0.4383,
+      "step": 1599
+    },
+    {
+      "epoch": 0.4796702267191306,
+      "grad_norm": 0.2583334445953369,
+      "learning_rate": 0.00022837837837837835,
+      "loss": 0.4406,
+      "step": 1600
+    },
+    {
+      "epoch": 0.47997002061083005,
+      "grad_norm": 0.2603740990161896,
+      "learning_rate": 0.0002283333333333333,
+      "loss": 0.4418,
+      "step": 1601
+    },
+    {
+      "epoch": 0.48026981450252954,
+      "grad_norm": 0.24915438890457153,
+      "learning_rate": 0.00022828828828828828,
+      "loss": 0.458,
+      "step": 1602
+    },
+    {
+      "epoch": 0.48056960839422896,
+      "grad_norm": 0.2531617283821106,
+      "learning_rate": 0.00022824324324324322,
+      "loss": 0.4639,
+      "step": 1603
+    },
+    {
+      "epoch": 0.48086940228592845,
+      "grad_norm": 0.23148907721042633,
+      "learning_rate": 0.00022819819819819816,
+      "loss": 0.4345,
+      "step": 1604
+    },
+    {
+      "epoch": 0.4811691961776279,
+      "grad_norm": 0.26466086506843567,
+      "learning_rate": 0.00022815315315315315,
+      "loss": 0.4625,
+      "step": 1605
+    },
+    {
+      "epoch": 0.48146899006932736,
+      "grad_norm": 0.2558290660381317,
+      "learning_rate": 0.00022810810810810808,
+      "loss": 0.4438,
+      "step": 1606
+    },
+    {
+      "epoch": 0.4817687839610268,
+      "grad_norm": 0.2294214814901352,
+      "learning_rate": 0.00022806306306306305,
+      "loss": 0.4343,
+      "step": 1607
+    },
+    {
+      "epoch": 0.4820685778527263,
+      "grad_norm": 0.25497883558273315,
+      "learning_rate": 0.000228018018018018,
+      "loss": 0.4498,
+      "step": 1608
+    },
+    {
+      "epoch": 0.4823683717444257,
+      "grad_norm": 0.25641798973083496,
+      "learning_rate": 0.00022797297297297295,
+      "loss": 0.4077,
+      "step": 1609
+    },
+    {
+      "epoch": 0.4826681656361252,
+      "grad_norm": 0.2556770145893097,
+      "learning_rate": 0.0002279279279279279,
+      "loss": 0.4125,
+      "step": 1610
+    },
+    {
+      "epoch": 0.4829679595278246,
+      "grad_norm": 0.2528950273990631,
+      "learning_rate": 0.00022788288288288288,
+      "loss": 0.4759,
+      "step": 1611
+    },
+    {
+      "epoch": 0.4832677534195241,
+      "grad_norm": 0.22939835488796234,
+      "learning_rate": 0.0002278378378378378,
+      "loss": 0.4422,
+      "step": 1612
+    },
+    {
+      "epoch": 0.48356754731122353,
+      "grad_norm": 0.24314181506633759,
+      "learning_rate": 0.00022779279279279278,
+      "loss": 0.4358,
+      "step": 1613
+    },
+    {
+      "epoch": 0.483867341202923,
+      "grad_norm": 0.2460578978061676,
+      "learning_rate": 0.00022774774774774774,
+      "loss": 0.4571,
+      "step": 1614
+    },
+    {
+      "epoch": 0.48416713509462245,
+      "grad_norm": 0.2627396583557129,
+      "learning_rate": 0.00022770270270270268,
+      "loss": 0.4616,
+      "step": 1615
+    },
+    {
+      "epoch": 0.4844669289863219,
+      "grad_norm": 0.23525434732437134,
+      "learning_rate": 0.00022765765765765764,
+      "loss": 0.4243,
+      "step": 1616
+    },
+    {
+      "epoch": 0.48476672287802136,
+      "grad_norm": 0.2397356927394867,
+      "learning_rate": 0.00022761261261261258,
+      "loss": 0.4382,
+      "step": 1617
+    },
+    {
+      "epoch": 0.4850665167697208,
+      "grad_norm": 0.2398831993341446,
+      "learning_rate": 0.00022756756756756757,
+      "loss": 0.4412,
+      "step": 1618
+    },
+    {
+      "epoch": 0.4853663106614203,
+      "grad_norm": 0.259376585483551,
+      "learning_rate": 0.0002275225225225225,
+      "loss": 0.4259,
+      "step": 1619
+    },
+    {
+      "epoch": 0.4856661045531197,
+      "grad_norm": 0.23204876482486725,
+      "learning_rate": 0.00022747747747747744,
+      "loss": 0.4068,
+      "step": 1620
+    },
+    {
+      "epoch": 0.4859658984448192,
+      "grad_norm": 0.2531450688838959,
+      "learning_rate": 0.00022743243243243243,
+      "loss": 0.4383,
+      "step": 1621
+    },
+    {
+      "epoch": 0.4862656923365186,
+      "grad_norm": 0.24866148829460144,
+      "learning_rate": 0.00022738738738738737,
+      "loss": 0.4856,
+      "step": 1622
+    },
+    {
+      "epoch": 0.4865654862282181,
+      "grad_norm": 0.23114702105522156,
+      "learning_rate": 0.0002273423423423423,
+      "loss": 0.4167,
+      "step": 1623
+    },
+    {
+      "epoch": 0.48686528011991753,
+      "grad_norm": 0.24857169389724731,
+      "learning_rate": 0.0002272972972972973,
+      "loss": 0.4678,
+      "step": 1624
+    },
+    {
+      "epoch": 0.487165074011617,
+      "grad_norm": 0.24516363441944122,
+      "learning_rate": 0.00022725225225225223,
+      "loss": 0.4236,
+      "step": 1625
+    },
+    {
+      "epoch": 0.48746486790331645,
+      "grad_norm": 0.2485557198524475,
+      "learning_rate": 0.00022720720720720717,
+      "loss": 0.4562,
+      "step": 1626
+    },
+    {
+      "epoch": 0.48776466179501593,
+      "grad_norm": 0.2518232762813568,
+      "learning_rate": 0.00022716216216216216,
+      "loss": 0.4703,
+      "step": 1627
+    },
+    {
+      "epoch": 0.48806445568671536,
+      "grad_norm": 0.23469193279743195,
+      "learning_rate": 0.0002271171171171171,
+      "loss": 0.4301,
+      "step": 1628
+    },
+    {
+      "epoch": 0.48836424957841484,
+      "grad_norm": 0.25275421142578125,
+      "learning_rate": 0.00022707207207207204,
+      "loss": 0.4576,
+      "step": 1629
+    },
+    {
+      "epoch": 0.4886640434701143,
+      "grad_norm": 0.25434768199920654,
+      "learning_rate": 0.00022702702702702703,
+      "loss": 0.4654,
+      "step": 1630
+    },
+    {
+      "epoch": 0.48896383736181376,
+      "grad_norm": 0.24483707547187805,
+      "learning_rate": 0.00022698198198198196,
+      "loss": 0.45,
+      "step": 1631
+    },
+    {
+      "epoch": 0.4892636312535132,
+      "grad_norm": 0.24300119280815125,
+      "learning_rate": 0.0002269369369369369,
+      "loss": 0.4795,
+      "step": 1632
+    },
+    {
+      "epoch": 0.48956342514521267,
+      "grad_norm": 0.23230652511119843,
+      "learning_rate": 0.0002268918918918919,
+      "loss": 0.4318,
+      "step": 1633
+    },
+    {
+      "epoch": 0.4898632190369121,
+      "grad_norm": 0.2618871331214905,
+      "learning_rate": 0.00022684684684684683,
+      "loss": 0.4846,
+      "step": 1634
+    },
+    {
+      "epoch": 0.4901630129286116,
+      "grad_norm": 0.2477121651172638,
+      "learning_rate": 0.00022680180180180176,
+      "loss": 0.4392,
+      "step": 1635
+    },
+    {
+      "epoch": 0.490462806820311,
+      "grad_norm": 0.2603763937950134,
+      "learning_rate": 0.00022675675675675676,
+      "loss": 0.4875,
+      "step": 1636
+    },
+    {
+      "epoch": 0.4907626007120105,
+      "grad_norm": 0.2470758855342865,
+      "learning_rate": 0.0002267117117117117,
+      "loss": 0.4575,
+      "step": 1637
+    },
+    {
+      "epoch": 0.49106239460370993,
+      "grad_norm": 0.2612755000591278,
+      "learning_rate": 0.00022666666666666663,
+      "loss": 0.4652,
+      "step": 1638
+    },
+    {
+      "epoch": 0.4913621884954094,
+      "grad_norm": 0.2424866259098053,
+      "learning_rate": 0.0002266216216216216,
+      "loss": 0.4346,
+      "step": 1639
+    },
+    {
+      "epoch": 0.49166198238710884,
+      "grad_norm": 0.2529725730419159,
+      "learning_rate": 0.00022657657657657656,
+      "loss": 0.4427,
+      "step": 1640
+    },
+    {
+      "epoch": 0.4919617762788083,
+      "grad_norm": 0.2564398944377899,
+      "learning_rate": 0.00022653153153153152,
+      "loss": 0.4641,
+      "step": 1641
+    },
+    {
+      "epoch": 0.49226157017050776,
+      "grad_norm": 0.2599097490310669,
+      "learning_rate": 0.00022648648648648646,
+      "loss": 0.4564,
+      "step": 1642
+    },
+    {
+      "epoch": 0.49256136406220724,
+      "grad_norm": 0.2355022430419922,
+      "learning_rate": 0.00022644144144144142,
+      "loss": 0.4313,
+      "step": 1643
+    },
+    {
+      "epoch": 0.49286115795390667,
+      "grad_norm": 0.24770522117614746,
+      "learning_rate": 0.00022639639639639638,
+      "loss": 0.4468,
+      "step": 1644
+    },
+    {
+      "epoch": 0.49316095184560615,
+      "grad_norm": 0.2558223009109497,
+      "learning_rate": 0.00022635135135135132,
+      "loss": 0.4506,
+      "step": 1645
+    },
+    {
+      "epoch": 0.4934607457373056,
+      "grad_norm": 0.24741050601005554,
+      "learning_rate": 0.00022630630630630629,
+      "loss": 0.4375,
+      "step": 1646
+    },
+    {
+      "epoch": 0.49376053962900507,
+      "grad_norm": 0.24249225854873657,
+      "learning_rate": 0.00022626126126126125,
+      "loss": 0.4283,
+      "step": 1647
+    },
+    {
+      "epoch": 0.4940603335207045,
+      "grad_norm": 0.2792401611804962,
+      "learning_rate": 0.00022621621621621619,
+      "loss": 0.4757,
+      "step": 1648
+    },
+    {
+      "epoch": 0.494360127412404,
+      "grad_norm": 0.23249030113220215,
+      "learning_rate": 0.00022617117117117115,
+      "loss": 0.4417,
+      "step": 1649
+    },
+    {
+      "epoch": 0.4946599213041034,
+      "grad_norm": 0.2646411955356598,
+      "learning_rate": 0.00022612612612612611,
+      "loss": 0.4338,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4949597151958029,
+      "grad_norm": 0.2633950710296631,
+      "learning_rate": 0.00022608108108108105,
+      "loss": 0.476,
+      "step": 1651
+    },
+    {
+      "epoch": 0.4952595090875023,
+      "grad_norm": 0.24183906614780426,
+      "learning_rate": 0.00022603603603603601,
+      "loss": 0.4363,
+      "step": 1652
+    },
+    {
+      "epoch": 0.4955593029792018,
+      "grad_norm": 0.26413261890411377,
+      "learning_rate": 0.00022599099099099098,
+      "loss": 0.4387,
+      "step": 1653
+    },
+    {
+      "epoch": 0.49585909687090124,
+      "grad_norm": 0.2538875937461853,
+      "learning_rate": 0.00022594594594594592,
+      "loss": 0.4284,
+      "step": 1654
+    },
+    {
+      "epoch": 0.4961588907626007,
+      "grad_norm": 0.2566417157649994,
+      "learning_rate": 0.0002259009009009009,
+      "loss": 0.4352,
+      "step": 1655
+    },
+    {
+      "epoch": 0.49645868465430015,
+      "grad_norm": 0.2526787519454956,
+      "learning_rate": 0.00022585585585585584,
+      "loss": 0.4567,
+      "step": 1656
+    },
+    {
+      "epoch": 0.49675847854599964,
+      "grad_norm": 0.2506735324859619,
+      "learning_rate": 0.00022581081081081078,
+      "loss": 0.4346,
+      "step": 1657
+    },
+    {
+      "epoch": 0.49705827243769907,
+      "grad_norm": 0.25367823243141174,
+      "learning_rate": 0.00022576576576576577,
+      "loss": 0.4392,
+      "step": 1658
+    },
+    {
+      "epoch": 0.49735806632939855,
+      "grad_norm": 0.24888494610786438,
+      "learning_rate": 0.0002257207207207207,
+      "loss": 0.4532,
+      "step": 1659
+    },
+    {
+      "epoch": 0.497657860221098,
+      "grad_norm": 0.22846034169197083,
+      "learning_rate": 0.00022567567567567564,
+      "loss": 0.4163,
+      "step": 1660
+    },
+    {
+      "epoch": 0.49795765411279747,
+      "grad_norm": 0.23993121087551117,
+      "learning_rate": 0.00022563063063063064,
+      "loss": 0.4385,
+      "step": 1661
+    },
+    {
+      "epoch": 0.4982574480044969,
+      "grad_norm": 0.2544318437576294,
+      "learning_rate": 0.00022558558558558557,
+      "loss": 0.4058,
+      "step": 1662
+    },
+    {
+      "epoch": 0.4985572418961964,
+      "grad_norm": 0.2419573813676834,
+      "learning_rate": 0.0002255405405405405,
+      "loss": 0.4428,
+      "step": 1663
+    },
+    {
+      "epoch": 0.4988570357878958,
+      "grad_norm": 0.23838767409324646,
+      "learning_rate": 0.00022549549549549547,
+      "loss": 0.4293,
+      "step": 1664
+    },
+    {
+      "epoch": 0.4991568296795953,
+      "grad_norm": 0.23000121116638184,
+      "learning_rate": 0.00022545045045045044,
+      "loss": 0.436,
+      "step": 1665
+    },
+    {
+      "epoch": 0.4994566235712947,
+      "grad_norm": 0.2665446102619171,
+      "learning_rate": 0.00022540540540540537,
+      "loss": 0.4841,
+      "step": 1666
+    },
+    {
+      "epoch": 0.4997564174629942,
+      "grad_norm": 0.25025802850723267,
+      "learning_rate": 0.00022536036036036034,
+      "loss": 0.4357,
+      "step": 1667
+    },
+    {
+      "epoch": 0.5000562113546937,
+      "grad_norm": 0.2441510409116745,
+      "learning_rate": 0.0002253153153153153,
+      "loss": 0.4345,
+      "step": 1668
+    },
+    {
+      "epoch": 0.5003560052463931,
+      "grad_norm": 0.24143864214420319,
+      "learning_rate": 0.00022527027027027024,
+      "loss": 0.4416,
+      "step": 1669
+    },
+    {
+      "epoch": 0.5006557991380925,
+      "grad_norm": 0.246190145611763,
+      "learning_rate": 0.0002252252252252252,
+      "loss": 0.4124,
+      "step": 1670
+    },
+    {
+      "epoch": 0.500955593029792,
+      "grad_norm": 0.2695963382720947,
+      "learning_rate": 0.00022518018018018017,
+      "loss": 0.4683,
+      "step": 1671
+    },
+    {
+      "epoch": 0.5012553869214915,
+      "grad_norm": 0.23124708235263824,
+      "learning_rate": 0.0002251351351351351,
+      "loss": 0.4347,
+      "step": 1672
+    },
+    {
+      "epoch": 0.501555180813191,
+      "grad_norm": 0.25153648853302,
+      "learning_rate": 0.00022509009009009007,
+      "loss": 0.4408,
+      "step": 1673
+    },
+    {
+      "epoch": 0.5018549747048904,
+      "grad_norm": 0.2653743028640747,
+      "learning_rate": 0.00022504504504504503,
+      "loss": 0.4482,
+      "step": 1674
+    },
+    {
+      "epoch": 0.5021547685965898,
+      "grad_norm": 0.24365836381912231,
+      "learning_rate": 0.000225,
+      "loss": 0.4225,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5024545624882893,
+      "grad_norm": 0.2595864236354828,
+      "learning_rate": 0.00022495495495495493,
+      "loss": 0.4624,
+      "step": 1676
+    },
+    {
+      "epoch": 0.5027543563799888,
+      "grad_norm": 0.24555698037147522,
+      "learning_rate": 0.0002249099099099099,
+      "loss": 0.4495,
+      "step": 1677
+    },
+    {
+      "epoch": 0.5030541502716882,
+      "grad_norm": 0.25951844453811646,
+      "learning_rate": 0.00022486486486486486,
+      "loss": 0.4503,
+      "step": 1678
+    },
+    {
+      "epoch": 0.5033539441633876,
+      "grad_norm": 0.28872594237327576,
+      "learning_rate": 0.0002248198198198198,
+      "loss": 0.4916,
+      "step": 1679
+    },
+    {
+      "epoch": 0.5036537380550872,
+      "grad_norm": 0.24827940762043,
+      "learning_rate": 0.00022477477477477476,
+      "loss": 0.4024,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5039535319467866,
+      "grad_norm": 0.2625780999660492,
+      "learning_rate": 0.00022472972972972972,
+      "loss": 0.4624,
+      "step": 1681
+    },
+    {
+      "epoch": 0.504253325838486,
+      "grad_norm": 0.26651719212532043,
+      "learning_rate": 0.00022468468468468466,
+      "loss": 0.4526,
+      "step": 1682
+    },
+    {
+      "epoch": 0.5045531197301855,
+      "grad_norm": 0.2538798153400421,
+      "learning_rate": 0.00022463963963963962,
+      "loss": 0.4328,
+      "step": 1683
+    },
+    {
+      "epoch": 0.504852913621885,
+      "grad_norm": 0.25730088353157043,
+      "learning_rate": 0.0002245945945945946,
+      "loss": 0.4642,
+      "step": 1684
+    },
+    {
+      "epoch": 0.5051527075135844,
+      "grad_norm": 0.24593298137187958,
+      "learning_rate": 0.00022454954954954952,
+      "loss": 0.4422,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5054525014052839,
+      "grad_norm": 0.25883376598358154,
+      "learning_rate": 0.0002245045045045045,
+      "loss": 0.4439,
+      "step": 1686
+    },
+    {
+      "epoch": 0.5057522952969833,
+      "grad_norm": 0.2680940330028534,
+      "learning_rate": 0.00022445945945945945,
+      "loss": 0.4459,
+      "step": 1687
+    },
+    {
+      "epoch": 0.5060520891886828,
+      "grad_norm": 0.23688143491744995,
+      "learning_rate": 0.0002244144144144144,
+      "loss": 0.4404,
+      "step": 1688
+    },
+    {
+      "epoch": 0.5063518830803823,
+      "grad_norm": 0.24362222850322723,
+      "learning_rate": 0.00022436936936936933,
+      "loss": 0.473,
+      "step": 1689
+    },
+    {
+      "epoch": 0.5066516769720817,
+      "grad_norm": 0.24516165256500244,
+      "learning_rate": 0.00022432432432432432,
+      "loss": 0.4288,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5069514708637811,
+      "grad_norm": 0.23739798367023468,
+      "learning_rate": 0.00022427927927927925,
+      "loss": 0.4231,
+      "step": 1691
+    },
+    {
+      "epoch": 0.5072512647554807,
+      "grad_norm": 0.25751793384552,
+      "learning_rate": 0.0002242342342342342,
+      "loss": 0.4406,
+      "step": 1692
+    },
+    {
+      "epoch": 0.5075510586471801,
+      "grad_norm": 0.23062650859355927,
+      "learning_rate": 0.00022418918918918918,
+      "loss": 0.4273,
+      "step": 1693
+    },
+    {
+      "epoch": 0.5078508525388795,
+      "grad_norm": 0.2453629970550537,
+      "learning_rate": 0.00022414414414414412,
+      "loss": 0.4526,
+      "step": 1694
+    },
+    {
+      "epoch": 0.508150646430579,
+      "grad_norm": 0.26255685091018677,
+      "learning_rate": 0.00022409909909909905,
+      "loss": 0.461,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5084504403222785,
+      "grad_norm": 0.2384280562400818,
+      "learning_rate": 0.00022405405405405405,
+      "loss": 0.4333,
+      "step": 1696
+    },
+    {
+      "epoch": 0.5087502342139779,
+      "grad_norm": 0.24055825173854828,
+      "learning_rate": 0.00022400900900900898,
+      "loss": 0.4472,
+      "step": 1697
+    },
+    {
+      "epoch": 0.5090500281056773,
+      "grad_norm": 0.24356570839881897,
+      "learning_rate": 0.00022396396396396395,
+      "loss": 0.4159,
+      "step": 1698
+    },
+    {
+      "epoch": 0.5093498219973768,
+      "grad_norm": 0.3042013645172119,
+      "learning_rate": 0.0002239189189189189,
+      "loss": 0.474,
+      "step": 1699
+    },
+    {
+      "epoch": 0.5096496158890763,
+      "grad_norm": 0.2500532567501068,
+      "learning_rate": 0.00022387387387387385,
+      "loss": 0.4223,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5099494097807757,
+      "grad_norm": 0.25324761867523193,
+      "learning_rate": 0.0002238288288288288,
+      "loss": 0.452,
+      "step": 1701
+    },
+    {
+      "epoch": 0.5102492036724752,
+      "grad_norm": 0.26007258892059326,
+      "learning_rate": 0.00022378378378378377,
+      "loss": 0.4516,
+      "step": 1702
+    },
+    {
+      "epoch": 0.5105489975641746,
+      "grad_norm": 0.2657194435596466,
+      "learning_rate": 0.0002237387387387387,
+      "loss": 0.4663,
+      "step": 1703
+    },
+    {
+      "epoch": 0.5108487914558741,
+      "grad_norm": 0.28216373920440674,
+      "learning_rate": 0.00022369369369369368,
+      "loss": 0.4341,
+      "step": 1704
+    },
+    {
+      "epoch": 0.5111485853475736,
+      "grad_norm": 0.2571386992931366,
+      "learning_rate": 0.00022364864864864864,
+      "loss": 0.4532,
+      "step": 1705
+    },
+    {
+      "epoch": 0.511448379239273,
+      "grad_norm": 0.27189430594444275,
+      "learning_rate": 0.00022360360360360358,
+      "loss": 0.4593,
+      "step": 1706
+    },
+    {
+      "epoch": 0.5117481731309724,
+      "grad_norm": 0.2536429166793823,
+      "learning_rate": 0.00022355855855855854,
+      "loss": 0.4603,
+      "step": 1707
+    },
+    {
+      "epoch": 0.512047967022672,
+      "grad_norm": 0.2552615702152252,
+      "learning_rate": 0.0002235135135135135,
+      "loss": 0.4265,
+      "step": 1708
+    },
+    {
+      "epoch": 0.5123477609143714,
+      "grad_norm": 0.24926409125328064,
+      "learning_rate": 0.00022346846846846844,
+      "loss": 0.444,
+      "step": 1709
+    },
+    {
+      "epoch": 0.5126475548060708,
+      "grad_norm": 0.27449071407318115,
+      "learning_rate": 0.0002234234234234234,
+      "loss": 0.4605,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5129473486977703,
+      "grad_norm": 0.255071222782135,
+      "learning_rate": 0.00022337837837837837,
+      "loss": 0.4409,
+      "step": 1711
+    },
+    {
+      "epoch": 0.5132471425894697,
+      "grad_norm": 0.2560432553291321,
+      "learning_rate": 0.00022333333333333333,
+      "loss": 0.467,
+      "step": 1712
+    },
+    {
+      "epoch": 0.5135469364811692,
+      "grad_norm": 0.2458151876926422,
+      "learning_rate": 0.00022328828828828827,
+      "loss": 0.4626,
+      "step": 1713
+    },
+    {
+      "epoch": 0.5138467303728687,
+      "grad_norm": 0.2361784130334854,
+      "learning_rate": 0.0002232432432432432,
+      "loss": 0.4305,
+      "step": 1714
+    },
+    {
+      "epoch": 0.5141465242645681,
+      "grad_norm": 0.25834766030311584,
+      "learning_rate": 0.0002231981981981982,
+      "loss": 0.4569,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5144463181562675,
+      "grad_norm": 0.253662109375,
+      "learning_rate": 0.00022315315315315313,
+      "loss": 0.4289,
+      "step": 1716
+    },
+    {
+      "epoch": 0.514746112047967,
+      "grad_norm": 0.2442713975906372,
+      "learning_rate": 0.00022310810810810807,
+      "loss": 0.4356,
+      "step": 1717
+    },
+    {
+      "epoch": 0.5150459059396665,
+      "grad_norm": 0.25813472270965576,
+      "learning_rate": 0.00022306306306306306,
+      "loss": 0.4721,
+      "step": 1718
+    },
+    {
+      "epoch": 0.5153456998313659,
+      "grad_norm": 0.269927054643631,
+      "learning_rate": 0.000223018018018018,
+      "loss": 0.4603,
+      "step": 1719
+    },
+    {
+      "epoch": 0.5156454937230653,
+      "grad_norm": 0.25108182430267334,
+      "learning_rate": 0.00022297297297297293,
+      "loss": 0.4611,
+      "step": 1720
+    },
+    {
+      "epoch": 0.5159452876147649,
+      "grad_norm": 0.23553571105003357,
+      "learning_rate": 0.00022292792792792793,
+      "loss": 0.431,
+      "step": 1721
+    },
+    {
+      "epoch": 0.5162450815064643,
+      "grad_norm": 0.2411264032125473,
+      "learning_rate": 0.00022288288288288286,
+      "loss": 0.4405,
+      "step": 1722
+    },
+    {
+      "epoch": 0.5165448753981637,
+      "grad_norm": 0.24999505281448364,
+      "learning_rate": 0.0002228378378378378,
+      "loss": 0.4487,
+      "step": 1723
+    },
+    {
+      "epoch": 0.5168446692898632,
+      "grad_norm": 0.23619996011257172,
+      "learning_rate": 0.0002227927927927928,
+      "loss": 0.4399,
+      "step": 1724
+    },
+    {
+      "epoch": 0.5171444631815627,
+      "grad_norm": 0.23623579740524292,
+      "learning_rate": 0.00022274774774774773,
+      "loss": 0.4358,
+      "step": 1725
+    },
+    {
+      "epoch": 0.5174442570732621,
+      "grad_norm": 0.2538294494152069,
+      "learning_rate": 0.00022270270270270266,
+      "loss": 0.4715,
+      "step": 1726
+    },
+    {
+      "epoch": 0.5177440509649616,
+      "grad_norm": 0.23572376370429993,
+      "learning_rate": 0.00022265765765765765,
+      "loss": 0.4529,
+      "step": 1727
+    },
+    {
+      "epoch": 0.518043844856661,
+      "grad_norm": 0.25421515107154846,
+      "learning_rate": 0.0002226126126126126,
+      "loss": 0.4427,
+      "step": 1728
+    },
+    {
+      "epoch": 0.5183436387483605,
+      "grad_norm": 0.23936650156974792,
+      "learning_rate": 0.00022256756756756753,
+      "loss": 0.4416,
+      "step": 1729
+    },
+    {
+      "epoch": 0.51864343264006,
+      "grad_norm": 0.26791784167289734,
+      "learning_rate": 0.00022252252252252252,
+      "loss": 0.4697,
+      "step": 1730
+    },
+    {
+      "epoch": 0.5189432265317594,
+      "grad_norm": 0.2541804015636444,
+      "learning_rate": 0.00022247747747747746,
+      "loss": 0.4801,
+      "step": 1731
+    },
+    {
+      "epoch": 0.5192430204234588,
+      "grad_norm": 0.24337895214557648,
+      "learning_rate": 0.0002224324324324324,
+      "loss": 0.4316,
+      "step": 1732
+    },
+    {
+      "epoch": 0.5195428143151584,
+      "grad_norm": 0.2545047998428345,
+      "learning_rate": 0.00022238738738738738,
+      "loss": 0.4465,
+      "step": 1733
+    },
+    {
+      "epoch": 0.5198426082068578,
+      "grad_norm": 0.24640010297298431,
+      "learning_rate": 0.00022234234234234232,
+      "loss": 0.4574,
+      "step": 1734
+    },
+    {
+      "epoch": 0.5201424020985572,
+      "grad_norm": 0.2528793215751648,
+      "learning_rate": 0.00022229729729729728,
+      "loss": 0.4423,
+      "step": 1735
+    },
+    {
+      "epoch": 0.5204421959902567,
+      "grad_norm": 0.24697841703891754,
+      "learning_rate": 0.00022225225225225222,
+      "loss": 0.4259,
+      "step": 1736
+    },
+    {
+      "epoch": 0.5207419898819562,
+      "grad_norm": 0.24195986986160278,
+      "learning_rate": 0.00022220720720720718,
+      "loss": 0.439,
+      "step": 1737
+    },
+    {
+      "epoch": 0.5210417837736556,
+      "grad_norm": 0.2523336410522461,
+      "learning_rate": 0.00022216216216216215,
+      "loss": 0.4441,
+      "step": 1738
+    },
+    {
+      "epoch": 0.521341577665355,
+      "grad_norm": 0.22917968034744263,
+      "learning_rate": 0.00022211711711711709,
+      "loss": 0.4152,
+      "step": 1739
+    },
+    {
+      "epoch": 0.5216413715570545,
+      "grad_norm": 0.2633557915687561,
+      "learning_rate": 0.00022207207207207205,
+      "loss": 0.4593,
+      "step": 1740
+    },
+    {
+      "epoch": 0.521941165448754,
+      "grad_norm": 0.23756282031536102,
+      "learning_rate": 0.000222027027027027,
+      "loss": 0.4244,
+      "step": 1741
+    },
+    {
+      "epoch": 0.5222409593404534,
+      "grad_norm": 0.24200837314128876,
+      "learning_rate": 0.00022198198198198195,
+      "loss": 0.4056,
+      "step": 1742
+    },
+    {
+      "epoch": 0.5225407532321529,
+      "grad_norm": 0.2658364474773407,
+      "learning_rate": 0.00022193693693693691,
+      "loss": 0.4432,
+      "step": 1743
+    },
+    {
+      "epoch": 0.5228405471238523,
+      "grad_norm": 0.2403399795293808,
+      "learning_rate": 0.00022189189189189188,
+      "loss": 0.4395,
+      "step": 1744
+    },
+    {
+      "epoch": 0.5231403410155518,
+      "grad_norm": 0.23752082884311676,
+      "learning_rate": 0.00022184684684684681,
+      "loss": 0.4284,
+      "step": 1745
+    },
+    {
+      "epoch": 0.5234401349072513,
+      "grad_norm": 0.2684466540813446,
+      "learning_rate": 0.0002218018018018018,
+      "loss": 0.4266,
+      "step": 1746
+    },
+    {
+      "epoch": 0.5237399287989507,
+      "grad_norm": 0.24474304914474487,
+      "learning_rate": 0.00022175675675675674,
+      "loss": 0.4472,
+      "step": 1747
+    },
+    {
+      "epoch": 0.5240397226906501,
+      "grad_norm": 0.24958543479442596,
+      "learning_rate": 0.00022171171171171168,
+      "loss": 0.4253,
+      "step": 1748
+    },
+    {
+      "epoch": 0.5243395165823497,
+      "grad_norm": 0.25974011421203613,
+      "learning_rate": 0.00022166666666666667,
+      "loss": 0.4477,
+      "step": 1749
+    },
+    {
+      "epoch": 0.5246393104740491,
+      "grad_norm": 0.23762384057044983,
+      "learning_rate": 0.0002216216216216216,
+      "loss": 0.425,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5249391043657485,
+      "grad_norm": 0.2543089687824249,
+      "learning_rate": 0.00022157657657657654,
+      "loss": 0.3947,
+      "step": 1751
+    },
+    {
+      "epoch": 0.525238898257448,
+      "grad_norm": 0.26260843873023987,
+      "learning_rate": 0.00022153153153153153,
+      "loss": 0.4642,
+      "step": 1752
+    },
+    {
+      "epoch": 0.5255386921491475,
+      "grad_norm": 0.2474740445613861,
+      "learning_rate": 0.00022148648648648647,
+      "loss": 0.4468,
+      "step": 1753
+    },
+    {
+      "epoch": 0.5258384860408469,
+      "grad_norm": 0.24307985603809357,
+      "learning_rate": 0.0002214414414414414,
+      "loss": 0.4537,
+      "step": 1754
+    },
+    {
+      "epoch": 0.5261382799325464,
+      "grad_norm": 0.27346494793891907,
+      "learning_rate": 0.0002213963963963964,
+      "loss": 0.4789,
+      "step": 1755
+    },
+    {
+      "epoch": 0.5264380738242458,
+      "grad_norm": 0.2407042384147644,
+      "learning_rate": 0.00022135135135135134,
+      "loss": 0.4334,
+      "step": 1756
+    },
+    {
+      "epoch": 0.5267378677159453,
+      "grad_norm": 0.2311742603778839,
+      "learning_rate": 0.00022130630630630627,
+      "loss": 0.4117,
+      "step": 1757
+    },
+    {
+      "epoch": 0.5270376616076448,
+      "grad_norm": 0.23060280084609985,
+      "learning_rate": 0.00022126126126126126,
+      "loss": 0.4413,
+      "step": 1758
+    },
+    {
+      "epoch": 0.5273374554993442,
+      "grad_norm": 0.286851167678833,
+      "learning_rate": 0.0002212162162162162,
+      "loss": 0.461,
+      "step": 1759
+    },
+    {
+      "epoch": 0.5276372493910436,
+      "grad_norm": 0.23764149844646454,
+      "learning_rate": 0.00022117117117117114,
+      "loss": 0.4428,
+      "step": 1760
+    },
+    {
+      "epoch": 0.5279370432827432,
+      "grad_norm": 0.24021115899085999,
+      "learning_rate": 0.0002211261261261261,
+      "loss": 0.4416,
+      "step": 1761
+    },
+    {
+      "epoch": 0.5282368371744426,
+      "grad_norm": 0.25310957431793213,
+      "learning_rate": 0.00022108108108108106,
+      "loss": 0.4474,
+      "step": 1762
+    },
+    {
+      "epoch": 0.528536631066142,
+      "grad_norm": 0.2636144161224365,
+      "learning_rate": 0.000221036036036036,
+      "loss": 0.4629,
+      "step": 1763
+    },
+    {
+      "epoch": 0.5288364249578414,
+      "grad_norm": 0.254807710647583,
+      "learning_rate": 0.00022099099099099097,
+      "loss": 0.4558,
+      "step": 1764
+    },
+    {
+      "epoch": 0.529136218849541,
+      "grad_norm": 0.2389029860496521,
+      "learning_rate": 0.00022094594594594593,
+      "loss": 0.4519,
+      "step": 1765
+    },
+    {
+      "epoch": 0.5294360127412404,
+      "grad_norm": 0.24269163608551025,
+      "learning_rate": 0.00022090090090090087,
+      "loss": 0.4371,
+      "step": 1766
+    },
+    {
+      "epoch": 0.5297358066329398,
+      "grad_norm": 0.2602348029613495,
+      "learning_rate": 0.00022085585585585583,
+      "loss": 0.4474,
+      "step": 1767
+    },
+    {
+      "epoch": 0.5300356005246393,
+      "grad_norm": 0.2557549774646759,
+      "learning_rate": 0.0002208108108108108,
+      "loss": 0.4709,
+      "step": 1768
+    },
+    {
+      "epoch": 0.5303353944163388,
+      "grad_norm": 0.27289271354675293,
+      "learning_rate": 0.00022076576576576576,
+      "loss": 0.4248,
+      "step": 1769
+    },
+    {
+      "epoch": 0.5306351883080382,
+      "grad_norm": 0.27305862307548523,
+      "learning_rate": 0.0002207207207207207,
+      "loss": 0.4566,
+      "step": 1770
+    },
+    {
+      "epoch": 0.5309349821997377,
+      "grad_norm": 0.2508034408092499,
+      "learning_rate": 0.00022067567567567566,
+      "loss": 0.4334,
+      "step": 1771
+    },
+    {
+      "epoch": 0.5312347760914371,
+      "grad_norm": 0.24729134142398834,
+      "learning_rate": 0.00022063063063063062,
+      "loss": 0.4251,
+      "step": 1772
+    },
+    {
+      "epoch": 0.5315345699831366,
+      "grad_norm": 0.23145246505737305,
+      "learning_rate": 0.00022058558558558556,
+      "loss": 0.3992,
+      "step": 1773
+    },
+    {
+      "epoch": 0.5318343638748361,
+      "grad_norm": 0.24190931022167206,
+      "learning_rate": 0.00022054054054054052,
+      "loss": 0.4183,
+      "step": 1774
+    },
+    {
+      "epoch": 0.5321341577665355,
+      "grad_norm": 0.2671104073524475,
+      "learning_rate": 0.0002204954954954955,
+      "loss": 0.4548,
+      "step": 1775
+    },
+    {
+      "epoch": 0.5324339516582349,
+      "grad_norm": 0.2361837476491928,
+      "learning_rate": 0.00022045045045045042,
+      "loss": 0.414,
+      "step": 1776
+    },
+    {
+      "epoch": 0.5327337455499345,
+      "grad_norm": 0.25529375672340393,
+      "learning_rate": 0.0002204054054054054,
+      "loss": 0.4345,
+      "step": 1777
+    },
+    {
+      "epoch": 0.5330335394416339,
+      "grad_norm": 0.26258841156959534,
+      "learning_rate": 0.00022036036036036035,
+      "loss": 0.4522,
+      "step": 1778
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.24532164633274078,
+      "learning_rate": 0.0002203153153153153,
+      "loss": 0.4303,
+      "step": 1779
+    },
+    {
+      "epoch": 0.5336331272250328,
+      "grad_norm": 0.24562229216098785,
+      "learning_rate": 0.00022027027027027028,
+      "loss": 0.427,
+      "step": 1780
+    },
+    {
+      "epoch": 0.5339329211167323,
+      "grad_norm": 0.25493496656417847,
+      "learning_rate": 0.00022022522522522522,
+      "loss": 0.45,
+      "step": 1781
+    },
+    {
+      "epoch": 0.5342327150084317,
+      "grad_norm": 0.25944793224334717,
+      "learning_rate": 0.00022018018018018015,
+      "loss": 0.449,
+      "step": 1782
+    },
+    {
+      "epoch": 0.5345325089001312,
+      "grad_norm": 0.24805709719657898,
+      "learning_rate": 0.00022013513513513514,
+      "loss": 0.4431,
+      "step": 1783
+    },
+    {
+      "epoch": 0.5348323027918306,
+      "grad_norm": 0.27083903551101685,
+      "learning_rate": 0.00022009009009009008,
+      "loss": 0.465,
+      "step": 1784
+    },
+    {
+      "epoch": 0.5351320966835301,
+      "grad_norm": 0.2876584827899933,
+      "learning_rate": 0.00022004504504504502,
+      "loss": 0.4859,
+      "step": 1785
+    },
+    {
+      "epoch": 0.5354318905752296,
+      "grad_norm": 0.2499544471502304,
+      "learning_rate": 0.00021999999999999995,
+      "loss": 0.4551,
+      "step": 1786
+    },
+    {
+      "epoch": 0.535731684466929,
+      "grad_norm": 0.2812490165233612,
+      "learning_rate": 0.00021995495495495494,
+      "loss": 0.4554,
+      "step": 1787
+    },
+    {
+      "epoch": 0.5360314783586284,
+      "grad_norm": 0.2530219256877899,
+      "learning_rate": 0.00021990990990990988,
+      "loss": 0.4335,
+      "step": 1788
+    },
+    {
+      "epoch": 0.536331272250328,
+      "grad_norm": 0.24573327600955963,
+      "learning_rate": 0.00021986486486486482,
+      "loss": 0.4574,
+      "step": 1789
+    },
+    {
+      "epoch": 0.5366310661420274,
+      "grad_norm": 0.3053630292415619,
+      "learning_rate": 0.0002198198198198198,
+      "loss": 0.4745,
+      "step": 1790
+    },
+    {
+      "epoch": 0.5369308600337268,
+      "grad_norm": 0.24176299571990967,
+      "learning_rate": 0.00021977477477477475,
+      "loss": 0.4244,
+      "step": 1791
+    },
+    {
+      "epoch": 0.5372306539254262,
+      "grad_norm": 0.27554550766944885,
+      "learning_rate": 0.0002197297297297297,
+      "loss": 0.469,
+      "step": 1792
+    },
+    {
+      "epoch": 0.5375304478171258,
+      "grad_norm": 0.28003454208374023,
+      "learning_rate": 0.00021968468468468467,
+      "loss": 0.4477,
+      "step": 1793
+    },
+    {
+      "epoch": 0.5378302417088252,
+      "grad_norm": 0.27388301491737366,
+      "learning_rate": 0.0002196396396396396,
+      "loss": 0.4602,
+      "step": 1794
+    },
+    {
+      "epoch": 0.5381300356005246,
+      "grad_norm": 0.2521568238735199,
+      "learning_rate": 0.00021959459459459457,
+      "loss": 0.4507,
+      "step": 1795
+    },
+    {
+      "epoch": 0.5384298294922241,
+      "grad_norm": 0.24554435908794403,
+      "learning_rate": 0.00021954954954954954,
+      "loss": 0.4387,
+      "step": 1796
+    },
+    {
+      "epoch": 0.5387296233839236,
+      "grad_norm": 0.24148909747600555,
+      "learning_rate": 0.00021950450450450447,
+      "loss": 0.4316,
+      "step": 1797
+    },
+    {
+      "epoch": 0.539029417275623,
+      "grad_norm": 0.24902650713920593,
+      "learning_rate": 0.00021945945945945944,
+      "loss": 0.448,
+      "step": 1798
+    },
+    {
+      "epoch": 0.5393292111673225,
+      "grad_norm": 0.25397536158561707,
+      "learning_rate": 0.0002194144144144144,
+      "loss": 0.479,
+      "step": 1799
+    },
+    {
+      "epoch": 0.5396290050590219,
+      "grad_norm": 0.2324562668800354,
+      "learning_rate": 0.00021936936936936934,
+      "loss": 0.4077,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5399287989507214,
+      "grad_norm": 0.26509541273117065,
+      "learning_rate": 0.0002193243243243243,
+      "loss": 0.4628,
+      "step": 1801
+    },
+    {
+      "epoch": 0.5402285928424209,
+      "grad_norm": 0.24714629352092743,
+      "learning_rate": 0.00021927927927927927,
+      "loss": 0.4393,
+      "step": 1802
+    },
+    {
+      "epoch": 0.5405283867341203,
+      "grad_norm": 0.2634679079055786,
+      "learning_rate": 0.00021923423423423423,
+      "loss": 0.4508,
+      "step": 1803
+    },
+    {
+      "epoch": 0.5408281806258197,
+      "grad_norm": 0.2673392593860626,
+      "learning_rate": 0.00021918918918918917,
+      "loss": 0.4587,
+      "step": 1804
+    },
+    {
+      "epoch": 0.5411279745175192,
+      "grad_norm": 0.2438841462135315,
+      "learning_rate": 0.00021914414414414413,
+      "loss": 0.4271,
+      "step": 1805
+    },
+    {
+      "epoch": 0.5414277684092187,
+      "grad_norm": 0.2463088482618332,
+      "learning_rate": 0.0002190990990990991,
+      "loss": 0.4216,
+      "step": 1806
+    },
+    {
+      "epoch": 0.5417275623009181,
+      "grad_norm": 0.22443120181560516,
+      "learning_rate": 0.00021905405405405403,
+      "loss": 0.3935,
+      "step": 1807
+    },
+    {
+      "epoch": 0.5420273561926175,
+      "grad_norm": 0.2647371292114258,
+      "learning_rate": 0.00021900900900900897,
+      "loss": 0.4474,
+      "step": 1808
+    },
+    {
+      "epoch": 0.542327150084317,
+      "grad_norm": 0.2505868077278137,
+      "learning_rate": 0.00021896396396396396,
+      "loss": 0.4623,
+      "step": 1809
+    },
+    {
+      "epoch": 0.5426269439760165,
+      "grad_norm": 0.2395186722278595,
+      "learning_rate": 0.0002189189189189189,
+      "loss": 0.4469,
+      "step": 1810
+    },
+    {
+      "epoch": 0.542926737867716,
+      "grad_norm": 0.2345222383737564,
+      "learning_rate": 0.00021887387387387383,
+      "loss": 0.4127,
+      "step": 1811
+    },
+    {
+      "epoch": 0.5432265317594154,
+      "grad_norm": 0.25592970848083496,
+      "learning_rate": 0.00021882882882882882,
+      "loss": 0.4642,
+      "step": 1812
+    },
+    {
+      "epoch": 0.5435263256511148,
+      "grad_norm": 0.2604975700378418,
+      "learning_rate": 0.00021878378378378376,
+      "loss": 0.461,
+      "step": 1813
+    },
+    {
+      "epoch": 0.5438261195428143,
+      "grad_norm": 0.2413945347070694,
+      "learning_rate": 0.0002187387387387387,
+      "loss": 0.4281,
+      "step": 1814
+    },
+    {
+      "epoch": 0.5441259134345138,
+      "grad_norm": 0.2543574571609497,
+      "learning_rate": 0.0002186936936936937,
+      "loss": 0.4657,
+      "step": 1815
+    },
+    {
+      "epoch": 0.5444257073262132,
+      "grad_norm": 0.2783868610858917,
+      "learning_rate": 0.00021864864864864863,
+      "loss": 0.4917,
+      "step": 1816
+    },
+    {
+      "epoch": 0.5447255012179126,
+      "grad_norm": 0.23379693925380707,
+      "learning_rate": 0.00021860360360360356,
+      "loss": 0.4324,
+      "step": 1817
+    },
+    {
+      "epoch": 0.5450252951096122,
+      "grad_norm": 0.22307537496089935,
+      "learning_rate": 0.00021855855855855855,
+      "loss": 0.4175,
+      "step": 1818
+    },
+    {
+      "epoch": 0.5453250890013116,
+      "grad_norm": 0.23716312646865845,
+      "learning_rate": 0.0002185135135135135,
+      "loss": 0.4195,
+      "step": 1819
+    },
+    {
+      "epoch": 0.545624882893011,
+      "grad_norm": 0.2469712346792221,
+      "learning_rate": 0.00021846846846846843,
+      "loss": 0.4386,
+      "step": 1820
+    },
+    {
+      "epoch": 0.5459246767847105,
+      "grad_norm": 0.24840980768203735,
+      "learning_rate": 0.00021842342342342342,
+      "loss": 0.4193,
+      "step": 1821
+    },
+    {
+      "epoch": 0.54622447067641,
+      "grad_norm": 0.25362369418144226,
+      "learning_rate": 0.00021837837837837835,
+      "loss": 0.445,
+      "step": 1822
+    },
+    {
+      "epoch": 0.5465242645681094,
+      "grad_norm": 0.23812243342399597,
+      "learning_rate": 0.0002183333333333333,
+      "loss": 0.4511,
+      "step": 1823
+    },
+    {
+      "epoch": 0.5468240584598089,
+      "grad_norm": 0.25310468673706055,
+      "learning_rate": 0.00021828828828828828,
+      "loss": 0.4463,
+      "step": 1824
+    },
+    {
+      "epoch": 0.5471238523515083,
+      "grad_norm": 0.25943437218666077,
+      "learning_rate": 0.00021824324324324322,
+      "loss": 0.42,
+      "step": 1825
+    },
+    {
+      "epoch": 0.5474236462432078,
+      "grad_norm": 0.270178884267807,
+      "learning_rate": 0.00021819819819819818,
+      "loss": 0.4666,
+      "step": 1826
+    },
+    {
+      "epoch": 0.5477234401349073,
+      "grad_norm": 0.23301197588443756,
+      "learning_rate": 0.00021815315315315315,
+      "loss": 0.407,
+      "step": 1827
+    },
+    {
+      "epoch": 0.5480232340266067,
+      "grad_norm": 0.2634783983230591,
+      "learning_rate": 0.00021810810810810808,
+      "loss": 0.4596,
+      "step": 1828
+    },
+    {
+      "epoch": 0.5483230279183061,
+      "grad_norm": 0.24297159910202026,
+      "learning_rate": 0.00021806306306306305,
+      "loss": 0.4107,
+      "step": 1829
+    },
+    {
+      "epoch": 0.5486228218100057,
+      "grad_norm": 0.2781638503074646,
+      "learning_rate": 0.000218018018018018,
+      "loss": 0.4686,
+      "step": 1830
+    },
+    {
+      "epoch": 0.5489226157017051,
+      "grad_norm": 0.25801247358322144,
+      "learning_rate": 0.00021797297297297295,
+      "loss": 0.4644,
+      "step": 1831
+    },
+    {
+      "epoch": 0.5492224095934045,
+      "grad_norm": 0.27080485224723816,
+      "learning_rate": 0.0002179279279279279,
+      "loss": 0.428,
+      "step": 1832
+    },
+    {
+      "epoch": 0.5495222034851039,
+      "grad_norm": 0.2617061138153076,
+      "learning_rate": 0.00021788288288288285,
+      "loss": 0.4437,
+      "step": 1833
+    },
+    {
+      "epoch": 0.5498219973768035,
+      "grad_norm": 0.26249265670776367,
+      "learning_rate": 0.0002178378378378378,
+      "loss": 0.4413,
+      "step": 1834
+    },
+    {
+      "epoch": 0.5501217912685029,
+      "grad_norm": 0.2705504894256592,
+      "learning_rate": 0.00021779279279279278,
+      "loss": 0.4716,
+      "step": 1835
+    },
+    {
+      "epoch": 0.5504215851602023,
+      "grad_norm": 0.25836360454559326,
+      "learning_rate": 0.0002177477477477477,
+      "loss": 0.454,
+      "step": 1836
+    },
+    {
+      "epoch": 0.5507213790519018,
+      "grad_norm": 0.23872222006320953,
+      "learning_rate": 0.0002177027027027027,
+      "loss": 0.4173,
+      "step": 1837
+    },
+    {
+      "epoch": 0.5510211729436013,
+      "grad_norm": 0.2438534051179886,
+      "learning_rate": 0.00021765765765765764,
+      "loss": 0.4114,
+      "step": 1838
+    },
+    {
+      "epoch": 0.5513209668353007,
+      "grad_norm": 0.2600691318511963,
+      "learning_rate": 0.00021761261261261258,
+      "loss": 0.4364,
+      "step": 1839
+    },
+    {
+      "epoch": 0.5516207607270002,
+      "grad_norm": 0.2483833283185959,
+      "learning_rate": 0.00021756756756756757,
+      "loss": 0.4544,
+      "step": 1840
+    },
+    {
+      "epoch": 0.5519205546186996,
+      "grad_norm": 0.2506955564022064,
+      "learning_rate": 0.0002175225225225225,
+      "loss": 0.4561,
+      "step": 1841
+    },
+    {
+      "epoch": 0.5522203485103991,
+      "grad_norm": 0.2800386846065521,
+      "learning_rate": 0.00021747747747747744,
+      "loss": 0.438,
+      "step": 1842
+    },
+    {
+      "epoch": 0.5525201424020986,
+      "grad_norm": 0.2458580881357193,
+      "learning_rate": 0.00021743243243243243,
+      "loss": 0.4431,
+      "step": 1843
+    },
+    {
+      "epoch": 0.552819936293798,
+      "grad_norm": 0.24995484948158264,
+      "learning_rate": 0.00021738738738738737,
+      "loss": 0.4349,
+      "step": 1844
+    },
+    {
+      "epoch": 0.5531197301854974,
+      "grad_norm": 0.2502653896808624,
+      "learning_rate": 0.0002173423423423423,
+      "loss": 0.4366,
+      "step": 1845
+    },
+    {
+      "epoch": 0.553419524077197,
+      "grad_norm": 0.2538207471370697,
+      "learning_rate": 0.0002172972972972973,
+      "loss": 0.4381,
+      "step": 1846
+    },
+    {
+      "epoch": 0.5537193179688964,
+      "grad_norm": 0.2417684644460678,
+      "learning_rate": 0.00021725225225225223,
+      "loss": 0.4277,
+      "step": 1847
+    },
+    {
+      "epoch": 0.5540191118605958,
+      "grad_norm": 0.2531186044216156,
+      "learning_rate": 0.00021720720720720717,
+      "loss": 0.4361,
+      "step": 1848
+    },
+    {
+      "epoch": 0.5543189057522953,
+      "grad_norm": 0.2551022171974182,
+      "learning_rate": 0.00021716216216216216,
+      "loss": 0.4483,
+      "step": 1849
+    },
+    {
+      "epoch": 0.5546186996439948,
+      "grad_norm": 0.23375505208969116,
+      "learning_rate": 0.0002171171171171171,
+      "loss": 0.4157,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5549184935356942,
+      "grad_norm": 0.23428985476493835,
+      "learning_rate": 0.00021707207207207204,
+      "loss": 0.4092,
+      "step": 1851
+    },
+    {
+      "epoch": 0.5552182874273937,
+      "grad_norm": 0.24895919859409332,
+      "learning_rate": 0.00021702702702702703,
+      "loss": 0.4377,
+      "step": 1852
+    },
+    {
+      "epoch": 0.5555180813190931,
+      "grad_norm": 0.23306754231452942,
+      "learning_rate": 0.00021698198198198196,
+      "loss": 0.4316,
+      "step": 1853
+    },
+    {
+      "epoch": 0.5558178752107926,
+      "grad_norm": 0.2527741491794586,
+      "learning_rate": 0.0002169369369369369,
+      "loss": 0.4245,
+      "step": 1854
+    },
+    {
+      "epoch": 0.556117669102492,
+      "grad_norm": 0.2319868952035904,
+      "learning_rate": 0.0002168918918918919,
+      "loss": 0.4074,
+      "step": 1855
+    },
+    {
+      "epoch": 0.5564174629941915,
+      "grad_norm": 0.25132691860198975,
+      "learning_rate": 0.00021684684684684683,
+      "loss": 0.4378,
+      "step": 1856
+    },
+    {
+      "epoch": 0.5567172568858909,
+      "grad_norm": 0.24814215302467346,
+      "learning_rate": 0.00021680180180180177,
+      "loss": 0.4639,
+      "step": 1857
+    },
+    {
+      "epoch": 0.5570170507775905,
+      "grad_norm": 0.23483987152576447,
+      "learning_rate": 0.00021675675675675673,
+      "loss": 0.4419,
+      "step": 1858
+    },
+    {
+      "epoch": 0.5573168446692899,
+      "grad_norm": 0.24670268595218658,
+      "learning_rate": 0.0002167117117117117,
+      "loss": 0.436,
+      "step": 1859
+    },
+    {
+      "epoch": 0.5576166385609893,
+      "grad_norm": 0.23737584054470062,
+      "learning_rate": 0.00021666666666666666,
+      "loss": 0.4246,
+      "step": 1860
+    },
+    {
+      "epoch": 0.5579164324526887,
+      "grad_norm": 0.2634913921356201,
+      "learning_rate": 0.0002166216216216216,
+      "loss": 0.4252,
+      "step": 1861
+    },
+    {
+      "epoch": 0.5582162263443883,
+      "grad_norm": 0.25565293431282043,
+      "learning_rate": 0.00021657657657657656,
+      "loss": 0.4287,
+      "step": 1862
+    },
+    {
+      "epoch": 0.5585160202360877,
+      "grad_norm": 0.2408502846956253,
+      "learning_rate": 0.00021653153153153152,
+      "loss": 0.4259,
+      "step": 1863
+    },
+    {
+      "epoch": 0.5588158141277871,
+      "grad_norm": 0.2516701817512512,
+      "learning_rate": 0.00021648648648648646,
+      "loss": 0.4361,
+      "step": 1864
+    },
+    {
+      "epoch": 0.5591156080194866,
+      "grad_norm": 0.2645619213581085,
+      "learning_rate": 0.00021644144144144142,
+      "loss": 0.4336,
+      "step": 1865
+    },
+    {
+      "epoch": 0.5594154019111861,
+      "grad_norm": 0.27260729670524597,
+      "learning_rate": 0.00021639639639639639,
+      "loss": 0.4494,
+      "step": 1866
+    },
+    {
+      "epoch": 0.5597151958028855,
+      "grad_norm": 0.2506016492843628,
+      "learning_rate": 0.00021635135135135132,
+      "loss": 0.4027,
+      "step": 1867
+    },
+    {
+      "epoch": 0.560014989694585,
+      "grad_norm": 0.2552817463874817,
+      "learning_rate": 0.00021630630630630629,
+      "loss": 0.426,
+      "step": 1868
+    },
+    {
+      "epoch": 0.5603147835862844,
+      "grad_norm": 0.2529433071613312,
+      "learning_rate": 0.00021626126126126125,
+      "loss": 0.4327,
+      "step": 1869
+    },
+    {
+      "epoch": 0.5606145774779839,
+      "grad_norm": 0.2483612447977066,
+      "learning_rate": 0.0002162162162162162,
+      "loss": 0.4492,
+      "step": 1870
+    },
+    {
+      "epoch": 0.5609143713696834,
+      "grad_norm": 0.25298991799354553,
+      "learning_rate": 0.00021617117117117118,
+      "loss": 0.4442,
+      "step": 1871
+    },
+    {
+      "epoch": 0.5612141652613828,
+      "grad_norm": 0.243782639503479,
+      "learning_rate": 0.00021612612612612611,
+      "loss": 0.4462,
+      "step": 1872
+    },
+    {
+      "epoch": 0.5615139591530822,
+      "grad_norm": 0.24677562713623047,
+      "learning_rate": 0.00021608108108108105,
+      "loss": 0.433,
+      "step": 1873
+    },
+    {
+      "epoch": 0.5618137530447818,
+      "grad_norm": 0.2761828601360321,
+      "learning_rate": 0.00021603603603603604,
+      "loss": 0.4436,
+      "step": 1874
+    },
+    {
+      "epoch": 0.5621135469364812,
+      "grad_norm": 0.24887466430664062,
+      "learning_rate": 0.00021599099099099098,
+      "loss": 0.4211,
+      "step": 1875
+    },
+    {
+      "epoch": 0.5624133408281806,
+      "grad_norm": 0.2687954902648926,
+      "learning_rate": 0.00021594594594594592,
+      "loss": 0.4545,
+      "step": 1876
+    },
+    {
+      "epoch": 0.56271313471988,
+      "grad_norm": 0.2376958280801773,
+      "learning_rate": 0.0002159009009009009,
+      "loss": 0.4062,
+      "step": 1877
+    },
+    {
+      "epoch": 0.5630129286115796,
+      "grad_norm": 0.25497403740882874,
+      "learning_rate": 0.00021585585585585584,
+      "loss": 0.4512,
+      "step": 1878
+    },
+    {
+      "epoch": 0.563312722503279,
+      "grad_norm": 0.27644020318984985,
+      "learning_rate": 0.00021581081081081078,
+      "loss": 0.4317,
+      "step": 1879
+    },
+    {
+      "epoch": 0.5636125163949784,
+      "grad_norm": 0.274746298789978,
+      "learning_rate": 0.00021576576576576577,
+      "loss": 0.4523,
+      "step": 1880
+    },
+    {
+      "epoch": 0.5639123102866779,
+      "grad_norm": 0.24522797763347626,
+      "learning_rate": 0.0002157207207207207,
+      "loss": 0.4193,
+      "step": 1881
+    },
+    {
+      "epoch": 0.5642121041783774,
+      "grad_norm": 0.26614081859588623,
+      "learning_rate": 0.00021567567567567565,
+      "loss": 0.4474,
+      "step": 1882
+    },
+    {
+      "epoch": 0.5645118980700768,
+      "grad_norm": 0.25119736790657043,
+      "learning_rate": 0.0002156306306306306,
+      "loss": 0.4349,
+      "step": 1883
+    },
+    {
+      "epoch": 0.5648116919617763,
+      "grad_norm": 0.2535664737224579,
+      "learning_rate": 0.00021558558558558557,
+      "loss": 0.4395,
+      "step": 1884
+    },
+    {
+      "epoch": 0.5651114858534757,
+      "grad_norm": 0.27441659569740295,
+      "learning_rate": 0.0002155405405405405,
+      "loss": 0.4591,
+      "step": 1885
+    },
+    {
+      "epoch": 0.5654112797451752,
+      "grad_norm": 0.2535187900066376,
+      "learning_rate": 0.00021549549549549547,
+      "loss": 0.4256,
+      "step": 1886
+    },
+    {
+      "epoch": 0.5657110736368747,
+      "grad_norm": 0.2877647876739502,
+      "learning_rate": 0.00021545045045045044,
+      "loss": 0.4877,
+      "step": 1887
+    },
+    {
+      "epoch": 0.5660108675285741,
+      "grad_norm": 0.23665641248226166,
+      "learning_rate": 0.00021540540540540537,
+      "loss": 0.4506,
+      "step": 1888
+    },
+    {
+      "epoch": 0.5663106614202735,
+      "grad_norm": 0.2620941996574402,
+      "learning_rate": 0.00021536036036036034,
+      "loss": 0.4415,
+      "step": 1889
+    },
+    {
+      "epoch": 0.5666104553119731,
+      "grad_norm": 0.2704925239086151,
+      "learning_rate": 0.0002153153153153153,
+      "loss": 0.4855,
+      "step": 1890
+    },
+    {
+      "epoch": 0.5669102492036725,
+      "grad_norm": 0.2636096775531769,
+      "learning_rate": 0.00021527027027027024,
+      "loss": 0.4196,
+      "step": 1891
+    },
+    {
+      "epoch": 0.5672100430953719,
+      "grad_norm": 0.2897530496120453,
+      "learning_rate": 0.0002152252252252252,
+      "loss": 0.4556,
+      "step": 1892
+    },
+    {
+      "epoch": 0.5675098369870714,
+      "grad_norm": 0.2407667189836502,
+      "learning_rate": 0.00021518018018018017,
+      "loss": 0.4406,
+      "step": 1893
+    },
+    {
+      "epoch": 0.5678096308787709,
+      "grad_norm": 0.277865469455719,
+      "learning_rate": 0.00021513513513513513,
+      "loss": 0.4033,
+      "step": 1894
+    },
+    {
+      "epoch": 0.5681094247704703,
+      "grad_norm": 0.2540576159954071,
+      "learning_rate": 0.00021509009009009007,
+      "loss": 0.4355,
+      "step": 1895
+    },
+    {
+      "epoch": 0.5684092186621698,
+      "grad_norm": 0.26155397295951843,
+      "learning_rate": 0.00021504504504504503,
+      "loss": 0.4235,
+      "step": 1896
+    },
+    {
+      "epoch": 0.5687090125538692,
+      "grad_norm": 0.25544247031211853,
+      "learning_rate": 0.000215,
+      "loss": 0.4279,
+      "step": 1897
+    },
+    {
+      "epoch": 0.5690088064455687,
+      "grad_norm": 0.2406405806541443,
+      "learning_rate": 0.00021495495495495493,
+      "loss": 0.4208,
+      "step": 1898
+    },
+    {
+      "epoch": 0.5693086003372682,
+      "grad_norm": 0.24712449312210083,
+      "learning_rate": 0.0002149099099099099,
+      "loss": 0.4344,
+      "step": 1899
+    },
+    {
+      "epoch": 0.5696083942289676,
+      "grad_norm": 0.26752716302871704,
+      "learning_rate": 0.00021486486486486486,
+      "loss": 0.4934,
+      "step": 1900
+    },
+    {
+      "epoch": 0.569908188120667,
+      "grad_norm": 0.24652232229709625,
+      "learning_rate": 0.0002148198198198198,
+      "loss": 0.4546,
+      "step": 1901
+    },
+    {
+      "epoch": 0.5702079820123664,
+      "grad_norm": 0.251396507024765,
+      "learning_rate": 0.00021477477477477476,
+      "loss": 0.4738,
+      "step": 1902
+    },
+    {
+      "epoch": 0.570507775904066,
+      "grad_norm": 0.24406296014785767,
+      "learning_rate": 0.00021472972972972972,
+      "loss": 0.4536,
+      "step": 1903
+    },
+    {
+      "epoch": 0.5708075697957654,
+      "grad_norm": 0.2634785771369934,
+      "learning_rate": 0.00021468468468468466,
+      "loss": 0.4653,
+      "step": 1904
+    },
+    {
+      "epoch": 0.5711073636874648,
+      "grad_norm": 0.24010764062404633,
+      "learning_rate": 0.0002146396396396396,
+      "loss": 0.4188,
+      "step": 1905
+    },
+    {
+      "epoch": 0.5714071575791643,
+      "grad_norm": 0.24869616329669952,
+      "learning_rate": 0.0002145945945945946,
+      "loss": 0.4474,
+      "step": 1906
+    },
+    {
+      "epoch": 0.5717069514708638,
+      "grad_norm": 0.24654364585876465,
+      "learning_rate": 0.00021454954954954953,
+      "loss": 0.4409,
+      "step": 1907
+    },
+    {
+      "epoch": 0.5720067453625632,
+      "grad_norm": 0.29856958985328674,
+      "learning_rate": 0.00021450450450450446,
+      "loss": 0.4912,
+      "step": 1908
+    },
+    {
+      "epoch": 0.5723065392542627,
+      "grad_norm": 0.2449256181716919,
+      "learning_rate": 0.00021445945945945945,
+      "loss": 0.4501,
+      "step": 1909
+    },
+    {
+      "epoch": 0.5726063331459621,
+      "grad_norm": 0.29776662588119507,
+      "learning_rate": 0.0002144144144144144,
+      "loss": 0.4403,
+      "step": 1910
+    },
+    {
+      "epoch": 0.5729061270376616,
+      "grad_norm": 0.26075392961502075,
+      "learning_rate": 0.00021436936936936933,
+      "loss": 0.4325,
+      "step": 1911
+    },
+    {
+      "epoch": 0.5732059209293611,
+      "grad_norm": 0.23287932574748993,
+      "learning_rate": 0.00021432432432432432,
+      "loss": 0.4265,
+      "step": 1912
+    },
+    {
+      "epoch": 0.5735057148210605,
+      "grad_norm": 0.2624457776546478,
+      "learning_rate": 0.00021427927927927925,
+      "loss": 0.4433,
+      "step": 1913
+    },
+    {
+      "epoch": 0.5738055087127599,
+      "grad_norm": 0.23440878093242645,
+      "learning_rate": 0.0002142342342342342,
+      "loss": 0.4136,
+      "step": 1914
+    },
+    {
+      "epoch": 0.5741053026044595,
+      "grad_norm": 0.2694714069366455,
+      "learning_rate": 0.00021418918918918918,
+      "loss": 0.4508,
+      "step": 1915
+    },
+    {
+      "epoch": 0.5744050964961589,
+      "grad_norm": 0.261952668428421,
+      "learning_rate": 0.00021414414414414412,
+      "loss": 0.4697,
+      "step": 1916
+    },
+    {
+      "epoch": 0.5747048903878583,
+      "grad_norm": 0.2526634931564331,
+      "learning_rate": 0.00021409909909909908,
+      "loss": 0.4518,
+      "step": 1917
+    },
+    {
+      "epoch": 0.5750046842795578,
+      "grad_norm": 0.2554527223110199,
+      "learning_rate": 0.00021405405405405405,
+      "loss": 0.4802,
+      "step": 1918
+    },
+    {
+      "epoch": 0.5753044781712573,
+      "grad_norm": 0.2729927599430084,
+      "learning_rate": 0.00021400900900900898,
+      "loss": 0.4622,
+      "step": 1919
+    },
+    {
+      "epoch": 0.5756042720629567,
+      "grad_norm": 0.26347601413726807,
+      "learning_rate": 0.00021396396396396395,
+      "loss": 0.4709,
+      "step": 1920
+    },
+    {
+      "epoch": 0.5759040659546562,
+      "grad_norm": 0.27795466780662537,
+      "learning_rate": 0.0002139189189189189,
+      "loss": 0.4555,
+      "step": 1921
+    },
+    {
+      "epoch": 0.5762038598463556,
+      "grad_norm": 0.25687262415885925,
+      "learning_rate": 0.00021387387387387385,
+      "loss": 0.4459,
+      "step": 1922
+    },
+    {
+      "epoch": 0.5765036537380551,
+      "grad_norm": 0.24855007231235504,
+      "learning_rate": 0.0002138288288288288,
+      "loss": 0.3944,
+      "step": 1923
+    },
+    {
+      "epoch": 0.5768034476297546,
+      "grad_norm": 0.26772525906562805,
+      "learning_rate": 0.00021378378378378378,
+      "loss": 0.4557,
+      "step": 1924
+    },
+    {
+      "epoch": 0.577103241521454,
+      "grad_norm": 0.2783243954181671,
+      "learning_rate": 0.0002137387387387387,
+      "loss": 0.4507,
+      "step": 1925
+    },
+    {
+      "epoch": 0.5774030354131534,
+      "grad_norm": 0.26769769191741943,
+      "learning_rate": 0.00021369369369369368,
+      "loss": 0.4371,
+      "step": 1926
+    },
+    {
+      "epoch": 0.577702829304853,
+      "grad_norm": 0.2783718705177307,
+      "learning_rate": 0.00021364864864864864,
+      "loss": 0.4269,
+      "step": 1927
+    },
+    {
+      "epoch": 0.5780026231965524,
+      "grad_norm": 0.2654764652252197,
+      "learning_rate": 0.0002136036036036036,
+      "loss": 0.4407,
+      "step": 1928
+    },
+    {
+      "epoch": 0.5783024170882518,
+      "grad_norm": 0.2694533169269562,
+      "learning_rate": 0.00021355855855855854,
+      "loss": 0.4822,
+      "step": 1929
+    },
+    {
+      "epoch": 0.5786022109799512,
+      "grad_norm": 0.2403596192598343,
+      "learning_rate": 0.00021351351351351348,
+      "loss": 0.3985,
+      "step": 1930
+    },
+    {
+      "epoch": 0.5789020048716508,
+      "grad_norm": 0.26435697078704834,
+      "learning_rate": 0.00021346846846846847,
+      "loss": 0.4547,
+      "step": 1931
+    },
+    {
+      "epoch": 0.5792017987633502,
+      "grad_norm": 0.2416696548461914,
+      "learning_rate": 0.0002134234234234234,
+      "loss": 0.406,
+      "step": 1932
+    },
+    {
+      "epoch": 0.5795015926550496,
+      "grad_norm": 0.2664091885089874,
+      "learning_rate": 0.00021337837837837834,
+      "loss": 0.412,
+      "step": 1933
+    },
+    {
+      "epoch": 0.5798013865467491,
+      "grad_norm": 0.24384742975234985,
+      "learning_rate": 0.00021333333333333333,
+      "loss": 0.4621,
+      "step": 1934
+    },
+    {
+      "epoch": 0.5801011804384486,
+      "grad_norm": 0.27569761872291565,
+      "learning_rate": 0.00021328828828828827,
+      "loss": 0.4643,
+      "step": 1935
+    },
+    {
+      "epoch": 0.580400974330148,
+      "grad_norm": 0.2567230761051178,
+      "learning_rate": 0.0002132432432432432,
+      "loss": 0.4231,
+      "step": 1936
+    },
+    {
+      "epoch": 0.5807007682218475,
+      "grad_norm": 0.25014641880989075,
+      "learning_rate": 0.0002131981981981982,
+      "loss": 0.4461,
+      "step": 1937
+    },
+    {
+      "epoch": 0.5810005621135469,
+      "grad_norm": 0.2679264545440674,
+      "learning_rate": 0.00021315315315315313,
+      "loss": 0.4321,
+      "step": 1938
+    },
+    {
+      "epoch": 0.5813003560052464,
+      "grad_norm": 0.2585947513580322,
+      "learning_rate": 0.00021310810810810807,
+      "loss": 0.4486,
+      "step": 1939
+    },
+    {
+      "epoch": 0.5816001498969459,
+      "grad_norm": 0.2640276253223419,
+      "learning_rate": 0.00021306306306306306,
+      "loss": 0.4439,
+      "step": 1940
+    },
+    {
+      "epoch": 0.5818999437886453,
+      "grad_norm": 0.2556924521923065,
+      "learning_rate": 0.000213018018018018,
+      "loss": 0.4557,
+      "step": 1941
+    },
+    {
+      "epoch": 0.5821997376803447,
+      "grad_norm": 0.2560097873210907,
+      "learning_rate": 0.00021297297297297294,
+      "loss": 0.4384,
+      "step": 1942
+    },
+    {
+      "epoch": 0.5824995315720443,
+      "grad_norm": 0.25773030519485474,
+      "learning_rate": 0.00021292792792792793,
+      "loss": 0.4289,
+      "step": 1943
+    },
+    {
+      "epoch": 0.5827993254637437,
+      "grad_norm": 0.26476195454597473,
+      "learning_rate": 0.00021288288288288286,
+      "loss": 0.4513,
+      "step": 1944
+    },
+    {
+      "epoch": 0.5830991193554431,
+      "grad_norm": 0.26929306983947754,
+      "learning_rate": 0.0002128378378378378,
+      "loss": 0.4464,
+      "step": 1945
+    },
+    {
+      "epoch": 0.5833989132471425,
+      "grad_norm": 0.25713875889778137,
+      "learning_rate": 0.0002127927927927928,
+      "loss": 0.4422,
+      "step": 1946
+    },
+    {
+      "epoch": 0.5836987071388421,
+      "grad_norm": 0.24906396865844727,
+      "learning_rate": 0.00021274774774774773,
+      "loss": 0.4454,
+      "step": 1947
+    },
+    {
+      "epoch": 0.5839985010305415,
+      "grad_norm": 0.24760214984416962,
+      "learning_rate": 0.00021270270270270266,
+      "loss": 0.4343,
+      "step": 1948
+    },
+    {
+      "epoch": 0.584298294922241,
+      "grad_norm": 0.25164082646369934,
+      "learning_rate": 0.00021265765765765766,
+      "loss": 0.4455,
+      "step": 1949
+    },
+    {
+      "epoch": 0.5845980888139404,
+      "grad_norm": 0.24564093351364136,
+      "learning_rate": 0.0002126126126126126,
+      "loss": 0.4389,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5848978827056399,
+      "grad_norm": 0.2754795551300049,
+      "learning_rate": 0.00021256756756756756,
+      "loss": 0.466,
+      "step": 1951
+    },
+    {
+      "epoch": 0.5851976765973393,
+      "grad_norm": 0.2439223974943161,
+      "learning_rate": 0.00021252252252252252,
+      "loss": 0.4425,
+      "step": 1952
+    },
+    {
+      "epoch": 0.5854974704890388,
+      "grad_norm": 0.24320489168167114,
+      "learning_rate": 0.00021247747747747746,
+      "loss": 0.4085,
+      "step": 1953
+    },
+    {
+      "epoch": 0.5857972643807382,
+      "grad_norm": 0.2758175730705261,
+      "learning_rate": 0.00021243243243243242,
+      "loss": 0.4606,
+      "step": 1954
+    },
+    {
+      "epoch": 0.5860970582724377,
+      "grad_norm": 0.2623588442802429,
+      "learning_rate": 0.00021238738738738736,
+      "loss": 0.4167,
+      "step": 1955
+    },
+    {
+      "epoch": 0.5863968521641372,
+      "grad_norm": 0.2561275064945221,
+      "learning_rate": 0.00021234234234234232,
+      "loss": 0.4399,
+      "step": 1956
+    },
+    {
+      "epoch": 0.5866966460558366,
+      "grad_norm": 0.2818892300128937,
+      "learning_rate": 0.00021229729729729728,
+      "loss": 0.4628,
+      "step": 1957
+    },
+    {
+      "epoch": 0.586996439947536,
+      "grad_norm": 0.26609155535697937,
+      "learning_rate": 0.00021225225225225222,
+      "loss": 0.4285,
+      "step": 1958
+    },
+    {
+      "epoch": 0.5872962338392356,
+      "grad_norm": 0.2503769099712372,
+      "learning_rate": 0.00021220720720720719,
+      "loss": 0.4632,
+      "step": 1959
+    },
+    {
+      "epoch": 0.587596027730935,
+      "grad_norm": 0.2810426950454712,
+      "learning_rate": 0.00021216216216216215,
+      "loss": 0.4591,
+      "step": 1960
+    },
+    {
+      "epoch": 0.5878958216226344,
+      "grad_norm": 0.2517390251159668,
+      "learning_rate": 0.00021211711711711709,
+      "loss": 0.4323,
+      "step": 1961
+    },
+    {
+      "epoch": 0.5881956155143339,
+      "grad_norm": 0.26425543427467346,
+      "learning_rate": 0.00021207207207207205,
+      "loss": 0.4466,
+      "step": 1962
+    },
+    {
+      "epoch": 0.5884954094060334,
+      "grad_norm": 0.25192978978157043,
+      "learning_rate": 0.00021202702702702701,
+      "loss": 0.4524,
+      "step": 1963
+    },
+    {
+      "epoch": 0.5887952032977328,
+      "grad_norm": 0.26266035437583923,
+      "learning_rate": 0.00021198198198198195,
+      "loss": 0.4417,
+      "step": 1964
+    },
+    {
+      "epoch": 0.5890949971894323,
+      "grad_norm": 0.31183677911758423,
+      "learning_rate": 0.00021193693693693694,
+      "loss": 0.4473,
+      "step": 1965
+    },
+    {
+      "epoch": 0.5893947910811317,
+      "grad_norm": 0.24742548167705536,
+      "learning_rate": 0.00021189189189189188,
+      "loss": 0.4209,
+      "step": 1966
+    },
+    {
+      "epoch": 0.5896945849728312,
+      "grad_norm": 0.27282243967056274,
+      "learning_rate": 0.00021184684684684682,
+      "loss": 0.4316,
+      "step": 1967
+    },
+    {
+      "epoch": 0.5899943788645307,
+      "grad_norm": 0.2748444974422455,
+      "learning_rate": 0.0002118018018018018,
+      "loss": 0.4276,
+      "step": 1968
+    },
+    {
+      "epoch": 0.5902941727562301,
+      "grad_norm": 0.2746492624282837,
+      "learning_rate": 0.00021175675675675674,
+      "loss": 0.4597,
+      "step": 1969
+    },
+    {
+      "epoch": 0.5905939666479295,
+      "grad_norm": 0.25905507802963257,
+      "learning_rate": 0.00021171171171171168,
+      "loss": 0.4668,
+      "step": 1970
+    },
+    {
+      "epoch": 0.590893760539629,
+      "grad_norm": 0.2456134557723999,
+      "learning_rate": 0.00021166666666666667,
+      "loss": 0.4057,
+      "step": 1971
+    },
+    {
+      "epoch": 0.5911935544313285,
+      "grad_norm": 0.25221529603004456,
+      "learning_rate": 0.0002116216216216216,
+      "loss": 0.4427,
+      "step": 1972
+    },
+    {
+      "epoch": 0.5914933483230279,
+      "grad_norm": 0.24493472278118134,
+      "learning_rate": 0.00021157657657657654,
+      "loss": 0.4206,
+      "step": 1973
+    },
+    {
+      "epoch": 0.5917931422147273,
+      "grad_norm": 0.2561238408088684,
+      "learning_rate": 0.00021153153153153154,
+      "loss": 0.4537,
+      "step": 1974
+    },
+    {
+      "epoch": 0.5920929361064269,
+      "grad_norm": 0.2350313663482666,
+      "learning_rate": 0.00021148648648648647,
+      "loss": 0.4163,
+      "step": 1975
+    },
+    {
+      "epoch": 0.5923927299981263,
+      "grad_norm": 0.2535955309867859,
+      "learning_rate": 0.0002114414414414414,
+      "loss": 0.4435,
+      "step": 1976
+    },
+    {
+      "epoch": 0.5926925238898257,
+      "grad_norm": 0.2498706728219986,
+      "learning_rate": 0.00021139639639639637,
+      "loss": 0.4507,
+      "step": 1977
+    },
+    {
+      "epoch": 0.5929923177815252,
+      "grad_norm": 0.2510056495666504,
+      "learning_rate": 0.00021135135135135134,
+      "loss": 0.4268,
+      "step": 1978
+    },
+    {
+      "epoch": 0.5932921116732247,
+      "grad_norm": 0.252605676651001,
+      "learning_rate": 0.00021130630630630627,
+      "loss": 0.4457,
+      "step": 1979
+    },
+    {
+      "epoch": 0.5935919055649241,
+      "grad_norm": 0.2618269622325897,
+      "learning_rate": 0.00021126126126126124,
+      "loss": 0.4447,
+      "step": 1980
+    },
+    {
+      "epoch": 0.5938916994566236,
+      "grad_norm": 0.25301989912986755,
+      "learning_rate": 0.0002112162162162162,
+      "loss": 0.4468,
+      "step": 1981
+    },
+    {
+      "epoch": 0.594191493348323,
+      "grad_norm": 0.2434222400188446,
+      "learning_rate": 0.00021117117117117114,
+      "loss": 0.4348,
+      "step": 1982
+    },
+    {
+      "epoch": 0.5944912872400225,
+      "grad_norm": 0.24123191833496094,
+      "learning_rate": 0.0002111261261261261,
+      "loss": 0.4485,
+      "step": 1983
+    },
+    {
+      "epoch": 0.594791081131722,
+      "grad_norm": 0.2643167972564697,
+      "learning_rate": 0.00021108108108108107,
+      "loss": 0.4464,
+      "step": 1984
+    },
+    {
+      "epoch": 0.5950908750234214,
+      "grad_norm": 0.2503741979598999,
+      "learning_rate": 0.00021103603603603603,
+      "loss": 0.4247,
+      "step": 1985
+    },
+    {
+      "epoch": 0.5953906689151208,
+      "grad_norm": 0.23599058389663696,
+      "learning_rate": 0.00021099099099099097,
+      "loss": 0.4089,
+      "step": 1986
+    },
+    {
+      "epoch": 0.5956904628068204,
+      "grad_norm": 0.25657573342323303,
+      "learning_rate": 0.00021094594594594593,
+      "loss": 0.445,
+      "step": 1987
+    },
+    {
+      "epoch": 0.5959902566985198,
+      "grad_norm": 0.26530593633651733,
+      "learning_rate": 0.0002109009009009009,
+      "loss": 0.4386,
+      "step": 1988
+    },
+    {
+      "epoch": 0.5962900505902192,
+      "grad_norm": 0.2542692720890045,
+      "learning_rate": 0.00021085585585585583,
+      "loss": 0.4431,
+      "step": 1989
+    },
+    {
+      "epoch": 0.5965898444819187,
+      "grad_norm": 0.2595093548297882,
+      "learning_rate": 0.0002108108108108108,
+      "loss": 0.4243,
+      "step": 1990
+    },
+    {
+      "epoch": 0.5968896383736182,
+      "grad_norm": 0.2481933832168579,
+      "learning_rate": 0.00021076576576576576,
+      "loss": 0.4215,
+      "step": 1991
+    },
+    {
+      "epoch": 0.5971894322653176,
+      "grad_norm": 0.2607382535934448,
+      "learning_rate": 0.0002107207207207207,
+      "loss": 0.4505,
+      "step": 1992
+    },
+    {
+      "epoch": 0.597489226157017,
+      "grad_norm": 0.26596030592918396,
+      "learning_rate": 0.00021067567567567566,
+      "loss": 0.4703,
+      "step": 1993
+    },
+    {
+      "epoch": 0.5977890200487165,
+      "grad_norm": 0.24884359538555145,
+      "learning_rate": 0.00021063063063063062,
+      "loss": 0.4237,
+      "step": 1994
+    },
+    {
+      "epoch": 0.5980888139404159,
+      "grad_norm": 0.23510870337486267,
+      "learning_rate": 0.00021058558558558556,
+      "loss": 0.4235,
+      "step": 1995
+    },
+    {
+      "epoch": 0.5983886078321154,
+      "grad_norm": 0.25982344150543213,
+      "learning_rate": 0.00021054054054054052,
+      "loss": 0.4262,
+      "step": 1996
+    },
+    {
+      "epoch": 0.5986884017238149,
+      "grad_norm": 0.24918124079704285,
+      "learning_rate": 0.0002104954954954955,
+      "loss": 0.4225,
+      "step": 1997
+    },
+    {
+      "epoch": 0.5989881956155143,
+      "grad_norm": 0.26578548550605774,
+      "learning_rate": 0.00021045045045045042,
+      "loss": 0.4685,
+      "step": 1998
+    },
+    {
+      "epoch": 0.5992879895072137,
+      "grad_norm": 0.2682873010635376,
+      "learning_rate": 0.00021040540540540542,
+      "loss": 0.4291,
+      "step": 1999
+    },
+    {
+      "epoch": 0.5995877833989133,
+      "grad_norm": 0.247017040848732,
+      "learning_rate": 0.00021036036036036035,
+      "loss": 0.4457,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5995877833989133,
+      "eval_loss": 0.4406769275665283,
+      "eval_runtime": 564.9558,
+      "eval_samples_per_second": 3.822,
+      "eval_steps_per_second": 0.478,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5998875772906127,
+      "grad_norm": 0.25887760519981384,
+      "learning_rate": 0.0002103153153153153,
+      "loss": 0.4261,
+      "step": 2001
+    },
+    {
+      "epoch": 0.6001873711823121,
+      "grad_norm": 0.26813459396362305,
+      "learning_rate": 0.00021027027027027023,
+      "loss": 0.4091,
+      "step": 2002
+    },
+    {
+      "epoch": 0.6004871650740116,
+      "grad_norm": 0.2717922329902649,
+      "learning_rate": 0.00021022522522522522,
+      "loss": 0.4257,
+      "step": 2003
+    },
+    {
+      "epoch": 0.6007869589657111,
+      "grad_norm": 0.2423432320356369,
+      "learning_rate": 0.00021018018018018015,
+      "loss": 0.4187,
+      "step": 2004
+    },
+    {
+      "epoch": 0.6010867528574105,
+      "grad_norm": 0.2616721987724304,
+      "learning_rate": 0.0002101351351351351,
+      "loss": 0.4414,
+      "step": 2005
+    },
+    {
+      "epoch": 0.60138654674911,
+      "grad_norm": 0.2668519914150238,
+      "learning_rate": 0.00021009009009009008,
+      "loss": 0.4566,
+      "step": 2006
+    },
+    {
+      "epoch": 0.6016863406408094,
+      "grad_norm": 0.24378737807273865,
+      "learning_rate": 0.00021004504504504502,
+      "loss": 0.4379,
+      "step": 2007
+    },
+    {
+      "epoch": 0.6019861345325089,
+      "grad_norm": 0.24571847915649414,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.4491,
+      "step": 2008
+    },
+    {
+      "epoch": 0.6022859284242084,
+      "grad_norm": 0.23369182646274567,
+      "learning_rate": 0.00020995495495495495,
+      "loss": 0.4023,
+      "step": 2009
+    },
+    {
+      "epoch": 0.6025857223159078,
+      "grad_norm": 0.27274230122566223,
+      "learning_rate": 0.00020990990990990988,
+      "loss": 0.4514,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6028855162076072,
+      "grad_norm": 0.29650700092315674,
+      "learning_rate": 0.00020986486486486485,
+      "loss": 0.4553,
+      "step": 2011
+    },
+    {
+      "epoch": 0.6031853100993068,
+      "grad_norm": 0.2543351650238037,
+      "learning_rate": 0.0002098198198198198,
+      "loss": 0.4522,
+      "step": 2012
+    },
+    {
+      "epoch": 0.6034851039910062,
+      "grad_norm": 0.25208958983421326,
+      "learning_rate": 0.00020977477477477475,
+      "loss": 0.4342,
+      "step": 2013
+    },
+    {
+      "epoch": 0.6037848978827056,
+      "grad_norm": 0.24142181873321533,
+      "learning_rate": 0.0002097297297297297,
+      "loss": 0.4106,
+      "step": 2014
+    },
+    {
+      "epoch": 0.604084691774405,
+      "grad_norm": 0.2537544071674347,
+      "learning_rate": 0.00020968468468468467,
+      "loss": 0.4373,
+      "step": 2015
+    },
+    {
+      "epoch": 0.6043844856661046,
+      "grad_norm": 0.27682605385780334,
+      "learning_rate": 0.0002096396396396396,
+      "loss": 0.4491,
+      "step": 2016
+    },
+    {
+      "epoch": 0.604684279557804,
+      "grad_norm": 0.24604292213916779,
+      "learning_rate": 0.00020959459459459458,
+      "loss": 0.4059,
+      "step": 2017
+    },
+    {
+      "epoch": 0.6049840734495034,
+      "grad_norm": 0.24552318453788757,
+      "learning_rate": 0.00020954954954954954,
+      "loss": 0.4361,
+      "step": 2018
+    },
+    {
+      "epoch": 0.6052838673412029,
+      "grad_norm": 0.24892392754554749,
+      "learning_rate": 0.00020950450450450448,
+      "loss": 0.4579,
+      "step": 2019
+    },
+    {
+      "epoch": 0.6055836612329024,
+      "grad_norm": 0.26011762022972107,
+      "learning_rate": 0.00020945945945945944,
+      "loss": 0.4444,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6058834551246018,
+      "grad_norm": 0.25486037135124207,
+      "learning_rate": 0.0002094144144144144,
+      "loss": 0.4256,
+      "step": 2021
+    },
+    {
+      "epoch": 0.6061832490163013,
+      "grad_norm": 0.24915416538715363,
+      "learning_rate": 0.00020936936936936937,
+      "loss": 0.4295,
+      "step": 2022
+    },
+    {
+      "epoch": 0.6064830429080007,
+      "grad_norm": 0.24448558688163757,
+      "learning_rate": 0.0002093243243243243,
+      "loss": 0.422,
+      "step": 2023
+    },
+    {
+      "epoch": 0.6067828367997002,
+      "grad_norm": 0.24826547503471375,
+      "learning_rate": 0.00020927927927927927,
+      "loss": 0.3927,
+      "step": 2024
+    },
+    {
+      "epoch": 0.6070826306913997,
+      "grad_norm": 0.24415460228919983,
+      "learning_rate": 0.00020923423423423423,
+      "loss": 0.4128,
+      "step": 2025
+    },
+    {
+      "epoch": 0.6073824245830991,
+      "grad_norm": 0.25376150012016296,
+      "learning_rate": 0.00020918918918918917,
+      "loss": 0.4363,
+      "step": 2026
+    },
+    {
+      "epoch": 0.6076822184747985,
+      "grad_norm": 0.2526264786720276,
+      "learning_rate": 0.0002091441441441441,
+      "loss": 0.4354,
+      "step": 2027
+    },
+    {
+      "epoch": 0.6079820123664981,
+      "grad_norm": 0.25586017966270447,
+      "learning_rate": 0.0002090990990990991,
+      "loss": 0.401,
+      "step": 2028
+    },
+    {
+      "epoch": 0.6082818062581975,
+      "grad_norm": 0.2642875909805298,
+      "learning_rate": 0.00020905405405405403,
+      "loss": 0.4476,
+      "step": 2029
+    },
+    {
+      "epoch": 0.6085816001498969,
+      "grad_norm": 0.2795594334602356,
+      "learning_rate": 0.00020900900900900897,
+      "loss": 0.4845,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6088813940415964,
+      "grad_norm": 0.23246948421001434,
+      "learning_rate": 0.00020896396396396396,
+      "loss": 0.3906,
+      "step": 2031
+    },
+    {
+      "epoch": 0.6091811879332959,
+      "grad_norm": 0.25595536828041077,
+      "learning_rate": 0.0002089189189189189,
+      "loss": 0.4331,
+      "step": 2032
+    },
+    {
+      "epoch": 0.6094809818249953,
+      "grad_norm": 0.24884217977523804,
+      "learning_rate": 0.00020887387387387383,
+      "loss": 0.4361,
+      "step": 2033
+    },
+    {
+      "epoch": 0.6097807757166948,
+      "grad_norm": 0.25283509492874146,
+      "learning_rate": 0.00020882882882882883,
+      "loss": 0.422,
+      "step": 2034
+    },
+    {
+      "epoch": 0.6100805696083942,
+      "grad_norm": 0.25681042671203613,
+      "learning_rate": 0.00020878378378378376,
+      "loss": 0.4301,
+      "step": 2035
+    },
+    {
+      "epoch": 0.6103803635000937,
+      "grad_norm": 0.2695556879043579,
+      "learning_rate": 0.0002087387387387387,
+      "loss": 0.4624,
+      "step": 2036
+    },
+    {
+      "epoch": 0.6106801573917932,
+      "grad_norm": 0.28260117769241333,
+      "learning_rate": 0.0002086936936936937,
+      "loss": 0.4608,
+      "step": 2037
+    },
+    {
+      "epoch": 0.6109799512834926,
+      "grad_norm": 0.2578640878200531,
+      "learning_rate": 0.00020864864864864863,
+      "loss": 0.4638,
+      "step": 2038
+    },
+    {
+      "epoch": 0.611279745175192,
+      "grad_norm": 0.2544034421443939,
+      "learning_rate": 0.00020860360360360356,
+      "loss": 0.4222,
+      "step": 2039
+    },
+    {
+      "epoch": 0.6115795390668916,
+      "grad_norm": 0.2726188600063324,
+      "learning_rate": 0.00020855855855855855,
+      "loss": 0.4802,
+      "step": 2040
+    },
+    {
+      "epoch": 0.611879332958591,
+      "grad_norm": 0.26486557722091675,
+      "learning_rate": 0.0002085135135135135,
+      "loss": 0.4475,
+      "step": 2041
+    },
+    {
+      "epoch": 0.6121791268502904,
+      "grad_norm": 0.2517718970775604,
+      "learning_rate": 0.00020846846846846843,
+      "loss": 0.4445,
+      "step": 2042
+    },
+    {
+      "epoch": 0.6124789207419898,
+      "grad_norm": 0.2507137060165405,
+      "learning_rate": 0.00020842342342342342,
+      "loss": 0.4243,
+      "step": 2043
+    },
+    {
+      "epoch": 0.6127787146336894,
+      "grad_norm": 0.2707221806049347,
+      "learning_rate": 0.00020837837837837836,
+      "loss": 0.4706,
+      "step": 2044
+    },
+    {
+      "epoch": 0.6130785085253888,
+      "grad_norm": 0.24258100986480713,
+      "learning_rate": 0.00020833333333333332,
+      "loss": 0.423,
+      "step": 2045
+    },
+    {
+      "epoch": 0.6133783024170882,
+      "grad_norm": 0.25611022114753723,
+      "learning_rate": 0.00020828828828828828,
+      "loss": 0.4553,
+      "step": 2046
+    },
+    {
+      "epoch": 0.6136780963087877,
+      "grad_norm": 0.26037782430648804,
+      "learning_rate": 0.00020824324324324322,
+      "loss": 0.4173,
+      "step": 2047
+    },
+    {
+      "epoch": 0.6139778902004872,
+      "grad_norm": 0.26126980781555176,
+      "learning_rate": 0.00020819819819819818,
+      "loss": 0.4556,
+      "step": 2048
+    },
+    {
+      "epoch": 0.6142776840921866,
+      "grad_norm": 0.283407598733902,
+      "learning_rate": 0.00020815315315315315,
+      "loss": 0.4421,
+      "step": 2049
+    },
+    {
+      "epoch": 0.6145774779838861,
+      "grad_norm": 0.2533104717731476,
+      "learning_rate": 0.00020810810810810808,
+      "loss": 0.4253,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6148772718755855,
+      "grad_norm": 0.24108561873435974,
+      "learning_rate": 0.00020806306306306305,
+      "loss": 0.438,
+      "step": 2051
+    },
+    {
+      "epoch": 0.615177065767285,
+      "grad_norm": 0.2516781687736511,
+      "learning_rate": 0.00020801801801801799,
+      "loss": 0.4245,
+      "step": 2052
+    },
+    {
+      "epoch": 0.6154768596589845,
+      "grad_norm": 0.25689882040023804,
+      "learning_rate": 0.00020797297297297295,
+      "loss": 0.4412,
+      "step": 2053
+    },
+    {
+      "epoch": 0.6157766535506839,
+      "grad_norm": 0.2661590576171875,
+      "learning_rate": 0.0002079279279279279,
+      "loss": 0.47,
+      "step": 2054
+    },
+    {
+      "epoch": 0.6160764474423833,
+      "grad_norm": 0.2580914795398712,
+      "learning_rate": 0.00020788288288288285,
+      "loss": 0.4537,
+      "step": 2055
+    },
+    {
+      "epoch": 0.6163762413340829,
+      "grad_norm": 0.23960340023040771,
+      "learning_rate": 0.00020783783783783784,
+      "loss": 0.4535,
+      "step": 2056
+    },
+    {
+      "epoch": 0.6166760352257823,
+      "grad_norm": 0.2658163905143738,
+      "learning_rate": 0.00020779279279279278,
+      "loss": 0.4483,
+      "step": 2057
+    },
+    {
+      "epoch": 0.6169758291174817,
+      "grad_norm": 0.2633172869682312,
+      "learning_rate": 0.00020774774774774771,
+      "loss": 0.4408,
+      "step": 2058
+    },
+    {
+      "epoch": 0.6172756230091812,
+      "grad_norm": 0.2618810832500458,
+      "learning_rate": 0.0002077027027027027,
+      "loss": 0.4497,
+      "step": 2059
+    },
+    {
+      "epoch": 0.6175754169008807,
+      "grad_norm": 0.2505929470062256,
+      "learning_rate": 0.00020765765765765764,
+      "loss": 0.4411,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6178752107925801,
+      "grad_norm": 0.2556394338607788,
+      "learning_rate": 0.00020761261261261258,
+      "loss": 0.4427,
+      "step": 2061
+    },
+    {
+      "epoch": 0.6181750046842795,
+      "grad_norm": 0.2584443688392639,
+      "learning_rate": 0.00020756756756756757,
+      "loss": 0.4707,
+      "step": 2062
+    },
+    {
+      "epoch": 0.618474798575979,
+      "grad_norm": 0.2633087635040283,
+      "learning_rate": 0.0002075225225225225,
+      "loss": 0.4576,
+      "step": 2063
+    },
+    {
+      "epoch": 0.6187745924676785,
+      "grad_norm": 0.2487655133008957,
+      "learning_rate": 0.00020747747747747744,
+      "loss": 0.4374,
+      "step": 2064
+    },
+    {
+      "epoch": 0.619074386359378,
+      "grad_norm": 0.24193575978279114,
+      "learning_rate": 0.00020743243243243243,
+      "loss": 0.4267,
+      "step": 2065
+    },
+    {
+      "epoch": 0.6193741802510774,
+      "grad_norm": 0.2395676076412201,
+      "learning_rate": 0.00020738738738738737,
+      "loss": 0.4236,
+      "step": 2066
+    },
+    {
+      "epoch": 0.6196739741427768,
+      "grad_norm": 0.24150055646896362,
+      "learning_rate": 0.0002073423423423423,
+      "loss": 0.4138,
+      "step": 2067
+    },
+    {
+      "epoch": 0.6199737680344763,
+      "grad_norm": 0.2652229368686676,
+      "learning_rate": 0.0002072972972972973,
+      "loss": 0.4248,
+      "step": 2068
+    },
+    {
+      "epoch": 0.6202735619261758,
+      "grad_norm": 0.24750439822673798,
+      "learning_rate": 0.00020725225225225224,
+      "loss": 0.4273,
+      "step": 2069
+    },
+    {
+      "epoch": 0.6205733558178752,
+      "grad_norm": 0.25777891278266907,
+      "learning_rate": 0.00020720720720720717,
+      "loss": 0.4471,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6208731497095746,
+      "grad_norm": 0.25587761402130127,
+      "learning_rate": 0.00020716216216216216,
+      "loss": 0.4246,
+      "step": 2071
+    },
+    {
+      "epoch": 0.6211729436012742,
+      "grad_norm": 0.2518714368343353,
+      "learning_rate": 0.0002071171171171171,
+      "loss": 0.4645,
+      "step": 2072
+    },
+    {
+      "epoch": 0.6214727374929736,
+      "grad_norm": 0.24348096549510956,
+      "learning_rate": 0.00020707207207207204,
+      "loss": 0.4454,
+      "step": 2073
+    },
+    {
+      "epoch": 0.621772531384673,
+      "grad_norm": 0.2423403263092041,
+      "learning_rate": 0.000207027027027027,
+      "loss": 0.405,
+      "step": 2074
+    },
+    {
+      "epoch": 0.6220723252763725,
+      "grad_norm": 0.26894888281822205,
+      "learning_rate": 0.00020698198198198196,
+      "loss": 0.4699,
+      "step": 2075
+    },
+    {
+      "epoch": 0.622372119168072,
+      "grad_norm": 0.2578108608722687,
+      "learning_rate": 0.0002069369369369369,
+      "loss": 0.4365,
+      "step": 2076
+    },
+    {
+      "epoch": 0.6226719130597714,
+      "grad_norm": 0.27144232392311096,
+      "learning_rate": 0.00020689189189189187,
+      "loss": 0.4291,
+      "step": 2077
+    },
+    {
+      "epoch": 0.6229717069514709,
+      "grad_norm": 0.25405609607696533,
+      "learning_rate": 0.00020684684684684683,
+      "loss": 0.4389,
+      "step": 2078
+    },
+    {
+      "epoch": 0.6232715008431703,
+      "grad_norm": 0.24775098264217377,
+      "learning_rate": 0.0002068018018018018,
+      "loss": 0.416,
+      "step": 2079
+    },
+    {
+      "epoch": 0.6235712947348698,
+      "grad_norm": 0.25767782330513,
+      "learning_rate": 0.00020675675675675673,
+      "loss": 0.455,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6238710886265693,
+      "grad_norm": 0.2590571641921997,
+      "learning_rate": 0.0002067117117117117,
+      "loss": 0.4436,
+      "step": 2081
+    },
+    {
+      "epoch": 0.6241708825182687,
+      "grad_norm": 0.24577729403972626,
+      "learning_rate": 0.00020666666666666666,
+      "loss": 0.4389,
+      "step": 2082
+    },
+    {
+      "epoch": 0.6244706764099681,
+      "grad_norm": 0.25743165612220764,
+      "learning_rate": 0.0002066216216216216,
+      "loss": 0.4467,
+      "step": 2083
+    },
+    {
+      "epoch": 0.6247704703016677,
+      "grad_norm": 0.24127769470214844,
+      "learning_rate": 0.00020657657657657656,
+      "loss": 0.4056,
+      "step": 2084
+    },
+    {
+      "epoch": 0.6250702641933671,
+      "grad_norm": 0.25930923223495483,
+      "learning_rate": 0.00020653153153153152,
+      "loss": 0.4352,
+      "step": 2085
+    },
+    {
+      "epoch": 0.6253700580850665,
+      "grad_norm": 0.23111720383167267,
+      "learning_rate": 0.00020648648648648646,
+      "loss": 0.4035,
+      "step": 2086
+    },
+    {
+      "epoch": 0.625669851976766,
+      "grad_norm": 0.25705838203430176,
+      "learning_rate": 0.00020644144144144142,
+      "loss": 0.4402,
+      "step": 2087
+    },
+    {
+      "epoch": 0.6259696458684655,
+      "grad_norm": 0.260232150554657,
+      "learning_rate": 0.0002063963963963964,
+      "loss": 0.444,
+      "step": 2088
+    },
+    {
+      "epoch": 0.6262694397601649,
+      "grad_norm": 0.24525994062423706,
+      "learning_rate": 0.00020635135135135132,
+      "loss": 0.4461,
+      "step": 2089
+    },
+    {
+      "epoch": 0.6265692336518643,
+      "grad_norm": 0.2550069987773895,
+      "learning_rate": 0.00020630630630630631,
+      "loss": 0.4391,
+      "step": 2090
+    },
+    {
+      "epoch": 0.6268690275435638,
+      "grad_norm": 0.24561873078346252,
+      "learning_rate": 0.00020626126126126125,
+      "loss": 0.424,
+      "step": 2091
+    },
+    {
+      "epoch": 0.6271688214352632,
+      "grad_norm": 0.2500348687171936,
+      "learning_rate": 0.0002062162162162162,
+      "loss": 0.4415,
+      "step": 2092
+    },
+    {
+      "epoch": 0.6274686153269627,
+      "grad_norm": 0.23684681951999664,
+      "learning_rate": 0.00020617117117117118,
+      "loss": 0.4282,
+      "step": 2093
+    },
+    {
+      "epoch": 0.6277684092186622,
+      "grad_norm": 0.2459501475095749,
+      "learning_rate": 0.00020612612612612612,
+      "loss": 0.4229,
+      "step": 2094
+    },
+    {
+      "epoch": 0.6280682031103616,
+      "grad_norm": 0.23220065236091614,
+      "learning_rate": 0.00020608108108108105,
+      "loss": 0.4083,
+      "step": 2095
+    },
+    {
+      "epoch": 0.628367997002061,
+      "grad_norm": 0.26996707916259766,
+      "learning_rate": 0.00020603603603603604,
+      "loss": 0.4581,
+      "step": 2096
+    },
+    {
+      "epoch": 0.6286677908937606,
+      "grad_norm": 0.2551998496055603,
+      "learning_rate": 0.00020599099099099098,
+      "loss": 0.4444,
+      "step": 2097
+    },
+    {
+      "epoch": 0.62896758478546,
+      "grad_norm": 0.2604631781578064,
+      "learning_rate": 0.00020594594594594592,
+      "loss": 0.4367,
+      "step": 2098
+    },
+    {
+      "epoch": 0.6292673786771594,
+      "grad_norm": 0.25958266854286194,
+      "learning_rate": 0.00020590090090090085,
+      "loss": 0.4618,
+      "step": 2099
+    },
+    {
+      "epoch": 0.6295671725688589,
+      "grad_norm": 0.2714870274066925,
+      "learning_rate": 0.00020585585585585584,
+      "loss": 0.4382,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6298669664605584,
+      "grad_norm": 0.2739773392677307,
+      "learning_rate": 0.00020581081081081078,
+      "loss": 0.4752,
+      "step": 2101
+    },
+    {
+      "epoch": 0.6301667603522578,
+      "grad_norm": 0.24829277396202087,
+      "learning_rate": 0.00020576576576576575,
+      "loss": 0.4285,
+      "step": 2102
+    },
+    {
+      "epoch": 0.6304665542439573,
+      "grad_norm": 0.2551855742931366,
+      "learning_rate": 0.0002057207207207207,
+      "loss": 0.4259,
+      "step": 2103
+    },
+    {
+      "epoch": 0.6307663481356567,
+      "grad_norm": 0.243735671043396,
+      "learning_rate": 0.00020567567567567565,
+      "loss": 0.4201,
+      "step": 2104
+    },
+    {
+      "epoch": 0.6310661420273562,
+      "grad_norm": 0.2511364817619324,
+      "learning_rate": 0.0002056306306306306,
+      "loss": 0.4351,
+      "step": 2105
+    },
+    {
+      "epoch": 0.6313659359190557,
+      "grad_norm": 0.2456447184085846,
+      "learning_rate": 0.00020558558558558557,
+      "loss": 0.448,
+      "step": 2106
+    },
+    {
+      "epoch": 0.6316657298107551,
+      "grad_norm": 0.26450565457344055,
+      "learning_rate": 0.0002055405405405405,
+      "loss": 0.4576,
+      "step": 2107
+    },
+    {
+      "epoch": 0.6319655237024545,
+      "grad_norm": 0.25267186760902405,
+      "learning_rate": 0.00020549549549549547,
+      "loss": 0.4511,
+      "step": 2108
+    },
+    {
+      "epoch": 0.632265317594154,
+      "grad_norm": 0.2436206340789795,
+      "learning_rate": 0.00020545045045045044,
+      "loss": 0.4147,
+      "step": 2109
+    },
+    {
+      "epoch": 0.6325651114858535,
+      "grad_norm": 0.27077367901802063,
+      "learning_rate": 0.00020540540540540537,
+      "loss": 0.4687,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6328649053775529,
+      "grad_norm": 0.25476735830307007,
+      "learning_rate": 0.00020536036036036034,
+      "loss": 0.4357,
+      "step": 2111
+    },
+    {
+      "epoch": 0.6331646992692523,
+      "grad_norm": 0.23889677226543427,
+      "learning_rate": 0.0002053153153153153,
+      "loss": 0.4109,
+      "step": 2112
+    },
+    {
+      "epoch": 0.6334644931609519,
+      "grad_norm": 0.2620011270046234,
+      "learning_rate": 0.00020527027027027027,
+      "loss": 0.4418,
+      "step": 2113
+    },
+    {
+      "epoch": 0.6337642870526513,
+      "grad_norm": 0.24259789288043976,
+      "learning_rate": 0.0002052252252252252,
+      "loss": 0.4158,
+      "step": 2114
+    },
+    {
+      "epoch": 0.6340640809443507,
+      "grad_norm": 0.26212331652641296,
+      "learning_rate": 0.00020518018018018017,
+      "loss": 0.4407,
+      "step": 2115
+    },
+    {
+      "epoch": 0.6343638748360502,
+      "grad_norm": 0.2421627789735794,
+      "learning_rate": 0.00020513513513513513,
+      "loss": 0.4162,
+      "step": 2116
+    },
+    {
+      "epoch": 0.6346636687277497,
+      "grad_norm": 0.25949686765670776,
+      "learning_rate": 0.00020509009009009007,
+      "loss": 0.4436,
+      "step": 2117
+    },
+    {
+      "epoch": 0.6349634626194491,
+      "grad_norm": 0.26797404885292053,
+      "learning_rate": 0.00020504504504504503,
+      "loss": 0.4684,
+      "step": 2118
+    },
+    {
+      "epoch": 0.6352632565111486,
+      "grad_norm": 0.2433563470840454,
+      "learning_rate": 0.000205,
+      "loss": 0.4312,
+      "step": 2119
+    },
+    {
+      "epoch": 0.635563050402848,
+      "grad_norm": 0.25377362966537476,
+      "learning_rate": 0.00020495495495495493,
+      "loss": 0.4583,
+      "step": 2120
+    },
+    {
+      "epoch": 0.6358628442945475,
+      "grad_norm": 0.2583523392677307,
+      "learning_rate": 0.0002049099099099099,
+      "loss": 0.4587,
+      "step": 2121
+    },
+    {
+      "epoch": 0.636162638186247,
+      "grad_norm": 0.2512962520122528,
+      "learning_rate": 0.00020486486486486486,
+      "loss": 0.4148,
+      "step": 2122
+    },
+    {
+      "epoch": 0.6364624320779464,
+      "grad_norm": 0.24238212406635284,
+      "learning_rate": 0.0002048198198198198,
+      "loss": 0.4057,
+      "step": 2123
+    },
+    {
+      "epoch": 0.6367622259696458,
+      "grad_norm": 0.25228351354599,
+      "learning_rate": 0.00020477477477477473,
+      "loss": 0.4291,
+      "step": 2124
+    },
+    {
+      "epoch": 0.6370620198613454,
+      "grad_norm": 0.26437970995903015,
+      "learning_rate": 0.00020472972972972972,
+      "loss": 0.4425,
+      "step": 2125
+    },
+    {
+      "epoch": 0.6373618137530448,
+      "grad_norm": 0.2502360939979553,
+      "learning_rate": 0.00020468468468468466,
+      "loss": 0.4102,
+      "step": 2126
+    },
+    {
+      "epoch": 0.6376616076447442,
+      "grad_norm": 0.24895761907100677,
+      "learning_rate": 0.0002046396396396396,
+      "loss": 0.4423,
+      "step": 2127
+    },
+    {
+      "epoch": 0.6379614015364437,
+      "grad_norm": 0.2674795985221863,
+      "learning_rate": 0.0002045945945945946,
+      "loss": 0.4572,
+      "step": 2128
+    },
+    {
+      "epoch": 0.6382611954281432,
+      "grad_norm": 0.2468012422323227,
+      "learning_rate": 0.00020454954954954953,
+      "loss": 0.4525,
+      "step": 2129
+    },
+    {
+      "epoch": 0.6385609893198426,
+      "grad_norm": 0.2560267448425293,
+      "learning_rate": 0.00020450450450450446,
+      "loss": 0.4553,
+      "step": 2130
+    },
+    {
+      "epoch": 0.638860783211542,
+      "grad_norm": 0.25216519832611084,
+      "learning_rate": 0.00020445945945945945,
+      "loss": 0.4319,
+      "step": 2131
+    },
+    {
+      "epoch": 0.6391605771032415,
+      "grad_norm": 0.22927191853523254,
+      "learning_rate": 0.0002044144144144144,
+      "loss": 0.4088,
+      "step": 2132
+    },
+    {
+      "epoch": 0.639460370994941,
+      "grad_norm": 0.2495734840631485,
+      "learning_rate": 0.00020436936936936933,
+      "loss": 0.4301,
+      "step": 2133
+    },
+    {
+      "epoch": 0.6397601648866404,
+      "grad_norm": 0.2604648470878601,
+      "learning_rate": 0.00020432432432432432,
+      "loss": 0.4115,
+      "step": 2134
+    },
+    {
+      "epoch": 0.6400599587783399,
+      "grad_norm": 0.27185025811195374,
+      "learning_rate": 0.00020427927927927925,
+      "loss": 0.4452,
+      "step": 2135
+    },
+    {
+      "epoch": 0.6403597526700393,
+      "grad_norm": 0.26344043016433716,
+      "learning_rate": 0.00020423423423423422,
+      "loss": 0.4178,
+      "step": 2136
+    },
+    {
+      "epoch": 0.6406595465617388,
+      "grad_norm": 0.26041179895401,
+      "learning_rate": 0.00020418918918918918,
+      "loss": 0.4384,
+      "step": 2137
+    },
+    {
+      "epoch": 0.6409593404534383,
+      "grad_norm": 0.25143757462501526,
+      "learning_rate": 0.00020414414414414412,
+      "loss": 0.4486,
+      "step": 2138
+    },
+    {
+      "epoch": 0.6412591343451377,
+      "grad_norm": 0.2869662642478943,
+      "learning_rate": 0.00020409909909909908,
+      "loss": 0.487,
+      "step": 2139
+    },
+    {
+      "epoch": 0.6415589282368371,
+      "grad_norm": 0.2788044214248657,
+      "learning_rate": 0.00020405405405405405,
+      "loss": 0.4255,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6418587221285367,
+      "grad_norm": 0.27385109663009644,
+      "learning_rate": 0.00020400900900900898,
+      "loss": 0.4535,
+      "step": 2141
+    },
+    {
+      "epoch": 0.6421585160202361,
+      "grad_norm": 0.26231497526168823,
+      "learning_rate": 0.00020396396396396395,
+      "loss": 0.4435,
+      "step": 2142
+    },
+    {
+      "epoch": 0.6424583099119355,
+      "grad_norm": 0.26582983136177063,
+      "learning_rate": 0.0002039189189189189,
+      "loss": 0.4394,
+      "step": 2143
+    },
+    {
+      "epoch": 0.642758103803635,
+      "grad_norm": 0.2665957510471344,
+      "learning_rate": 0.00020387387387387385,
+      "loss": 0.4401,
+      "step": 2144
+    },
+    {
+      "epoch": 0.6430578976953345,
+      "grad_norm": 0.2581312358379364,
+      "learning_rate": 0.0002038288288288288,
+      "loss": 0.4302,
+      "step": 2145
+    },
+    {
+      "epoch": 0.6433576915870339,
+      "grad_norm": 0.25063759088516235,
+      "learning_rate": 0.00020378378378378375,
+      "loss": 0.4558,
+      "step": 2146
+    },
+    {
+      "epoch": 0.6436574854787334,
+      "grad_norm": 0.2655949890613556,
+      "learning_rate": 0.00020373873873873874,
+      "loss": 0.4559,
+      "step": 2147
+    },
+    {
+      "epoch": 0.6439572793704328,
+      "grad_norm": 0.25797712802886963,
+      "learning_rate": 0.00020369369369369368,
+      "loss": 0.4127,
+      "step": 2148
+    },
+    {
+      "epoch": 0.6442570732621323,
+      "grad_norm": 0.2774755358695984,
+      "learning_rate": 0.00020364864864864861,
+      "loss": 0.4807,
+      "step": 2149
+    },
+    {
+      "epoch": 0.6445568671538318,
+      "grad_norm": 0.27283889055252075,
+      "learning_rate": 0.0002036036036036036,
+      "loss": 0.4483,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6448566610455312,
+      "grad_norm": 0.2647114396095276,
+      "learning_rate": 0.00020355855855855854,
+      "loss": 0.4629,
+      "step": 2151
+    },
+    {
+      "epoch": 0.6451564549372306,
+      "grad_norm": 0.2683508098125458,
+      "learning_rate": 0.00020351351351351348,
+      "loss": 0.4755,
+      "step": 2152
+    },
+    {
+      "epoch": 0.6454562488289302,
+      "grad_norm": 0.26375094056129456,
+      "learning_rate": 0.00020346846846846847,
+      "loss": 0.4354,
+      "step": 2153
+    },
+    {
+      "epoch": 0.6457560427206296,
+      "grad_norm": 0.2688734233379364,
+      "learning_rate": 0.0002034234234234234,
+      "loss": 0.4515,
+      "step": 2154
+    },
+    {
+      "epoch": 0.646055836612329,
+      "grad_norm": 0.2545178532600403,
+      "learning_rate": 0.00020337837837837834,
+      "loss": 0.4281,
+      "step": 2155
+    },
+    {
+      "epoch": 0.6463556305040284,
+      "grad_norm": 0.24753254652023315,
+      "learning_rate": 0.00020333333333333333,
+      "loss": 0.4332,
+      "step": 2156
+    },
+    {
+      "epoch": 0.646655424395728,
+      "grad_norm": 0.2619148790836334,
+      "learning_rate": 0.00020328828828828827,
+      "loss": 0.4531,
+      "step": 2157
+    },
+    {
+      "epoch": 0.6469552182874274,
+      "grad_norm": 0.2698518633842468,
+      "learning_rate": 0.0002032432432432432,
+      "loss": 0.4807,
+      "step": 2158
+    },
+    {
+      "epoch": 0.6472550121791268,
+      "grad_norm": 0.2625516355037689,
+      "learning_rate": 0.0002031981981981982,
+      "loss": 0.4368,
+      "step": 2159
+    },
+    {
+      "epoch": 0.6475548060708263,
+      "grad_norm": 0.25750574469566345,
+      "learning_rate": 0.00020315315315315313,
+      "loss": 0.4489,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6478545999625258,
+      "grad_norm": 0.2963887155056,
+      "learning_rate": 0.00020310810810810807,
+      "loss": 0.4567,
+      "step": 2161
+    },
+    {
+      "epoch": 0.6481543938542252,
+      "grad_norm": 0.28631022572517395,
+      "learning_rate": 0.00020306306306306306,
+      "loss": 0.457,
+      "step": 2162
+    },
+    {
+      "epoch": 0.6484541877459247,
+      "grad_norm": 0.2652076184749603,
+      "learning_rate": 0.000203018018018018,
+      "loss": 0.4008,
+      "step": 2163
+    },
+    {
+      "epoch": 0.6487539816376241,
+      "grad_norm": 0.25599247217178345,
+      "learning_rate": 0.00020297297297297294,
+      "loss": 0.4236,
+      "step": 2164
+    },
+    {
+      "epoch": 0.6490537755293236,
+      "grad_norm": 0.2655317783355713,
+      "learning_rate": 0.00020292792792792793,
+      "loss": 0.4401,
+      "step": 2165
+    },
+    {
+      "epoch": 0.6493535694210231,
+      "grad_norm": 0.27608954906463623,
+      "learning_rate": 0.00020288288288288286,
+      "loss": 0.4469,
+      "step": 2166
+    },
+    {
+      "epoch": 0.6496533633127225,
+      "grad_norm": 0.2523987293243408,
+      "learning_rate": 0.0002028378378378378,
+      "loss": 0.4134,
+      "step": 2167
+    },
+    {
+      "epoch": 0.6499531572044219,
+      "grad_norm": 0.26536789536476135,
+      "learning_rate": 0.0002027927927927928,
+      "loss": 0.4416,
+      "step": 2168
+    },
+    {
+      "epoch": 0.6502529510961215,
+      "grad_norm": 0.24977469444274902,
+      "learning_rate": 0.00020274774774774773,
+      "loss": 0.4318,
+      "step": 2169
+    },
+    {
+      "epoch": 0.6505527449878209,
+      "grad_norm": 0.27510321140289307,
+      "learning_rate": 0.0002027027027027027,
+      "loss": 0.4535,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6508525388795203,
+      "grad_norm": 0.24680355191230774,
+      "learning_rate": 0.00020265765765765763,
+      "loss": 0.4247,
+      "step": 2171
+    },
+    {
+      "epoch": 0.6511523327712198,
+      "grad_norm": 0.24580539762973785,
+      "learning_rate": 0.0002026126126126126,
+      "loss": 0.4356,
+      "step": 2172
+    },
+    {
+      "epoch": 0.6514521266629193,
+      "grad_norm": 0.2560003101825714,
+      "learning_rate": 0.00020256756756756756,
+      "loss": 0.4019,
+      "step": 2173
+    },
+    {
+      "epoch": 0.6517519205546187,
+      "grad_norm": 0.2403692603111267,
+      "learning_rate": 0.0002025225225225225,
+      "loss": 0.4088,
+      "step": 2174
+    },
+    {
+      "epoch": 0.6520517144463182,
+      "grad_norm": 0.25952261686325073,
+      "learning_rate": 0.00020247747747747746,
+      "loss": 0.4585,
+      "step": 2175
+    },
+    {
+      "epoch": 0.6523515083380176,
+      "grad_norm": 0.2858710289001465,
+      "learning_rate": 0.00020243243243243242,
+      "loss": 0.4616,
+      "step": 2176
+    },
+    {
+      "epoch": 0.6526513022297171,
+      "grad_norm": 0.2761286795139313,
+      "learning_rate": 0.00020238738738738736,
+      "loss": 0.4694,
+      "step": 2177
+    },
+    {
+      "epoch": 0.6529510961214166,
+      "grad_norm": 0.2620023190975189,
+      "learning_rate": 0.00020234234234234232,
+      "loss": 0.4441,
+      "step": 2178
+    },
+    {
+      "epoch": 0.653250890013116,
+      "grad_norm": 0.2743069529533386,
+      "learning_rate": 0.00020229729729729729,
+      "loss": 0.4433,
+      "step": 2179
+    },
+    {
+      "epoch": 0.6535506839048154,
+      "grad_norm": 0.25842079520225525,
+      "learning_rate": 0.00020225225225225222,
+      "loss": 0.475,
+      "step": 2180
+    },
+    {
+      "epoch": 0.653850477796515,
+      "grad_norm": 0.267971932888031,
+      "learning_rate": 0.0002022072072072072,
+      "loss": 0.4384,
+      "step": 2181
+    },
+    {
+      "epoch": 0.6541502716882144,
+      "grad_norm": 0.26229405403137207,
+      "learning_rate": 0.00020216216216216215,
+      "loss": 0.4525,
+      "step": 2182
+    },
+    {
+      "epoch": 0.6544500655799138,
+      "grad_norm": 0.26667454838752747,
+      "learning_rate": 0.0002021171171171171,
+      "loss": 0.4384,
+      "step": 2183
+    },
+    {
+      "epoch": 0.6547498594716132,
+      "grad_norm": 0.2867937982082367,
+      "learning_rate": 0.00020207207207207208,
+      "loss": 0.4575,
+      "step": 2184
+    },
+    {
+      "epoch": 0.6550496533633127,
+      "grad_norm": 0.2772350013256073,
+      "learning_rate": 0.00020202702702702701,
+      "loss": 0.4515,
+      "step": 2185
+    },
+    {
+      "epoch": 0.6553494472550122,
+      "grad_norm": 0.2599480450153351,
+      "learning_rate": 0.00020198198198198195,
+      "loss": 0.4614,
+      "step": 2186
+    },
+    {
+      "epoch": 0.6556492411467116,
+      "grad_norm": 0.25755733251571655,
+      "learning_rate": 0.00020193693693693694,
+      "loss": 0.4543,
+      "step": 2187
+    },
+    {
+      "epoch": 0.6559490350384111,
+      "grad_norm": 0.2630176246166229,
+      "learning_rate": 0.00020189189189189188,
+      "loss": 0.4317,
+      "step": 2188
+    },
+    {
+      "epoch": 0.6562488289301105,
+      "grad_norm": 0.25234025716781616,
+      "learning_rate": 0.00020184684684684682,
+      "loss": 0.4187,
+      "step": 2189
+    },
+    {
+      "epoch": 0.65654862282181,
+      "grad_norm": 0.2525855302810669,
+      "learning_rate": 0.0002018018018018018,
+      "loss": 0.4364,
+      "step": 2190
+    },
+    {
+      "epoch": 0.6568484167135095,
+      "grad_norm": 0.2502008378505707,
+      "learning_rate": 0.00020175675675675674,
+      "loss": 0.4162,
+      "step": 2191
+    },
+    {
+      "epoch": 0.6571482106052089,
+      "grad_norm": 0.24894630908966064,
+      "learning_rate": 0.00020171171171171168,
+      "loss": 0.4287,
+      "step": 2192
+    },
+    {
+      "epoch": 0.6574480044969083,
+      "grad_norm": 0.24618011713027954,
+      "learning_rate": 0.00020166666666666667,
+      "loss": 0.4225,
+      "step": 2193
+    },
+    {
+      "epoch": 0.6577477983886079,
+      "grad_norm": 0.3233836591243744,
+      "learning_rate": 0.0002016216216216216,
+      "loss": 0.5051,
+      "step": 2194
+    },
+    {
+      "epoch": 0.6580475922803073,
+      "grad_norm": 0.2600880265235901,
+      "learning_rate": 0.00020157657657657655,
+      "loss": 0.4437,
+      "step": 2195
+    },
+    {
+      "epoch": 0.6583473861720067,
+      "grad_norm": 0.2576145529747009,
+      "learning_rate": 0.0002015315315315315,
+      "loss": 0.4035,
+      "step": 2196
+    },
+    {
+      "epoch": 0.6586471800637062,
+      "grad_norm": 0.2555258572101593,
+      "learning_rate": 0.00020148648648648647,
+      "loss": 0.409,
+      "step": 2197
+    },
+    {
+      "epoch": 0.6589469739554057,
+      "grad_norm": 0.2691539525985718,
+      "learning_rate": 0.0002014414414414414,
+      "loss": 0.4281,
+      "step": 2198
+    },
+    {
+      "epoch": 0.6592467678471051,
+      "grad_norm": 0.2896745502948761,
+      "learning_rate": 0.00020139639639639637,
+      "loss": 0.4529,
+      "step": 2199
+    },
+    {
+      "epoch": 0.6595465617388045,
+      "grad_norm": 0.2579841911792755,
+      "learning_rate": 0.00020135135135135134,
+      "loss": 0.4479,
+      "step": 2200
+    },
+    {
+      "epoch": 0.659846355630504,
+      "grad_norm": 0.24643990397453308,
+      "learning_rate": 0.00020130630630630627,
+      "loss": 0.4205,
+      "step": 2201
+    },
+    {
+      "epoch": 0.6601461495222035,
+      "grad_norm": 0.2523176968097687,
+      "learning_rate": 0.00020126126126126124,
+      "loss": 0.428,
+      "step": 2202
+    },
+    {
+      "epoch": 0.660445943413903,
+      "grad_norm": 0.27141350507736206,
+      "learning_rate": 0.0002012162162162162,
+      "loss": 0.4345,
+      "step": 2203
+    },
+    {
+      "epoch": 0.6607457373056024,
+      "grad_norm": 0.2594592571258545,
+      "learning_rate": 0.00020117117117117117,
+      "loss": 0.428,
+      "step": 2204
+    },
+    {
+      "epoch": 0.6610455311973018,
+      "grad_norm": 0.2673409581184387,
+      "learning_rate": 0.0002011261261261261,
+      "loss": 0.4229,
+      "step": 2205
+    },
+    {
+      "epoch": 0.6613453250890013,
+      "grad_norm": 0.26660290360450745,
+      "learning_rate": 0.00020108108108108107,
+      "loss": 0.4635,
+      "step": 2206
+    },
+    {
+      "epoch": 0.6616451189807008,
+      "grad_norm": 0.2599898874759674,
+      "learning_rate": 0.00020103603603603603,
+      "loss": 0.4398,
+      "step": 2207
+    },
+    {
+      "epoch": 0.6619449128724002,
+      "grad_norm": 0.2786708474159241,
+      "learning_rate": 0.00020099099099099097,
+      "loss": 0.4221,
+      "step": 2208
+    },
+    {
+      "epoch": 0.6622447067640996,
+      "grad_norm": 0.2445315271615982,
+      "learning_rate": 0.00020094594594594593,
+      "loss": 0.4143,
+      "step": 2209
+    },
+    {
+      "epoch": 0.6625445006557992,
+      "grad_norm": 0.2615225315093994,
+      "learning_rate": 0.0002009009009009009,
+      "loss": 0.4388,
+      "step": 2210
+    },
+    {
+      "epoch": 0.6628442945474986,
+      "grad_norm": 0.25724494457244873,
+      "learning_rate": 0.00020085585585585583,
+      "loss": 0.4359,
+      "step": 2211
+    },
+    {
+      "epoch": 0.663144088439198,
+      "grad_norm": 0.2562429904937744,
+      "learning_rate": 0.0002008108108108108,
+      "loss": 0.4213,
+      "step": 2212
+    },
+    {
+      "epoch": 0.6634438823308975,
+      "grad_norm": 0.25428321957588196,
+      "learning_rate": 0.00020076576576576576,
+      "loss": 0.4398,
+      "step": 2213
+    },
+    {
+      "epoch": 0.663743676222597,
+      "grad_norm": 0.2503153383731842,
+      "learning_rate": 0.0002007207207207207,
+      "loss": 0.4266,
+      "step": 2214
+    },
+    {
+      "epoch": 0.6640434701142964,
+      "grad_norm": 0.26181626319885254,
+      "learning_rate": 0.00020067567567567566,
+      "loss": 0.4398,
+      "step": 2215
+    },
+    {
+      "epoch": 0.6643432640059959,
+      "grad_norm": 0.26600703597068787,
+      "learning_rate": 0.00020063063063063062,
+      "loss": 0.4418,
+      "step": 2216
+    },
+    {
+      "epoch": 0.6646430578976953,
+      "grad_norm": 0.27476897835731506,
+      "learning_rate": 0.00020058558558558556,
+      "loss": 0.4405,
+      "step": 2217
+    },
+    {
+      "epoch": 0.6649428517893948,
+      "grad_norm": 0.23994912207126617,
+      "learning_rate": 0.0002005405405405405,
+      "loss": 0.4171,
+      "step": 2218
+    },
+    {
+      "epoch": 0.6652426456810943,
+      "grad_norm": 0.2647401988506317,
+      "learning_rate": 0.0002004954954954955,
+      "loss": 0.4514,
+      "step": 2219
+    },
+    {
+      "epoch": 0.6655424395727937,
+      "grad_norm": 0.27040109038352966,
+      "learning_rate": 0.00020045045045045043,
+      "loss": 0.4665,
+      "step": 2220
+    },
+    {
+      "epoch": 0.6658422334644931,
+      "grad_norm": 0.252128005027771,
+      "learning_rate": 0.00020040540540540536,
+      "loss": 0.4316,
+      "step": 2221
+    },
+    {
+      "epoch": 0.6661420273561927,
+      "grad_norm": 0.2605067193508148,
+      "learning_rate": 0.00020036036036036035,
+      "loss": 0.4242,
+      "step": 2222
+    },
+    {
+      "epoch": 0.6664418212478921,
+      "grad_norm": 0.26456427574157715,
+      "learning_rate": 0.0002003153153153153,
+      "loss": 0.4379,
+      "step": 2223
+    },
+    {
+      "epoch": 0.6667416151395915,
+      "grad_norm": 0.24853096902370453,
+      "learning_rate": 0.00020027027027027023,
+      "loss": 0.4339,
+      "step": 2224
+    },
+    {
+      "epoch": 0.6670414090312909,
+      "grad_norm": 0.2573966979980469,
+      "learning_rate": 0.00020022522522522522,
+      "loss": 0.4427,
+      "step": 2225
+    },
+    {
+      "epoch": 0.6673412029229905,
+      "grad_norm": 0.2402806282043457,
+      "learning_rate": 0.00020018018018018015,
+      "loss": 0.4195,
+      "step": 2226
+    },
+    {
+      "epoch": 0.6676409968146899,
+      "grad_norm": 0.2738892138004303,
+      "learning_rate": 0.00020013513513513512,
+      "loss": 0.4374,
+      "step": 2227
+    },
+    {
+      "epoch": 0.6679407907063893,
+      "grad_norm": 0.2548046410083771,
+      "learning_rate": 0.00020009009009009008,
+      "loss": 0.4312,
+      "step": 2228
+    },
+    {
+      "epoch": 0.6682405845980888,
+      "grad_norm": 0.24718345701694489,
+      "learning_rate": 0.00020004504504504502,
+      "loss": 0.4533,
+      "step": 2229
+    },
+    {
+      "epoch": 0.6685403784897883,
+      "grad_norm": 0.2544947862625122,
+      "learning_rate": 0.00019999999999999998,
+      "loss": 0.4255,
+      "step": 2230
+    },
+    {
+      "epoch": 0.6688401723814877,
+      "grad_norm": 0.2689761817455292,
+      "learning_rate": 0.00019995495495495495,
+      "loss": 0.451,
+      "step": 2231
+    },
+    {
+      "epoch": 0.6691399662731872,
+      "grad_norm": 0.2824248969554901,
+      "learning_rate": 0.00019990990990990988,
+      "loss": 0.4374,
+      "step": 2232
+    },
+    {
+      "epoch": 0.6694397601648866,
+      "grad_norm": 0.25299206376075745,
+      "learning_rate": 0.00019986486486486485,
+      "loss": 0.4301,
+      "step": 2233
+    },
+    {
+      "epoch": 0.6697395540565861,
+      "grad_norm": 0.26550740003585815,
+      "learning_rate": 0.0001998198198198198,
+      "loss": 0.4332,
+      "step": 2234
+    },
+    {
+      "epoch": 0.6700393479482856,
+      "grad_norm": 0.2446885108947754,
+      "learning_rate": 0.00019977477477477475,
+      "loss": 0.4207,
+      "step": 2235
+    },
+    {
+      "epoch": 0.670339141839985,
+      "grad_norm": 0.23779337108135223,
+      "learning_rate": 0.0001997297297297297,
+      "loss": 0.432,
+      "step": 2236
+    },
+    {
+      "epoch": 0.6706389357316844,
+      "grad_norm": 0.2619169354438782,
+      "learning_rate": 0.00019968468468468468,
+      "loss": 0.4332,
+      "step": 2237
+    },
+    {
+      "epoch": 0.670938729623384,
+      "grad_norm": 0.258789598941803,
+      "learning_rate": 0.00019963963963963964,
+      "loss": 0.4328,
+      "step": 2238
+    },
+    {
+      "epoch": 0.6712385235150834,
+      "grad_norm": 0.27282267808914185,
+      "learning_rate": 0.00019959459459459458,
+      "loss": 0.4423,
+      "step": 2239
+    },
+    {
+      "epoch": 0.6715383174067828,
+      "grad_norm": 0.2593368887901306,
+      "learning_rate": 0.00019954954954954954,
+      "loss": 0.4473,
+      "step": 2240
+    },
+    {
+      "epoch": 0.6718381112984823,
+      "grad_norm": 0.25508931279182434,
+      "learning_rate": 0.0001995045045045045,
+      "loss": 0.4371,
+      "step": 2241
+    },
+    {
+      "epoch": 0.6721379051901818,
+      "grad_norm": 0.25891220569610596,
+      "learning_rate": 0.00019945945945945944,
+      "loss": 0.4143,
+      "step": 2242
+    },
+    {
+      "epoch": 0.6724376990818812,
+      "grad_norm": 0.2661759853363037,
+      "learning_rate": 0.00019941441441441438,
+      "loss": 0.4585,
+      "step": 2243
+    },
+    {
+      "epoch": 0.6727374929735807,
+      "grad_norm": 0.25630587339401245,
+      "learning_rate": 0.00019936936936936937,
+      "loss": 0.4442,
+      "step": 2244
+    },
+    {
+      "epoch": 0.6730372868652801,
+      "grad_norm": 0.27050697803497314,
+      "learning_rate": 0.0001993243243243243,
+      "loss": 0.4437,
+      "step": 2245
+    },
+    {
+      "epoch": 0.6733370807569796,
+      "grad_norm": 0.2564866244792938,
+      "learning_rate": 0.00019927927927927924,
+      "loss": 0.4352,
+      "step": 2246
+    },
+    {
+      "epoch": 0.673636874648679,
+      "grad_norm": 0.2438693344593048,
+      "learning_rate": 0.00019923423423423423,
+      "loss": 0.4138,
+      "step": 2247
+    },
+    {
+      "epoch": 0.6739366685403785,
+      "grad_norm": 0.26038050651550293,
+      "learning_rate": 0.00019918918918918917,
+      "loss": 0.4309,
+      "step": 2248
+    },
+    {
+      "epoch": 0.6742364624320779,
+      "grad_norm": 0.2429644614458084,
+      "learning_rate": 0.0001991441441441441,
+      "loss": 0.4036,
+      "step": 2249
+    },
+    {
+      "epoch": 0.6745362563237775,
+      "grad_norm": 0.24670763313770294,
+      "learning_rate": 0.0001990990990990991,
+      "loss": 0.4054,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6748360502154769,
+      "grad_norm": 0.2641808092594147,
+      "learning_rate": 0.00019905405405405403,
+      "loss": 0.4431,
+      "step": 2251
+    },
+    {
+      "epoch": 0.6751358441071763,
+      "grad_norm": 0.2601335644721985,
+      "learning_rate": 0.00019900900900900897,
+      "loss": 0.403,
+      "step": 2252
+    },
+    {
+      "epoch": 0.6754356379988757,
+      "grad_norm": 0.2698741555213928,
+      "learning_rate": 0.00019896396396396396,
+      "loss": 0.4328,
+      "step": 2253
+    },
+    {
+      "epoch": 0.6757354318905753,
+      "grad_norm": 0.26246562600135803,
+      "learning_rate": 0.0001989189189189189,
+      "loss": 0.4264,
+      "step": 2254
+    },
+    {
+      "epoch": 0.6760352257822747,
+      "grad_norm": 0.26453712582588196,
+      "learning_rate": 0.00019887387387387384,
+      "loss": 0.4552,
+      "step": 2255
+    },
+    {
+      "epoch": 0.6763350196739741,
+      "grad_norm": 0.24887052178382874,
+      "learning_rate": 0.00019882882882882883,
+      "loss": 0.3918,
+      "step": 2256
+    },
+    {
+      "epoch": 0.6766348135656736,
+      "grad_norm": 0.26789602637290955,
+      "learning_rate": 0.00019878378378378376,
+      "loss": 0.4416,
+      "step": 2257
+    },
+    {
+      "epoch": 0.6769346074573731,
+      "grad_norm": 0.24345183372497559,
+      "learning_rate": 0.0001987387387387387,
+      "loss": 0.4355,
+      "step": 2258
+    },
+    {
+      "epoch": 0.6772344013490725,
+      "grad_norm": 0.2475103735923767,
+      "learning_rate": 0.0001986936936936937,
+      "loss": 0.4277,
+      "step": 2259
+    },
+    {
+      "epoch": 0.677534195240772,
+      "grad_norm": 0.2740587294101715,
+      "learning_rate": 0.00019864864864864863,
+      "loss": 0.4536,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6778339891324714,
+      "grad_norm": 0.23657365143299103,
+      "learning_rate": 0.0001986036036036036,
+      "loss": 0.3887,
+      "step": 2261
+    },
+    {
+      "epoch": 0.6781337830241709,
+      "grad_norm": 0.267630398273468,
+      "learning_rate": 0.00019855855855855856,
+      "loss": 0.443,
+      "step": 2262
+    },
+    {
+      "epoch": 0.6784335769158704,
+      "grad_norm": 0.2708898186683655,
+      "learning_rate": 0.0001985135135135135,
+      "loss": 0.4398,
+      "step": 2263
+    },
+    {
+      "epoch": 0.6787333708075698,
+      "grad_norm": 0.26607415080070496,
+      "learning_rate": 0.00019846846846846846,
+      "loss": 0.4249,
+      "step": 2264
+    },
+    {
+      "epoch": 0.6790331646992692,
+      "grad_norm": 0.2398756742477417,
+      "learning_rate": 0.00019842342342342342,
+      "loss": 0.419,
+      "step": 2265
+    },
+    {
+      "epoch": 0.6793329585909688,
+      "grad_norm": 0.2509295344352722,
+      "learning_rate": 0.00019837837837837836,
+      "loss": 0.3858,
+      "step": 2266
+    },
+    {
+      "epoch": 0.6796327524826682,
+      "grad_norm": 0.30269870162010193,
+      "learning_rate": 0.00019833333333333332,
+      "loss": 0.4564,
+      "step": 2267
+    },
+    {
+      "epoch": 0.6799325463743676,
+      "grad_norm": 0.2576700448989868,
+      "learning_rate": 0.00019828828828828826,
+      "loss": 0.4296,
+      "step": 2268
+    },
+    {
+      "epoch": 0.680232340266067,
+      "grad_norm": 0.29139164090156555,
+      "learning_rate": 0.00019824324324324322,
+      "loss": 0.4583,
+      "step": 2269
+    },
+    {
+      "epoch": 0.6805321341577666,
+      "grad_norm": 0.2578124701976776,
+      "learning_rate": 0.00019819819819819818,
+      "loss": 0.4419,
+      "step": 2270
+    },
+    {
+      "epoch": 0.680831928049466,
+      "grad_norm": 0.2546633780002594,
+      "learning_rate": 0.00019815315315315312,
+      "loss": 0.4311,
+      "step": 2271
+    },
+    {
+      "epoch": 0.6811317219411654,
+      "grad_norm": 0.293409526348114,
+      "learning_rate": 0.00019810810810810809,
+      "loss": 0.4756,
+      "step": 2272
+    },
+    {
+      "epoch": 0.6814315158328649,
+      "grad_norm": 0.249635249376297,
+      "learning_rate": 0.00019806306306306305,
+      "loss": 0.4434,
+      "step": 2273
+    },
+    {
+      "epoch": 0.6817313097245644,
+      "grad_norm": 0.2729664146900177,
+      "learning_rate": 0.00019801801801801799,
+      "loss": 0.4721,
+      "step": 2274
+    },
+    {
+      "epoch": 0.6820311036162638,
+      "grad_norm": 0.24961845576763153,
+      "learning_rate": 0.00019797297297297298,
+      "loss": 0.4124,
+      "step": 2275
+    },
+    {
+      "epoch": 0.6823308975079633,
+      "grad_norm": 0.26508617401123047,
+      "learning_rate": 0.00019792792792792791,
+      "loss": 0.4311,
+      "step": 2276
+    },
+    {
+      "epoch": 0.6826306913996627,
+      "grad_norm": 0.24888217449188232,
+      "learning_rate": 0.00019788288288288285,
+      "loss": 0.4334,
+      "step": 2277
+    },
+    {
+      "epoch": 0.6829304852913622,
+      "grad_norm": 0.2550651431083679,
+      "learning_rate": 0.00019783783783783784,
+      "loss": 0.4289,
+      "step": 2278
+    },
+    {
+      "epoch": 0.6832302791830617,
+      "grad_norm": 0.25816190242767334,
+      "learning_rate": 0.00019779279279279278,
+      "loss": 0.4425,
+      "step": 2279
+    },
+    {
+      "epoch": 0.6835300730747611,
+      "grad_norm": 0.25145018100738525,
+      "learning_rate": 0.00019774774774774772,
+      "loss": 0.4123,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6838298669664605,
+      "grad_norm": 0.24678850173950195,
+      "learning_rate": 0.0001977027027027027,
+      "loss": 0.4309,
+      "step": 2281
+    },
+    {
+      "epoch": 0.68412966085816,
+      "grad_norm": 0.2629925012588501,
+      "learning_rate": 0.00019765765765765764,
+      "loss": 0.4184,
+      "step": 2282
+    },
+    {
+      "epoch": 0.6844294547498595,
+      "grad_norm": 0.2568414807319641,
+      "learning_rate": 0.00019761261261261258,
+      "loss": 0.4164,
+      "step": 2283
+    },
+    {
+      "epoch": 0.6847292486415589,
+      "grad_norm": 0.25906744599342346,
+      "learning_rate": 0.00019756756756756757,
+      "loss": 0.4547,
+      "step": 2284
+    },
+    {
+      "epoch": 0.6850290425332584,
+      "grad_norm": 0.2697434723377228,
+      "learning_rate": 0.0001975225225225225,
+      "loss": 0.4444,
+      "step": 2285
+    },
+    {
+      "epoch": 0.6853288364249578,
+      "grad_norm": 0.2573794424533844,
+      "learning_rate": 0.00019747747747747744,
+      "loss": 0.4309,
+      "step": 2286
+    },
+    {
+      "epoch": 0.6856286303166573,
+      "grad_norm": 0.2532881796360016,
+      "learning_rate": 0.00019743243243243244,
+      "loss": 0.4172,
+      "step": 2287
+    },
+    {
+      "epoch": 0.6859284242083568,
+      "grad_norm": 0.253292977809906,
+      "learning_rate": 0.00019738738738738737,
+      "loss": 0.4305,
+      "step": 2288
+    },
+    {
+      "epoch": 0.6862282181000562,
+      "grad_norm": 0.246769517660141,
+      "learning_rate": 0.0001973423423423423,
+      "loss": 0.4106,
+      "step": 2289
+    },
+    {
+      "epoch": 0.6865280119917556,
+      "grad_norm": 0.2593647539615631,
+      "learning_rate": 0.0001972972972972973,
+      "loss": 0.4537,
+      "step": 2290
+    },
+    {
+      "epoch": 0.6868278058834552,
+      "grad_norm": 0.25611796975135803,
+      "learning_rate": 0.00019725225225225224,
+      "loss": 0.43,
+      "step": 2291
+    },
+    {
+      "epoch": 0.6871275997751546,
+      "grad_norm": 0.25119319558143616,
+      "learning_rate": 0.00019720720720720717,
+      "loss": 0.4142,
+      "step": 2292
+    },
+    {
+      "epoch": 0.687427393666854,
+      "grad_norm": 0.250675231218338,
+      "learning_rate": 0.00019716216216216214,
+      "loss": 0.4134,
+      "step": 2293
+    },
+    {
+      "epoch": 0.6877271875585534,
+      "grad_norm": 0.2680164873600006,
+      "learning_rate": 0.0001971171171171171,
+      "loss": 0.4201,
+      "step": 2294
+    },
+    {
+      "epoch": 0.688026981450253,
+      "grad_norm": 0.26599201560020447,
+      "learning_rate": 0.00019707207207207206,
+      "loss": 0.45,
+      "step": 2295
+    },
+    {
+      "epoch": 0.6883267753419524,
+      "grad_norm": 0.24248278141021729,
+      "learning_rate": 0.000197027027027027,
+      "loss": 0.4129,
+      "step": 2296
+    },
+    {
+      "epoch": 0.6886265692336518,
+      "grad_norm": 0.25668129324913025,
+      "learning_rate": 0.00019698198198198197,
+      "loss": 0.4354,
+      "step": 2297
+    },
+    {
+      "epoch": 0.6889263631253513,
+      "grad_norm": 0.26304370164871216,
+      "learning_rate": 0.00019693693693693693,
+      "loss": 0.4423,
+      "step": 2298
+    },
+    {
+      "epoch": 0.6892261570170508,
+      "grad_norm": 0.2509578466415405,
+      "learning_rate": 0.00019689189189189187,
+      "loss": 0.4263,
+      "step": 2299
+    },
+    {
+      "epoch": 0.6895259509087502,
+      "grad_norm": 0.2629247009754181,
+      "learning_rate": 0.00019684684684684683,
+      "loss": 0.4323,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6898257448004497,
+      "grad_norm": 0.24706493318080902,
+      "learning_rate": 0.0001968018018018018,
+      "loss": 0.3913,
+      "step": 2301
+    },
+    {
+      "epoch": 0.6901255386921491,
+      "grad_norm": 0.29551559686660767,
+      "learning_rate": 0.00019675675675675673,
+      "loss": 0.419,
+      "step": 2302
+    },
+    {
+      "epoch": 0.6904253325838486,
+      "grad_norm": 0.2612929046154022,
+      "learning_rate": 0.0001967117117117117,
+      "loss": 0.4223,
+      "step": 2303
+    },
+    {
+      "epoch": 0.6907251264755481,
+      "grad_norm": 0.28399109840393066,
+      "learning_rate": 0.00019666666666666666,
+      "loss": 0.4715,
+      "step": 2304
+    },
+    {
+      "epoch": 0.6910249203672475,
+      "grad_norm": 0.24555319547653198,
+      "learning_rate": 0.0001966216216216216,
+      "loss": 0.4559,
+      "step": 2305
+    },
+    {
+      "epoch": 0.6913247142589469,
+      "grad_norm": 0.2576359808444977,
+      "learning_rate": 0.00019657657657657656,
+      "loss": 0.4314,
+      "step": 2306
+    },
+    {
+      "epoch": 0.6916245081506465,
+      "grad_norm": 0.25595325231552124,
+      "learning_rate": 0.00019653153153153152,
+      "loss": 0.4286,
+      "step": 2307
+    },
+    {
+      "epoch": 0.6919243020423459,
+      "grad_norm": 0.23903168737888336,
+      "learning_rate": 0.00019648648648648646,
+      "loss": 0.4259,
+      "step": 2308
+    },
+    {
+      "epoch": 0.6922240959340453,
+      "grad_norm": 0.2797984480857849,
+      "learning_rate": 0.00019644144144144145,
+      "loss": 0.4621,
+      "step": 2309
+    },
+    {
+      "epoch": 0.6925238898257448,
+      "grad_norm": 0.25375935435295105,
+      "learning_rate": 0.0001963963963963964,
+      "loss": 0.4356,
+      "step": 2310
+    },
+    {
+      "epoch": 0.6928236837174443,
+      "grad_norm": 0.2765314280986786,
+      "learning_rate": 0.00019635135135135132,
+      "loss": 0.4474,
+      "step": 2311
+    },
+    {
+      "epoch": 0.6931234776091437,
+      "grad_norm": 0.23902222514152527,
+      "learning_rate": 0.00019630630630630632,
+      "loss": 0.4176,
+      "step": 2312
+    },
+    {
+      "epoch": 0.6934232715008432,
+      "grad_norm": 0.278622031211853,
+      "learning_rate": 0.00019626126126126125,
+      "loss": 0.4177,
+      "step": 2313
+    },
+    {
+      "epoch": 0.6937230653925426,
+      "grad_norm": 0.25161993503570557,
+      "learning_rate": 0.0001962162162162162,
+      "loss": 0.4235,
+      "step": 2314
+    },
+    {
+      "epoch": 0.6940228592842421,
+      "grad_norm": 0.28174108266830444,
+      "learning_rate": 0.00019617117117117113,
+      "loss": 0.4469,
+      "step": 2315
+    },
+    {
+      "epoch": 0.6943226531759416,
+      "grad_norm": 0.24297156929969788,
+      "learning_rate": 0.00019612612612612612,
+      "loss": 0.4348,
+      "step": 2316
+    },
+    {
+      "epoch": 0.694622447067641,
+      "grad_norm": 0.2582569122314453,
+      "learning_rate": 0.00019608108108108105,
+      "loss": 0.4373,
+      "step": 2317
+    },
+    {
+      "epoch": 0.6949222409593404,
+      "grad_norm": 0.2808705270290375,
+      "learning_rate": 0.00019603603603603602,
+      "loss": 0.4574,
+      "step": 2318
+    },
+    {
+      "epoch": 0.69522203485104,
+      "grad_norm": 0.27071914076805115,
+      "learning_rate": 0.00019599099099099098,
+      "loss": 0.4449,
+      "step": 2319
+    },
+    {
+      "epoch": 0.6955218287427394,
+      "grad_norm": 0.27735450863838196,
+      "learning_rate": 0.00019594594594594592,
+      "loss": 0.4154,
+      "step": 2320
+    },
+    {
+      "epoch": 0.6958216226344388,
+      "grad_norm": 0.25535905361175537,
+      "learning_rate": 0.00019590090090090088,
+      "loss": 0.4432,
+      "step": 2321
+    },
+    {
+      "epoch": 0.6961214165261382,
+      "grad_norm": 0.24208863079547882,
+      "learning_rate": 0.00019585585585585585,
+      "loss": 0.4248,
+      "step": 2322
+    },
+    {
+      "epoch": 0.6964212104178378,
+      "grad_norm": 0.26040393114089966,
+      "learning_rate": 0.00019581081081081078,
+      "loss": 0.4215,
+      "step": 2323
+    },
+    {
+      "epoch": 0.6967210043095372,
+      "grad_norm": 0.24389687180519104,
+      "learning_rate": 0.00019576576576576575,
+      "loss": 0.422,
+      "step": 2324
+    },
+    {
+      "epoch": 0.6970207982012366,
+      "grad_norm": 0.2545843720436096,
+      "learning_rate": 0.0001957207207207207,
+      "loss": 0.4384,
+      "step": 2325
+    },
+    {
+      "epoch": 0.6973205920929361,
+      "grad_norm": 0.2566373348236084,
+      "learning_rate": 0.00019567567567567565,
+      "loss": 0.4383,
+      "step": 2326
+    },
+    {
+      "epoch": 0.6976203859846356,
+      "grad_norm": 0.2538570463657379,
+      "learning_rate": 0.0001956306306306306,
+      "loss": 0.4285,
+      "step": 2327
+    },
+    {
+      "epoch": 0.697920179876335,
+      "grad_norm": 0.25821006298065186,
+      "learning_rate": 0.00019558558558558557,
+      "loss": 0.4304,
+      "step": 2328
+    },
+    {
+      "epoch": 0.6982199737680345,
+      "grad_norm": 0.26139143109321594,
+      "learning_rate": 0.0001955405405405405,
+      "loss": 0.4416,
+      "step": 2329
+    },
+    {
+      "epoch": 0.6985197676597339,
+      "grad_norm": 0.2557656168937683,
+      "learning_rate": 0.00019549549549549548,
+      "loss": 0.4166,
+      "step": 2330
+    },
+    {
+      "epoch": 0.6988195615514334,
+      "grad_norm": 0.2611480951309204,
+      "learning_rate": 0.00019545045045045044,
+      "loss": 0.4463,
+      "step": 2331
+    },
+    {
+      "epoch": 0.6991193554431329,
+      "grad_norm": 0.24384301900863647,
+      "learning_rate": 0.0001954054054054054,
+      "loss": 0.4027,
+      "step": 2332
+    },
+    {
+      "epoch": 0.6994191493348323,
+      "grad_norm": 0.2693532407283783,
+      "learning_rate": 0.00019536036036036034,
+      "loss": 0.4449,
+      "step": 2333
+    },
+    {
+      "epoch": 0.6997189432265317,
+      "grad_norm": 0.2669787108898163,
+      "learning_rate": 0.0001953153153153153,
+      "loss": 0.4266,
+      "step": 2334
+    },
+    {
+      "epoch": 0.7000187371182313,
+      "grad_norm": 0.23384937644004822,
+      "learning_rate": 0.00019527027027027027,
+      "loss": 0.4103,
+      "step": 2335
+    },
+    {
+      "epoch": 0.7003185310099307,
+      "grad_norm": 0.2738743722438812,
+      "learning_rate": 0.0001952252252252252,
+      "loss": 0.4664,
+      "step": 2336
+    },
+    {
+      "epoch": 0.7006183249016301,
+      "grad_norm": 0.2557884752750397,
+      "learning_rate": 0.00019518018018018017,
+      "loss": 0.4121,
+      "step": 2337
+    },
+    {
+      "epoch": 0.7009181187933295,
+      "grad_norm": 0.24830694496631622,
+      "learning_rate": 0.00019513513513513513,
+      "loss": 0.4,
+      "step": 2338
+    },
+    {
+      "epoch": 0.7012179126850291,
+      "grad_norm": 0.2636083960533142,
+      "learning_rate": 0.00019509009009009007,
+      "loss": 0.4418,
+      "step": 2339
+    },
+    {
+      "epoch": 0.7015177065767285,
+      "grad_norm": 0.252029687166214,
+      "learning_rate": 0.000195045045045045,
+      "loss": 0.4205,
+      "step": 2340
+    },
+    {
+      "epoch": 0.701817500468428,
+      "grad_norm": 0.25448256731033325,
+      "learning_rate": 0.000195,
+      "loss": 0.4517,
+      "step": 2341
+    },
+    {
+      "epoch": 0.7021172943601274,
+      "grad_norm": 0.26609039306640625,
+      "learning_rate": 0.00019495495495495493,
+      "loss": 0.4213,
+      "step": 2342
+    },
+    {
+      "epoch": 0.7024170882518269,
+      "grad_norm": 0.2746337652206421,
+      "learning_rate": 0.00019490990990990987,
+      "loss": 0.4183,
+      "step": 2343
+    },
+    {
+      "epoch": 0.7027168821435263,
+      "grad_norm": 0.2514724135398865,
+      "learning_rate": 0.00019486486486486486,
+      "loss": 0.4329,
+      "step": 2344
+    },
+    {
+      "epoch": 0.7030166760352258,
+      "grad_norm": 0.27683892846107483,
+      "learning_rate": 0.0001948198198198198,
+      "loss": 0.4207,
+      "step": 2345
+    },
+    {
+      "epoch": 0.7033164699269252,
+      "grad_norm": 0.2525181174278259,
+      "learning_rate": 0.00019477477477477473,
+      "loss": 0.4193,
+      "step": 2346
+    },
+    {
+      "epoch": 0.7036162638186247,
+      "grad_norm": 0.2759072482585907,
+      "learning_rate": 0.00019472972972972973,
+      "loss": 0.4508,
+      "step": 2347
+    },
+    {
+      "epoch": 0.7039160577103242,
+      "grad_norm": 0.2594849169254303,
+      "learning_rate": 0.00019468468468468466,
+      "loss": 0.4289,
+      "step": 2348
+    },
+    {
+      "epoch": 0.7042158516020236,
+      "grad_norm": 0.26971113681793213,
+      "learning_rate": 0.0001946396396396396,
+      "loss": 0.4514,
+      "step": 2349
+    },
+    {
+      "epoch": 0.704515645493723,
+      "grad_norm": 0.25291457772254944,
+      "learning_rate": 0.0001945945945945946,
+      "loss": 0.4664,
+      "step": 2350
+    },
+    {
+      "epoch": 0.7048154393854226,
+      "grad_norm": 0.2617851495742798,
+      "learning_rate": 0.00019454954954954953,
+      "loss": 0.4301,
+      "step": 2351
+    },
+    {
+      "epoch": 0.705115233277122,
+      "grad_norm": 0.24216975271701813,
+      "learning_rate": 0.00019450450450450446,
+      "loss": 0.4144,
+      "step": 2352
+    },
+    {
+      "epoch": 0.7054150271688214,
+      "grad_norm": 0.2737904489040375,
+      "learning_rate": 0.00019445945945945945,
+      "loss": 0.417,
+      "step": 2353
+    },
+    {
+      "epoch": 0.7057148210605209,
+      "grad_norm": 0.27587682008743286,
+      "learning_rate": 0.0001944144144144144,
+      "loss": 0.4421,
+      "step": 2354
+    },
+    {
+      "epoch": 0.7060146149522204,
+      "grad_norm": 0.24917447566986084,
+      "learning_rate": 0.00019436936936936936,
+      "loss": 0.406,
+      "step": 2355
+    },
+    {
+      "epoch": 0.7063144088439198,
+      "grad_norm": 0.27958497405052185,
+      "learning_rate": 0.00019432432432432432,
+      "loss": 0.4591,
+      "step": 2356
+    },
+    {
+      "epoch": 0.7066142027356193,
+      "grad_norm": 0.27273818850517273,
+      "learning_rate": 0.00019427927927927926,
+      "loss": 0.4219,
+      "step": 2357
+    },
+    {
+      "epoch": 0.7069139966273187,
+      "grad_norm": 0.24517607688903809,
+      "learning_rate": 0.00019423423423423422,
+      "loss": 0.4063,
+      "step": 2358
+    },
+    {
+      "epoch": 0.7072137905190182,
+      "grad_norm": 0.28854820132255554,
+      "learning_rate": 0.00019418918918918918,
+      "loss": 0.4383,
+      "step": 2359
+    },
+    {
+      "epoch": 0.7075135844107177,
+      "grad_norm": 0.27321329712867737,
+      "learning_rate": 0.00019414414414414412,
+      "loss": 0.4479,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7078133783024171,
+      "grad_norm": 0.2749727666378021,
+      "learning_rate": 0.00019409909909909908,
+      "loss": 0.4272,
+      "step": 2361
+    },
+    {
+      "epoch": 0.7081131721941165,
+      "grad_norm": 0.27384045720100403,
+      "learning_rate": 0.00019405405405405405,
+      "loss": 0.444,
+      "step": 2362
+    },
+    {
+      "epoch": 0.708412966085816,
+      "grad_norm": 0.2604135274887085,
+      "learning_rate": 0.00019400900900900898,
+      "loss": 0.4248,
+      "step": 2363
+    },
+    {
+      "epoch": 0.7087127599775155,
+      "grad_norm": 0.2598932385444641,
+      "learning_rate": 0.00019396396396396395,
+      "loss": 0.4401,
+      "step": 2364
+    },
+    {
+      "epoch": 0.7090125538692149,
+      "grad_norm": 0.253755122423172,
+      "learning_rate": 0.00019391891891891889,
+      "loss": 0.4316,
+      "step": 2365
+    },
+    {
+      "epoch": 0.7093123477609143,
+      "grad_norm": 0.2677047848701477,
+      "learning_rate": 0.00019387387387387388,
+      "loss": 0.4264,
+      "step": 2366
+    },
+    {
+      "epoch": 0.7096121416526139,
+      "grad_norm": 0.24191899597644806,
+      "learning_rate": 0.0001938288288288288,
+      "loss": 0.4172,
+      "step": 2367
+    },
+    {
+      "epoch": 0.7099119355443133,
+      "grad_norm": 0.2684822082519531,
+      "learning_rate": 0.00019378378378378375,
+      "loss": 0.4466,
+      "step": 2368
+    },
+    {
+      "epoch": 0.7102117294360127,
+      "grad_norm": 0.2859225273132324,
+      "learning_rate": 0.00019373873873873874,
+      "loss": 0.4348,
+      "step": 2369
+    },
+    {
+      "epoch": 0.7105115233277122,
+      "grad_norm": 0.2676656246185303,
+      "learning_rate": 0.00019369369369369368,
+      "loss": 0.4322,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7108113172194117,
+      "grad_norm": 0.27145159244537354,
+      "learning_rate": 0.00019364864864864861,
+      "loss": 0.4485,
+      "step": 2371
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.2646559178829193,
+      "learning_rate": 0.0001936036036036036,
+      "loss": 0.4255,
+      "step": 2372
+    },
+    {
+      "epoch": 0.7114109050028106,
+      "grad_norm": 0.26495832204818726,
+      "learning_rate": 0.00019355855855855854,
+      "loss": 0.4681,
+      "step": 2373
+    },
+    {
+      "epoch": 0.71171069889451,
+      "grad_norm": 0.26360276341438293,
+      "learning_rate": 0.00019351351351351348,
+      "loss": 0.4219,
+      "step": 2374
+    },
+    {
+      "epoch": 0.7120104927862094,
+      "grad_norm": 0.25372758507728577,
+      "learning_rate": 0.00019346846846846847,
+      "loss": 0.4542,
+      "step": 2375
+    },
+    {
+      "epoch": 0.712310286677909,
+      "grad_norm": 0.25859972834587097,
+      "learning_rate": 0.0001934234234234234,
+      "loss": 0.4121,
+      "step": 2376
+    },
+    {
+      "epoch": 0.7126100805696084,
+      "grad_norm": 0.2402067929506302,
+      "learning_rate": 0.00019337837837837834,
+      "loss": 0.4132,
+      "step": 2377
+    },
+    {
+      "epoch": 0.7129098744613078,
+      "grad_norm": 0.25183209776878357,
+      "learning_rate": 0.00019333333333333333,
+      "loss": 0.4106,
+      "step": 2378
+    },
+    {
+      "epoch": 0.7132096683530073,
+      "grad_norm": 0.24842600524425507,
+      "learning_rate": 0.00019328828828828827,
+      "loss": 0.3895,
+      "step": 2379
+    },
+    {
+      "epoch": 0.7135094622447068,
+      "grad_norm": 0.2635684609413147,
+      "learning_rate": 0.0001932432432432432,
+      "loss": 0.4112,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7138092561364062,
+      "grad_norm": 0.2578394412994385,
+      "learning_rate": 0.0001931981981981982,
+      "loss": 0.4429,
+      "step": 2381
+    },
+    {
+      "epoch": 0.7141090500281057,
+      "grad_norm": 0.2912173867225647,
+      "learning_rate": 0.00019315315315315314,
+      "loss": 0.4256,
+      "step": 2382
+    },
+    {
+      "epoch": 0.7144088439198051,
+      "grad_norm": 0.2592369019985199,
+      "learning_rate": 0.00019310810810810807,
+      "loss": 0.4357,
+      "step": 2383
+    },
+    {
+      "epoch": 0.7147086378115046,
+      "grad_norm": 0.25853514671325684,
+      "learning_rate": 0.00019306306306306306,
+      "loss": 0.3816,
+      "step": 2384
+    },
+    {
+      "epoch": 0.715008431703204,
+      "grad_norm": 0.26601412892341614,
+      "learning_rate": 0.000193018018018018,
+      "loss": 0.4344,
+      "step": 2385
+    },
+    {
+      "epoch": 0.7153082255949035,
+      "grad_norm": 0.2789902091026306,
+      "learning_rate": 0.00019297297297297294,
+      "loss": 0.435,
+      "step": 2386
+    },
+    {
+      "epoch": 0.7156080194866029,
+      "grad_norm": 0.285659521818161,
+      "learning_rate": 0.0001929279279279279,
+      "loss": 0.4194,
+      "step": 2387
+    },
+    {
+      "epoch": 0.7159078133783024,
+      "grad_norm": 0.24971207976341248,
+      "learning_rate": 0.00019288288288288286,
+      "loss": 0.4322,
+      "step": 2388
+    },
+    {
+      "epoch": 0.7162076072700019,
+      "grad_norm": 0.30142828822135925,
+      "learning_rate": 0.00019283783783783783,
+      "loss": 0.415,
+      "step": 2389
+    },
+    {
+      "epoch": 0.7165074011617013,
+      "grad_norm": 0.2843458950519562,
+      "learning_rate": 0.00019279279279279277,
+      "loss": 0.4605,
+      "step": 2390
+    },
+    {
+      "epoch": 0.7168071950534007,
+      "grad_norm": 0.24795542657375336,
+      "learning_rate": 0.00019274774774774773,
+      "loss": 0.4147,
+      "step": 2391
+    },
+    {
+      "epoch": 0.7171069889451003,
+      "grad_norm": 0.2908228635787964,
+      "learning_rate": 0.0001927027027027027,
+      "loss": 0.4573,
+      "step": 2392
+    },
+    {
+      "epoch": 0.7174067828367997,
+      "grad_norm": 0.2770592272281647,
+      "learning_rate": 0.00019265765765765763,
+      "loss": 0.4248,
+      "step": 2393
+    },
+    {
+      "epoch": 0.7177065767284991,
+      "grad_norm": 0.2638701796531677,
+      "learning_rate": 0.0001926126126126126,
+      "loss": 0.4409,
+      "step": 2394
+    },
+    {
+      "epoch": 0.7180063706201986,
+      "grad_norm": 0.2952423393726349,
+      "learning_rate": 0.00019256756756756756,
+      "loss": 0.4651,
+      "step": 2395
+    },
+    {
+      "epoch": 0.7183061645118981,
+      "grad_norm": 0.2776016294956207,
+      "learning_rate": 0.0001925225225225225,
+      "loss": 0.4453,
+      "step": 2396
+    },
+    {
+      "epoch": 0.7186059584035975,
+      "grad_norm": 0.2560594081878662,
+      "learning_rate": 0.00019247747747747746,
+      "loss": 0.4441,
+      "step": 2397
+    },
+    {
+      "epoch": 0.718905752295297,
+      "grad_norm": 0.2397725135087967,
+      "learning_rate": 0.00019243243243243242,
+      "loss": 0.3933,
+      "step": 2398
+    },
+    {
+      "epoch": 0.7192055461869964,
+      "grad_norm": 0.2630067467689514,
+      "learning_rate": 0.00019238738738738736,
+      "loss": 0.4334,
+      "step": 2399
+    },
+    {
+      "epoch": 0.7195053400786959,
+      "grad_norm": 0.2648056149482727,
+      "learning_rate": 0.00019234234234234235,
+      "loss": 0.4456,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7198051339703954,
+      "grad_norm": 0.27124911546707153,
+      "learning_rate": 0.0001922972972972973,
+      "loss": 0.4324,
+      "step": 2401
+    },
+    {
+      "epoch": 0.7201049278620948,
+      "grad_norm": 0.2809448540210724,
+      "learning_rate": 0.00019225225225225222,
+      "loss": 0.4419,
+      "step": 2402
+    },
+    {
+      "epoch": 0.7204047217537942,
+      "grad_norm": 0.2577565610408783,
+      "learning_rate": 0.00019220720720720721,
+      "loss": 0.441,
+      "step": 2403
+    },
+    {
+      "epoch": 0.7207045156454938,
+      "grad_norm": 0.261111855506897,
+      "learning_rate": 0.00019216216216216215,
+      "loss": 0.4234,
+      "step": 2404
+    },
+    {
+      "epoch": 0.7210043095371932,
+      "grad_norm": 0.2598218321800232,
+      "learning_rate": 0.0001921171171171171,
+      "loss": 0.4183,
+      "step": 2405
+    },
+    {
+      "epoch": 0.7213041034288926,
+      "grad_norm": 0.2712027132511139,
+      "learning_rate": 0.00019207207207207208,
+      "loss": 0.4354,
+      "step": 2406
+    },
+    {
+      "epoch": 0.721603897320592,
+      "grad_norm": 0.27912774682044983,
+      "learning_rate": 0.00019202702702702702,
+      "loss": 0.4594,
+      "step": 2407
+    },
+    {
+      "epoch": 0.7219036912122916,
+      "grad_norm": 0.25328657031059265,
+      "learning_rate": 0.00019198198198198195,
+      "loss": 0.408,
+      "step": 2408
+    },
+    {
+      "epoch": 0.722203485103991,
+      "grad_norm": 0.2425694465637207,
+      "learning_rate": 0.00019193693693693694,
+      "loss": 0.4197,
+      "step": 2409
+    },
+    {
+      "epoch": 0.7225032789956904,
+      "grad_norm": 0.2552039623260498,
+      "learning_rate": 0.00019189189189189188,
+      "loss": 0.4383,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7228030728873899,
+      "grad_norm": 0.26128917932510376,
+      "learning_rate": 0.00019184684684684682,
+      "loss": 0.4204,
+      "step": 2411
+    },
+    {
+      "epoch": 0.7231028667790894,
+      "grad_norm": 0.2713935077190399,
+      "learning_rate": 0.00019180180180180178,
+      "loss": 0.4325,
+      "step": 2412
+    },
+    {
+      "epoch": 0.7234026606707888,
+      "grad_norm": 0.257618248462677,
+      "learning_rate": 0.00019175675675675674,
+      "loss": 0.4443,
+      "step": 2413
+    },
+    {
+      "epoch": 0.7237024545624883,
+      "grad_norm": 0.2597353458404541,
+      "learning_rate": 0.00019171171171171168,
+      "loss": 0.4415,
+      "step": 2414
+    },
+    {
+      "epoch": 0.7240022484541877,
+      "grad_norm": 0.26509061455726624,
+      "learning_rate": 0.00019166666666666665,
+      "loss": 0.4711,
+      "step": 2415
+    },
+    {
+      "epoch": 0.7243020423458872,
+      "grad_norm": 0.26354658603668213,
+      "learning_rate": 0.0001916216216216216,
+      "loss": 0.4341,
+      "step": 2416
+    },
+    {
+      "epoch": 0.7246018362375867,
+      "grad_norm": 0.2364490032196045,
+      "learning_rate": 0.00019157657657657655,
+      "loss": 0.4016,
+      "step": 2417
+    },
+    {
+      "epoch": 0.7249016301292861,
+      "grad_norm": 0.24982942640781403,
+      "learning_rate": 0.0001915315315315315,
+      "loss": 0.427,
+      "step": 2418
+    },
+    {
+      "epoch": 0.7252014240209855,
+      "grad_norm": 0.27166748046875,
+      "learning_rate": 0.00019148648648648647,
+      "loss": 0.4807,
+      "step": 2419
+    },
+    {
+      "epoch": 0.7255012179126851,
+      "grad_norm": 0.24789117276668549,
+      "learning_rate": 0.0001914414414414414,
+      "loss": 0.4265,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7258010118043845,
+      "grad_norm": 0.2433491349220276,
+      "learning_rate": 0.00019139639639639637,
+      "loss": 0.4405,
+      "step": 2421
+    },
+    {
+      "epoch": 0.7261008056960839,
+      "grad_norm": 0.24121679365634918,
+      "learning_rate": 0.00019135135135135134,
+      "loss": 0.4215,
+      "step": 2422
+    },
+    {
+      "epoch": 0.7264005995877834,
+      "grad_norm": 0.25895169377326965,
+      "learning_rate": 0.0001913063063063063,
+      "loss": 0.4168,
+      "step": 2423
+    },
+    {
+      "epoch": 0.7267003934794829,
+      "grad_norm": 0.24981217086315155,
+      "learning_rate": 0.00019126126126126124,
+      "loss": 0.4268,
+      "step": 2424
+    },
+    {
+      "epoch": 0.7270001873711823,
+      "grad_norm": 0.25490307807922363,
+      "learning_rate": 0.0001912162162162162,
+      "loss": 0.4464,
+      "step": 2425
+    },
+    {
+      "epoch": 0.7272999812628818,
+      "grad_norm": 0.2552802562713623,
+      "learning_rate": 0.00019117117117117117,
+      "loss": 0.4461,
+      "step": 2426
+    },
+    {
+      "epoch": 0.7275997751545812,
+      "grad_norm": 0.27454614639282227,
+      "learning_rate": 0.0001911261261261261,
+      "loss": 0.4683,
+      "step": 2427
+    },
+    {
+      "epoch": 0.7278995690462807,
+      "grad_norm": 0.2501683831214905,
+      "learning_rate": 0.00019108108108108107,
+      "loss": 0.4244,
+      "step": 2428
+    },
+    {
+      "epoch": 0.7281993629379802,
+      "grad_norm": 0.24820026755332947,
+      "learning_rate": 0.00019103603603603603,
+      "loss": 0.4416,
+      "step": 2429
+    },
+    {
+      "epoch": 0.7284991568296796,
+      "grad_norm": 0.25755947828292847,
+      "learning_rate": 0.00019099099099099097,
+      "loss": 0.4377,
+      "step": 2430
+    },
+    {
+      "epoch": 0.728798950721379,
+      "grad_norm": 0.268839031457901,
+      "learning_rate": 0.00019094594594594593,
+      "loss": 0.46,
+      "step": 2431
+    },
+    {
+      "epoch": 0.7290987446130786,
+      "grad_norm": 0.2707115113735199,
+      "learning_rate": 0.0001909009009009009,
+      "loss": 0.4375,
+      "step": 2432
+    },
+    {
+      "epoch": 0.729398538504778,
+      "grad_norm": 0.25406280159950256,
+      "learning_rate": 0.00019085585585585583,
+      "loss": 0.441,
+      "step": 2433
+    },
+    {
+      "epoch": 0.7296983323964774,
+      "grad_norm": 0.2569238841533661,
+      "learning_rate": 0.00019081081081081082,
+      "loss": 0.4446,
+      "step": 2434
+    },
+    {
+      "epoch": 0.7299981262881768,
+      "grad_norm": 0.2784389555454254,
+      "learning_rate": 0.00019076576576576576,
+      "loss": 0.4396,
+      "step": 2435
+    },
+    {
+      "epoch": 0.7302979201798764,
+      "grad_norm": 0.25094011425971985,
+      "learning_rate": 0.0001907207207207207,
+      "loss": 0.4189,
+      "step": 2436
+    },
+    {
+      "epoch": 0.7305977140715758,
+      "grad_norm": 0.2696321904659271,
+      "learning_rate": 0.00019067567567567563,
+      "loss": 0.4575,
+      "step": 2437
+    },
+    {
+      "epoch": 0.7308975079632752,
+      "grad_norm": 0.2526487112045288,
+      "learning_rate": 0.00019063063063063062,
+      "loss": 0.4081,
+      "step": 2438
+    },
+    {
+      "epoch": 0.7311973018549747,
+      "grad_norm": 0.2577098608016968,
+      "learning_rate": 0.00019058558558558556,
+      "loss": 0.4478,
+      "step": 2439
+    },
+    {
+      "epoch": 0.7314970957466742,
+      "grad_norm": 0.26381629705429077,
+      "learning_rate": 0.0001905405405405405,
+      "loss": 0.4573,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7317968896383736,
+      "grad_norm": 0.26065248250961304,
+      "learning_rate": 0.0001904954954954955,
+      "loss": 0.4413,
+      "step": 2441
+    },
+    {
+      "epoch": 0.7320966835300731,
+      "grad_norm": 0.24994462728500366,
+      "learning_rate": 0.00019045045045045043,
+      "loss": 0.4016,
+      "step": 2442
+    },
+    {
+      "epoch": 0.7323964774217725,
+      "grad_norm": 0.2742599844932556,
+      "learning_rate": 0.00019040540540540536,
+      "loss": 0.4578,
+      "step": 2443
+    },
+    {
+      "epoch": 0.732696271313472,
+      "grad_norm": 0.23844504356384277,
+      "learning_rate": 0.00019036036036036035,
+      "loss": 0.4078,
+      "step": 2444
+    },
+    {
+      "epoch": 0.7329960652051715,
+      "grad_norm": 0.2562139630317688,
+      "learning_rate": 0.0001903153153153153,
+      "loss": 0.4363,
+      "step": 2445
+    },
+    {
+      "epoch": 0.7332958590968709,
+      "grad_norm": 0.2525213956832886,
+      "learning_rate": 0.00019027027027027025,
+      "loss": 0.4585,
+      "step": 2446
+    },
+    {
+      "epoch": 0.7335956529885703,
+      "grad_norm": 0.2562119662761688,
+      "learning_rate": 0.00019022522522522522,
+      "loss": 0.4158,
+      "step": 2447
+    },
+    {
+      "epoch": 0.7338954468802699,
+      "grad_norm": 0.2771570682525635,
+      "learning_rate": 0.00019018018018018015,
+      "loss": 0.4601,
+      "step": 2448
+    },
+    {
+      "epoch": 0.7341952407719693,
+      "grad_norm": 0.2594900131225586,
+      "learning_rate": 0.00019013513513513512,
+      "loss": 0.4252,
+      "step": 2449
+    },
+    {
+      "epoch": 0.7344950346636687,
+      "grad_norm": 0.27634164690971375,
+      "learning_rate": 0.00019009009009009008,
+      "loss": 0.4614,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7347948285553682,
+      "grad_norm": 0.27118000388145447,
+      "learning_rate": 0.00019004504504504502,
+      "loss": 0.425,
+      "step": 2451
+    },
+    {
+      "epoch": 0.7350946224470677,
+      "grad_norm": 0.26404282450675964,
+      "learning_rate": 0.00018999999999999998,
+      "loss": 0.435,
+      "step": 2452
+    },
+    {
+      "epoch": 0.7353944163387671,
+      "grad_norm": 0.24286137521266937,
+      "learning_rate": 0.00018995495495495495,
+      "loss": 0.4099,
+      "step": 2453
+    },
+    {
+      "epoch": 0.7356942102304665,
+      "grad_norm": 0.2554706335067749,
+      "learning_rate": 0.00018990990990990988,
+      "loss": 0.4031,
+      "step": 2454
+    },
+    {
+      "epoch": 0.735994004122166,
+      "grad_norm": 0.2666279375553131,
+      "learning_rate": 0.00018986486486486485,
+      "loss": 0.4397,
+      "step": 2455
+    },
+    {
+      "epoch": 0.7362937980138655,
+      "grad_norm": 0.24479645490646362,
+      "learning_rate": 0.0001898198198198198,
+      "loss": 0.4059,
+      "step": 2456
+    },
+    {
+      "epoch": 0.736593591905565,
+      "grad_norm": 0.27331724762916565,
+      "learning_rate": 0.00018977477477477478,
+      "loss": 0.4337,
+      "step": 2457
+    },
+    {
+      "epoch": 0.7368933857972644,
+      "grad_norm": 0.2546418309211731,
+      "learning_rate": 0.0001897297297297297,
+      "loss": 0.4206,
+      "step": 2458
+    },
+    {
+      "epoch": 0.7371931796889638,
+      "grad_norm": 0.2593313753604889,
+      "learning_rate": 0.00018968468468468468,
+      "loss": 0.39,
+      "step": 2459
+    },
+    {
+      "epoch": 0.7374929735806633,
+      "grad_norm": 0.2757156789302826,
+      "learning_rate": 0.00018963963963963964,
+      "loss": 0.441,
+      "step": 2460
+    },
+    {
+      "epoch": 0.7377927674723628,
+      "grad_norm": 0.2826617956161499,
+      "learning_rate": 0.00018959459459459458,
+      "loss": 0.4497,
+      "step": 2461
+    },
+    {
+      "epoch": 0.7380925613640622,
+      "grad_norm": 0.26498305797576904,
+      "learning_rate": 0.00018954954954954951,
+      "loss": 0.4429,
+      "step": 2462
+    },
+    {
+      "epoch": 0.7383923552557616,
+      "grad_norm": 0.22784557938575745,
+      "learning_rate": 0.0001895045045045045,
+      "loss": 0.4069,
+      "step": 2463
+    },
+    {
+      "epoch": 0.7386921491474612,
+      "grad_norm": 0.277037113904953,
+      "learning_rate": 0.00018945945945945944,
+      "loss": 0.4257,
+      "step": 2464
+    },
+    {
+      "epoch": 0.7389919430391606,
+      "grad_norm": 0.25758159160614014,
+      "learning_rate": 0.00018941441441441438,
+      "loss": 0.4255,
+      "step": 2465
+    },
+    {
+      "epoch": 0.73929173693086,
+      "grad_norm": 0.24654820561408997,
+      "learning_rate": 0.00018936936936936937,
+      "loss": 0.402,
+      "step": 2466
+    },
+    {
+      "epoch": 0.7395915308225595,
+      "grad_norm": 0.259376585483551,
+      "learning_rate": 0.0001893243243243243,
+      "loss": 0.416,
+      "step": 2467
+    },
+    {
+      "epoch": 0.739891324714259,
+      "grad_norm": 0.28223109245300293,
+      "learning_rate": 0.00018927927927927924,
+      "loss": 0.4675,
+      "step": 2468
+    },
+    {
+      "epoch": 0.7401911186059584,
+      "grad_norm": 0.2680475413799286,
+      "learning_rate": 0.00018923423423423423,
+      "loss": 0.4147,
+      "step": 2469
+    },
+    {
+      "epoch": 0.7404909124976579,
+      "grad_norm": 0.2528432309627533,
+      "learning_rate": 0.00018918918918918917,
+      "loss": 0.4374,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7407907063893573,
+      "grad_norm": 0.26637372374534607,
+      "learning_rate": 0.0001891441441441441,
+      "loss": 0.4189,
+      "step": 2471
+    },
+    {
+      "epoch": 0.7410905002810567,
+      "grad_norm": 0.2570081055164337,
+      "learning_rate": 0.0001890990990990991,
+      "loss": 0.4388,
+      "step": 2472
+    },
+    {
+      "epoch": 0.7413902941727563,
+      "grad_norm": 0.27075570821762085,
+      "learning_rate": 0.00018905405405405403,
+      "loss": 0.4599,
+      "step": 2473
+    },
+    {
+      "epoch": 0.7416900880644557,
+      "grad_norm": 0.2676197290420532,
+      "learning_rate": 0.00018900900900900897,
+      "loss": 0.4105,
+      "step": 2474
+    },
+    {
+      "epoch": 0.7419898819561551,
+      "grad_norm": 0.24458040297031403,
+      "learning_rate": 0.00018896396396396396,
+      "loss": 0.4152,
+      "step": 2475
+    },
+    {
+      "epoch": 0.7422896758478545,
+      "grad_norm": 0.2793339788913727,
+      "learning_rate": 0.0001889189189189189,
+      "loss": 0.4468,
+      "step": 2476
+    },
+    {
+      "epoch": 0.7425894697395541,
+      "grad_norm": 0.25252237915992737,
+      "learning_rate": 0.00018887387387387384,
+      "loss": 0.4215,
+      "step": 2477
+    },
+    {
+      "epoch": 0.7428892636312535,
+      "grad_norm": 0.27801933884620667,
+      "learning_rate": 0.00018882882882882883,
+      "loss": 0.4362,
+      "step": 2478
+    },
+    {
+      "epoch": 0.743189057522953,
+      "grad_norm": 0.26056137681007385,
+      "learning_rate": 0.00018878378378378376,
+      "loss": 0.4342,
+      "step": 2479
+    },
+    {
+      "epoch": 0.7434888514146524,
+      "grad_norm": 0.26250821352005005,
+      "learning_rate": 0.00018873873873873873,
+      "loss": 0.4196,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7437886453063519,
+      "grad_norm": 0.2682492733001709,
+      "learning_rate": 0.0001886936936936937,
+      "loss": 0.4368,
+      "step": 2481
+    },
+    {
+      "epoch": 0.7440884391980513,
+      "grad_norm": 0.2572811245918274,
+      "learning_rate": 0.00018864864864864863,
+      "loss": 0.4363,
+      "step": 2482
+    },
+    {
+      "epoch": 0.7443882330897508,
+      "grad_norm": 0.25746074318885803,
+      "learning_rate": 0.0001886036036036036,
+      "loss": 0.4045,
+      "step": 2483
+    },
+    {
+      "epoch": 0.7446880269814502,
+      "grad_norm": 0.25470736622810364,
+      "learning_rate": 0.00018855855855855853,
+      "loss": 0.4062,
+      "step": 2484
+    },
+    {
+      "epoch": 0.7449878208731497,
+      "grad_norm": 0.2766227722167969,
+      "learning_rate": 0.0001885135135135135,
+      "loss": 0.4605,
+      "step": 2485
+    },
+    {
+      "epoch": 0.7452876147648492,
+      "grad_norm": 0.28737902641296387,
+      "learning_rate": 0.00018846846846846846,
+      "loss": 0.4481,
+      "step": 2486
+    },
+    {
+      "epoch": 0.7455874086565486,
+      "grad_norm": 0.2646963894367218,
+      "learning_rate": 0.0001884234234234234,
+      "loss": 0.4212,
+      "step": 2487
+    },
+    {
+      "epoch": 0.745887202548248,
+      "grad_norm": 0.2569124698638916,
+      "learning_rate": 0.00018837837837837836,
+      "loss": 0.4268,
+      "step": 2488
+    },
+    {
+      "epoch": 0.7461869964399476,
+      "grad_norm": 0.25343701243400574,
+      "learning_rate": 0.00018833333333333332,
+      "loss": 0.4285,
+      "step": 2489
+    },
+    {
+      "epoch": 0.746486790331647,
+      "grad_norm": 0.27101901173591614,
+      "learning_rate": 0.00018828828828828826,
+      "loss": 0.4426,
+      "step": 2490
+    },
+    {
+      "epoch": 0.7467865842233464,
+      "grad_norm": 0.2594289779663086,
+      "learning_rate": 0.00018824324324324325,
+      "loss": 0.4333,
+      "step": 2491
+    },
+    {
+      "epoch": 0.7470863781150459,
+      "grad_norm": 0.2643277049064636,
+      "learning_rate": 0.00018819819819819819,
+      "loss": 0.4407,
+      "step": 2492
+    },
+    {
+      "epoch": 0.7473861720067454,
+      "grad_norm": 0.267240047454834,
+      "learning_rate": 0.00018815315315315312,
+      "loss": 0.4448,
+      "step": 2493
+    },
+    {
+      "epoch": 0.7476859658984448,
+      "grad_norm": 0.24963083863258362,
+      "learning_rate": 0.00018810810810810811,
+      "loss": 0.4313,
+      "step": 2494
+    },
+    {
+      "epoch": 0.7479857597901443,
+      "grad_norm": 0.2673603892326355,
+      "learning_rate": 0.00018806306306306305,
+      "loss": 0.4327,
+      "step": 2495
+    },
+    {
+      "epoch": 0.7482855536818437,
+      "grad_norm": 0.25436538457870483,
+      "learning_rate": 0.000188018018018018,
+      "loss": 0.4251,
+      "step": 2496
+    },
+    {
+      "epoch": 0.7485853475735432,
+      "grad_norm": 0.25511813163757324,
+      "learning_rate": 0.00018797297297297298,
+      "loss": 0.441,
+      "step": 2497
+    },
+    {
+      "epoch": 0.7488851414652427,
+      "grad_norm": 0.26634255051612854,
+      "learning_rate": 0.00018792792792792791,
+      "loss": 0.4142,
+      "step": 2498
+    },
+    {
+      "epoch": 0.7491849353569421,
+      "grad_norm": 0.2738245129585266,
+      "learning_rate": 0.00018788288288288285,
+      "loss": 0.446,
+      "step": 2499
+    },
+    {
+      "epoch": 0.7494847292486415,
+      "grad_norm": 0.2478281557559967,
+      "learning_rate": 0.00018783783783783784,
+      "loss": 0.4355,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7494847292486415,
+      "eval_loss": 0.43330296874046326,
+      "eval_runtime": 567.7062,
+      "eval_samples_per_second": 3.803,
+      "eval_steps_per_second": 0.476,
+      "step": 2500
+    },
+    {
+      "epoch": 0.749784523140341,
+      "grad_norm": 0.24915559589862823,
+      "learning_rate": 0.00018779279279279278,
+      "loss": 0.431,
+      "step": 2501
+    },
+    {
+      "epoch": 0.7500843170320405,
+      "grad_norm": 0.25979626178741455,
+      "learning_rate": 0.00018774774774774772,
+      "loss": 0.4336,
+      "step": 2502
+    },
+    {
+      "epoch": 0.7503841109237399,
+      "grad_norm": 0.2514503002166748,
+      "learning_rate": 0.0001877027027027027,
+      "loss": 0.4104,
+      "step": 2503
+    },
+    {
+      "epoch": 0.7506839048154393,
+      "grad_norm": 0.2693893015384674,
+      "learning_rate": 0.00018765765765765764,
+      "loss": 0.4706,
+      "step": 2504
+    },
+    {
+      "epoch": 0.7509836987071389,
+      "grad_norm": 0.2881157696247101,
+      "learning_rate": 0.00018761261261261258,
+      "loss": 0.4346,
+      "step": 2505
+    },
+    {
+      "epoch": 0.7512834925988383,
+      "grad_norm": 0.232576385140419,
+      "learning_rate": 0.00018756756756756757,
+      "loss": 0.42,
+      "step": 2506
+    },
+    {
+      "epoch": 0.7515832864905377,
+      "grad_norm": 0.30108994245529175,
+      "learning_rate": 0.0001875225225225225,
+      "loss": 0.425,
+      "step": 2507
+    },
+    {
+      "epoch": 0.7518830803822372,
+      "grad_norm": 0.28761547803878784,
+      "learning_rate": 0.00018747747747747745,
+      "loss": 0.47,
+      "step": 2508
+    },
+    {
+      "epoch": 0.7521828742739367,
+      "grad_norm": 0.2556571662425995,
+      "learning_rate": 0.0001874324324324324,
+      "loss": 0.4236,
+      "step": 2509
+    },
+    {
+      "epoch": 0.7524826681656361,
+      "grad_norm": 0.27593177556991577,
+      "learning_rate": 0.00018738738738738737,
+      "loss": 0.4209,
+      "step": 2510
+    },
+    {
+      "epoch": 0.7527824620573356,
+      "grad_norm": 0.26506245136260986,
+      "learning_rate": 0.0001873423423423423,
+      "loss": 0.4182,
+      "step": 2511
+    },
+    {
+      "epoch": 0.753082255949035,
+      "grad_norm": 0.31767213344573975,
+      "learning_rate": 0.00018729729729729727,
+      "loss": 0.4574,
+      "step": 2512
+    },
+    {
+      "epoch": 0.7533820498407345,
+      "grad_norm": 0.25470229983329773,
+      "learning_rate": 0.00018725225225225224,
+      "loss": 0.4064,
+      "step": 2513
+    },
+    {
+      "epoch": 0.753681843732434,
+      "grad_norm": 0.25668561458587646,
+      "learning_rate": 0.0001872072072072072,
+      "loss": 0.4247,
+      "step": 2514
+    },
+    {
+      "epoch": 0.7539816376241334,
+      "grad_norm": 0.29275453090667725,
+      "learning_rate": 0.00018716216216216214,
+      "loss": 0.4232,
+      "step": 2515
+    },
+    {
+      "epoch": 0.7542814315158328,
+      "grad_norm": 0.2815520167350769,
+      "learning_rate": 0.0001871171171171171,
+      "loss": 0.4402,
+      "step": 2516
+    },
+    {
+      "epoch": 0.7545812254075324,
+      "grad_norm": 0.24637946486473083,
+      "learning_rate": 0.00018707207207207207,
+      "loss": 0.4209,
+      "step": 2517
+    },
+    {
+      "epoch": 0.7548810192992318,
+      "grad_norm": 0.27894946932792664,
+      "learning_rate": 0.000187027027027027,
+      "loss": 0.453,
+      "step": 2518
+    },
+    {
+      "epoch": 0.7551808131909312,
+      "grad_norm": 0.3131442070007324,
+      "learning_rate": 0.00018698198198198197,
+      "loss": 0.4728,
+      "step": 2519
+    },
+    {
+      "epoch": 0.7554806070826307,
+      "grad_norm": 0.2698810398578644,
+      "learning_rate": 0.00018693693693693693,
+      "loss": 0.4387,
+      "step": 2520
+    },
+    {
+      "epoch": 0.7557804009743302,
+      "grad_norm": 0.24818141758441925,
+      "learning_rate": 0.00018689189189189187,
+      "loss": 0.4065,
+      "step": 2521
+    },
+    {
+      "epoch": 0.7560801948660296,
+      "grad_norm": 0.2903098464012146,
+      "learning_rate": 0.00018684684684684683,
+      "loss": 0.426,
+      "step": 2522
+    },
+    {
+      "epoch": 0.756379988757729,
+      "grad_norm": 0.2602495551109314,
+      "learning_rate": 0.0001868018018018018,
+      "loss": 0.4144,
+      "step": 2523
+    },
+    {
+      "epoch": 0.7566797826494285,
+      "grad_norm": 0.2648625373840332,
+      "learning_rate": 0.00018675675675675673,
+      "loss": 0.4476,
+      "step": 2524
+    },
+    {
+      "epoch": 0.756979576541128,
+      "grad_norm": 0.26782098412513733,
+      "learning_rate": 0.0001867117117117117,
+      "loss": 0.4238,
+      "step": 2525
+    },
+    {
+      "epoch": 0.7572793704328274,
+      "grad_norm": 0.27317121624946594,
+      "learning_rate": 0.00018666666666666666,
+      "loss": 0.4251,
+      "step": 2526
+    },
+    {
+      "epoch": 0.7575791643245269,
+      "grad_norm": 0.2720593214035034,
+      "learning_rate": 0.0001866216216216216,
+      "loss": 0.4667,
+      "step": 2527
+    },
+    {
+      "epoch": 0.7578789582162263,
+      "grad_norm": 0.2541276812553406,
+      "learning_rate": 0.0001865765765765766,
+      "loss": 0.431,
+      "step": 2528
+    },
+    {
+      "epoch": 0.7581787521079258,
+      "grad_norm": 0.27258971333503723,
+      "learning_rate": 0.00018653153153153152,
+      "loss": 0.4263,
+      "step": 2529
+    },
+    {
+      "epoch": 0.7584785459996253,
+      "grad_norm": 0.28021714091300964,
+      "learning_rate": 0.00018648648648648646,
+      "loss": 0.4276,
+      "step": 2530
+    },
+    {
+      "epoch": 0.7587783398913247,
+      "grad_norm": 0.2505019009113312,
+      "learning_rate": 0.00018644144144144145,
+      "loss": 0.4105,
+      "step": 2531
+    },
+    {
+      "epoch": 0.7590781337830241,
+      "grad_norm": 0.28030917048454285,
+      "learning_rate": 0.0001863963963963964,
+      "loss": 0.4428,
+      "step": 2532
+    },
+    {
+      "epoch": 0.7593779276747237,
+      "grad_norm": 0.27447059750556946,
+      "learning_rate": 0.00018635135135135133,
+      "loss": 0.4551,
+      "step": 2533
+    },
+    {
+      "epoch": 0.7596777215664231,
+      "grad_norm": 0.26824313402175903,
+      "learning_rate": 0.00018630630630630626,
+      "loss": 0.4445,
+      "step": 2534
+    },
+    {
+      "epoch": 0.7599775154581225,
+      "grad_norm": 0.2674945294857025,
+      "learning_rate": 0.00018626126126126125,
+      "loss": 0.4513,
+      "step": 2535
+    },
+    {
+      "epoch": 0.760277309349822,
+      "grad_norm": 0.2604798972606659,
+      "learning_rate": 0.0001862162162162162,
+      "loss": 0.4381,
+      "step": 2536
+    },
+    {
+      "epoch": 0.7605771032415215,
+      "grad_norm": 0.27609342336654663,
+      "learning_rate": 0.00018617117117117115,
+      "loss": 0.4441,
+      "step": 2537
+    },
+    {
+      "epoch": 0.7608768971332209,
+      "grad_norm": 0.2614879012107849,
+      "learning_rate": 0.00018612612612612612,
+      "loss": 0.435,
+      "step": 2538
+    },
+    {
+      "epoch": 0.7611766910249204,
+      "grad_norm": 0.25386688113212585,
+      "learning_rate": 0.00018608108108108105,
+      "loss": 0.4434,
+      "step": 2539
+    },
+    {
+      "epoch": 0.7614764849166198,
+      "grad_norm": 0.24181143939495087,
+      "learning_rate": 0.00018603603603603602,
+      "loss": 0.4175,
+      "step": 2540
+    },
+    {
+      "epoch": 0.7617762788083193,
+      "grad_norm": 0.2645350694656372,
+      "learning_rate": 0.00018599099099099098,
+      "loss": 0.4136,
+      "step": 2541
+    },
+    {
+      "epoch": 0.7620760727000188,
+      "grad_norm": 0.2677913010120392,
+      "learning_rate": 0.00018594594594594592,
+      "loss": 0.4436,
+      "step": 2542
+    },
+    {
+      "epoch": 0.7623758665917182,
+      "grad_norm": 0.2717260420322418,
+      "learning_rate": 0.00018590090090090088,
+      "loss": 0.4565,
+      "step": 2543
+    },
+    {
+      "epoch": 0.7626756604834176,
+      "grad_norm": 0.25026705861091614,
+      "learning_rate": 0.00018585585585585585,
+      "loss": 0.4119,
+      "step": 2544
+    },
+    {
+      "epoch": 0.7629754543751172,
+      "grad_norm": 0.24770689010620117,
+      "learning_rate": 0.00018581081081081078,
+      "loss": 0.4097,
+      "step": 2545
+    },
+    {
+      "epoch": 0.7632752482668166,
+      "grad_norm": 0.27625271677970886,
+      "learning_rate": 0.00018576576576576575,
+      "loss": 0.4269,
+      "step": 2546
+    },
+    {
+      "epoch": 0.763575042158516,
+      "grad_norm": 0.27056175470352173,
+      "learning_rate": 0.0001857207207207207,
+      "loss": 0.4499,
+      "step": 2547
+    },
+    {
+      "epoch": 0.7638748360502154,
+      "grad_norm": 0.2812648415565491,
+      "learning_rate": 0.00018567567567567567,
+      "loss": 0.4736,
+      "step": 2548
+    },
+    {
+      "epoch": 0.764174629941915,
+      "grad_norm": 0.26717478036880493,
+      "learning_rate": 0.0001856306306306306,
+      "loss": 0.4072,
+      "step": 2549
+    },
+    {
+      "epoch": 0.7644744238336144,
+      "grad_norm": 0.2870055139064789,
+      "learning_rate": 0.00018558558558558558,
+      "loss": 0.4229,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7647742177253138,
+      "grad_norm": 0.2580265700817108,
+      "learning_rate": 0.00018554054054054054,
+      "loss": 0.4068,
+      "step": 2551
+    },
+    {
+      "epoch": 0.7650740116170133,
+      "grad_norm": 0.28002214431762695,
+      "learning_rate": 0.00018549549549549548,
+      "loss": 0.432,
+      "step": 2552
+    },
+    {
+      "epoch": 0.7653738055087128,
+      "grad_norm": 0.27384141087532043,
+      "learning_rate": 0.00018545045045045044,
+      "loss": 0.4363,
+      "step": 2553
+    },
+    {
+      "epoch": 0.7656735994004122,
+      "grad_norm": 0.2627524137496948,
+      "learning_rate": 0.0001854054054054054,
+      "loss": 0.4197,
+      "step": 2554
+    },
+    {
+      "epoch": 0.7659733932921117,
+      "grad_norm": 0.2666347324848175,
+      "learning_rate": 0.00018536036036036034,
+      "loss": 0.3912,
+      "step": 2555
+    },
+    {
+      "epoch": 0.7662731871838111,
+      "grad_norm": 0.2756651043891907,
+      "learning_rate": 0.00018531531531531528,
+      "loss": 0.4364,
+      "step": 2556
+    },
+    {
+      "epoch": 0.7665729810755106,
+      "grad_norm": 0.2617150545120239,
+      "learning_rate": 0.00018527027027027027,
+      "loss": 0.4468,
+      "step": 2557
+    },
+    {
+      "epoch": 0.7668727749672101,
+      "grad_norm": 0.27227911353111267,
+      "learning_rate": 0.0001852252252252252,
+      "loss": 0.4246,
+      "step": 2558
+    },
+    {
+      "epoch": 0.7671725688589095,
+      "grad_norm": 0.2841823697090149,
+      "learning_rate": 0.00018518018018018014,
+      "loss": 0.4363,
+      "step": 2559
+    },
+    {
+      "epoch": 0.7674723627506089,
+      "grad_norm": 0.253366082906723,
+      "learning_rate": 0.00018513513513513513,
+      "loss": 0.419,
+      "step": 2560
+    },
+    {
+      "epoch": 0.7677721566423085,
+      "grad_norm": 0.2522357106208801,
+      "learning_rate": 0.00018509009009009007,
+      "loss": 0.4124,
+      "step": 2561
+    },
+    {
+      "epoch": 0.7680719505340079,
+      "grad_norm": 0.2550141215324402,
+      "learning_rate": 0.000185045045045045,
+      "loss": 0.4256,
+      "step": 2562
+    },
+    {
+      "epoch": 0.7683717444257073,
+      "grad_norm": 0.27578258514404297,
+      "learning_rate": 0.000185,
+      "loss": 0.4447,
+      "step": 2563
+    },
+    {
+      "epoch": 0.7686715383174068,
+      "grad_norm": 0.2517780661582947,
+      "learning_rate": 0.00018495495495495493,
+      "loss": 0.4167,
+      "step": 2564
+    },
+    {
+      "epoch": 0.7689713322091062,
+      "grad_norm": 0.2627197802066803,
+      "learning_rate": 0.00018490990990990987,
+      "loss": 0.421,
+      "step": 2565
+    },
+    {
+      "epoch": 0.7692711261008057,
+      "grad_norm": 0.2572929263114929,
+      "learning_rate": 0.00018486486486486486,
+      "loss": 0.4295,
+      "step": 2566
+    },
+    {
+      "epoch": 0.7695709199925052,
+      "grad_norm": 0.2549370229244232,
+      "learning_rate": 0.0001848198198198198,
+      "loss": 0.4353,
+      "step": 2567
+    },
+    {
+      "epoch": 0.7698707138842046,
+      "grad_norm": 0.25990357995033264,
+      "learning_rate": 0.00018477477477477474,
+      "loss": 0.4598,
+      "step": 2568
+    },
+    {
+      "epoch": 0.770170507775904,
+      "grad_norm": 0.26102861762046814,
+      "learning_rate": 0.00018472972972972973,
+      "loss": 0.4342,
+      "step": 2569
+    },
+    {
+      "epoch": 0.7704703016676036,
+      "grad_norm": 0.26292112469673157,
+      "learning_rate": 0.00018468468468468466,
+      "loss": 0.4217,
+      "step": 2570
+    },
+    {
+      "epoch": 0.770770095559303,
+      "grad_norm": 0.24879471957683563,
+      "learning_rate": 0.00018463963963963963,
+      "loss": 0.4034,
+      "step": 2571
+    },
+    {
+      "epoch": 0.7710698894510024,
+      "grad_norm": 0.249162495136261,
+      "learning_rate": 0.0001845945945945946,
+      "loss": 0.4213,
+      "step": 2572
+    },
+    {
+      "epoch": 0.7713696833427018,
+      "grad_norm": 0.25036314129829407,
+      "learning_rate": 0.00018454954954954953,
+      "loss": 0.449,
+      "step": 2573
+    },
+    {
+      "epoch": 0.7716694772344014,
+      "grad_norm": 0.2511482238769531,
+      "learning_rate": 0.0001845045045045045,
+      "loss": 0.4285,
+      "step": 2574
+    },
+    {
+      "epoch": 0.7719692711261008,
+      "grad_norm": 0.25358885526657104,
+      "learning_rate": 0.00018445945945945946,
+      "loss": 0.4392,
+      "step": 2575
+    },
+    {
+      "epoch": 0.7722690650178002,
+      "grad_norm": 0.25731760263442993,
+      "learning_rate": 0.0001844144144144144,
+      "loss": 0.4326,
+      "step": 2576
+    },
+    {
+      "epoch": 0.7725688589094997,
+      "grad_norm": 0.24017149209976196,
+      "learning_rate": 0.00018436936936936936,
+      "loss": 0.4285,
+      "step": 2577
+    },
+    {
+      "epoch": 0.7728686528011992,
+      "grad_norm": 0.24697363376617432,
+      "learning_rate": 0.00018432432432432432,
+      "loss": 0.4326,
+      "step": 2578
+    },
+    {
+      "epoch": 0.7731684466928986,
+      "grad_norm": 0.2622368335723877,
+      "learning_rate": 0.00018427927927927926,
+      "loss": 0.4051,
+      "step": 2579
+    },
+    {
+      "epoch": 0.7734682405845981,
+      "grad_norm": 0.26079848408699036,
+      "learning_rate": 0.00018423423423423422,
+      "loss": 0.4296,
+      "step": 2580
+    },
+    {
+      "epoch": 0.7737680344762975,
+      "grad_norm": 0.26790016889572144,
+      "learning_rate": 0.00018418918918918916,
+      "loss": 0.4487,
+      "step": 2581
+    },
+    {
+      "epoch": 0.774067828367997,
+      "grad_norm": 0.26801207661628723,
+      "learning_rate": 0.00018414414414414412,
+      "loss": 0.441,
+      "step": 2582
+    },
+    {
+      "epoch": 0.7743676222596965,
+      "grad_norm": 0.2615436911582947,
+      "learning_rate": 0.00018409909909909909,
+      "loss": 0.4356,
+      "step": 2583
+    },
+    {
+      "epoch": 0.7746674161513959,
+      "grad_norm": 0.26157858967781067,
+      "learning_rate": 0.00018405405405405402,
+      "loss": 0.4624,
+      "step": 2584
+    },
+    {
+      "epoch": 0.7749672100430953,
+      "grad_norm": 0.2570144832134247,
+      "learning_rate": 0.000184009009009009,
+      "loss": 0.4159,
+      "step": 2585
+    },
+    {
+      "epoch": 0.7752670039347949,
+      "grad_norm": 0.25635403394699097,
+      "learning_rate": 0.00018396396396396395,
+      "loss": 0.4479,
+      "step": 2586
+    },
+    {
+      "epoch": 0.7755667978264943,
+      "grad_norm": 0.24715913832187653,
+      "learning_rate": 0.00018391891891891889,
+      "loss": 0.4258,
+      "step": 2587
+    },
+    {
+      "epoch": 0.7758665917181937,
+      "grad_norm": 0.2577861547470093,
+      "learning_rate": 0.00018387387387387388,
+      "loss": 0.4107,
+      "step": 2588
+    },
+    {
+      "epoch": 0.7761663856098932,
+      "grad_norm": 0.24768322706222534,
+      "learning_rate": 0.00018382882882882881,
+      "loss": 0.4208,
+      "step": 2589
+    },
+    {
+      "epoch": 0.7764661795015927,
+      "grad_norm": 0.24486133456230164,
+      "learning_rate": 0.00018378378378378375,
+      "loss": 0.4128,
+      "step": 2590
+    },
+    {
+      "epoch": 0.7767659733932921,
+      "grad_norm": 0.2598220109939575,
+      "learning_rate": 0.00018373873873873874,
+      "loss": 0.4398,
+      "step": 2591
+    },
+    {
+      "epoch": 0.7770657672849915,
+      "grad_norm": 0.2616111636161804,
+      "learning_rate": 0.00018369369369369368,
+      "loss": 0.4472,
+      "step": 2592
+    },
+    {
+      "epoch": 0.777365561176691,
+      "grad_norm": 0.2481420487165451,
+      "learning_rate": 0.00018364864864864862,
+      "loss": 0.4073,
+      "step": 2593
+    },
+    {
+      "epoch": 0.7776653550683905,
+      "grad_norm": 0.26911380887031555,
+      "learning_rate": 0.0001836036036036036,
+      "loss": 0.4584,
+      "step": 2594
+    },
+    {
+      "epoch": 0.77796514896009,
+      "grad_norm": 0.2654714584350586,
+      "learning_rate": 0.00018355855855855854,
+      "loss": 0.4541,
+      "step": 2595
+    },
+    {
+      "epoch": 0.7782649428517894,
+      "grad_norm": 0.2782737612724304,
+      "learning_rate": 0.00018351351351351348,
+      "loss": 0.4496,
+      "step": 2596
+    },
+    {
+      "epoch": 0.7785647367434888,
+      "grad_norm": 0.24328523874282837,
+      "learning_rate": 0.00018346846846846847,
+      "loss": 0.4198,
+      "step": 2597
+    },
+    {
+      "epoch": 0.7788645306351883,
+      "grad_norm": 0.2627139985561371,
+      "learning_rate": 0.0001834234234234234,
+      "loss": 0.4383,
+      "step": 2598
+    },
+    {
+      "epoch": 0.7791643245268878,
+      "grad_norm": 0.25555041432380676,
+      "learning_rate": 0.00018337837837837834,
+      "loss": 0.4097,
+      "step": 2599
+    },
+    {
+      "epoch": 0.7794641184185872,
+      "grad_norm": 0.24429571628570557,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.3932,
+      "step": 2600
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 6670,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.918856291720888e+19,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}